
move files to src/ directory

master · wes · 9 years ago · commit 28abba8922
  1. src/archive.py (+34)
  2. src/database.py (+62)
  3. src/goasearch.py (+14)
  4. src/openlibrary.py (+24)
  5. src/predictions.py (+153)
  6. src/search.py (+237)
  7. src/textbookExceptions.py (+24)
  8. src/visualize.py (+97)
  9. src/website.py (+148)

src/archive.py (+34)

@@ -0,0 +1,34 @@
#! /usr/bin/python2
from urllib import quote
from json import loads, dumps

import requests as req

searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw"

def searchIA(title, author):
    """
    Do a search on The Internet Archive for a book
    """
    print "running a search"
    requrl = searchUrl.format(quote(title + " " + author))
    try:
        # strip the "callback(" prefix and the trailing ")" to get bare JSON
        results = loads(req.get(requrl).text[9:][0:-1])
    except ValueError:
        return []
    rownum = int(results["responseHeader"]["params"]["rows"])
    if rownum < 1:
        print "Couldn't find results for %s %s" % (title, author)
        return []
    docs = results["response"]["docs"]
    urls = []
    for result in docs[0:3]:
        urls.append("https://archive.org/details/%s" % result["identifier"])
    return urls

# Example: search for David Hume's Enquiry Concerning Human Understanding
#for url in searchIA("Enquiry Concerning Human Understanding", "Hume"):
#    print url

src/database.py (+62)

@@ -0,0 +1,62 @@
#! /usr/bin/python2
from sys import argv
from hashlib import sha1

# Assumed wiring (not shown in this commit): the resource documents live
# in a local CouchDB instance; ResourceConflict is couchdb's conflict error.
from couchdb import Server, ResourceConflict
localdb = Server()["resources"]  # hypothetical database name

def truncate(docid):
    """
    Truncate a document id to 12 digits
    The document ID should be based on a
    hash of unique identifiers
    """
    return int(str(docid)[0:12])

def createResource(textbookInfo, course, dept, coursecode, docid):
    """
    Create a document associated with a course
    This document contains any/all resources associated
    with that course
    example,
    {
      'books': [],
      'dept': 'COLLAB',
      'code': '2C03',
      'sections': [
        {
          'prof': 'Lisa Pender',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Mo'
        },
        {
          'prof': 'Staff',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Th'
        }
      ],
      'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    textbooks = textbookInfo(dept.strip(), coursecode.strip())
    # We truncate the id so we can have nicer looking URLs,
    # since the id will be used to point to the resource page for that course
    _id = str(truncate(docid))
    fields = {
        "_id" : _id,
        "textbooks" : textbooks,
        "coursetitle" : "%s %s" % (dept.strip(), coursecode.strip()),
        "courseinfo" : course
        #"Syllabus" : "blah"
    }
    try:
        revisions = list(localdb.revisions(_id))
        if not revisions:
            return localdb.save(fields)
        else:
            # update the existing document in place, keeping its revision
            rev = dict(revisions[0])["_rev"]
            fields["_rev"] = rev
            return localdb.save(fields)
    except ResourceConflict:
        print "Resource for %s already exists, not creating a new one" % docid

src/goasearch.py (+14)

@@ -0,0 +1,14 @@
#! /usr/bin/python2
# predictive data
# TODO: switch to elasticsearch's prediction
import database
import predictions

class GOASearch(object):
    def __init__(self):
        # __init__ must return None; returning self raises a TypeError
        pass

src/openlibrary.py (+24)

@@ -0,0 +1,24 @@
#! /usr/bin/python2
from urllib import quote
from json import loads, dumps

import requests as req

#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s"
searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s'

def bookUrls(title, author):
    print title, author
    # drop any subtitle; Open Library matches better on the main title
    if ":" in title:
        title = title.split(":")[0]
    requrl = searchurl % (quote(author), quote(title))
    results = loads(req.get(requrl).text)
    for result in results["docs"][0:2]:
        if result.has_key("edition_key"):
            yield "https://openlibrary.org/books/%s" % result["edition_key"][0]

# Example query:
# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle'
#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"):
#    print book

src/predictions.py (+153)

@@ -0,0 +1,153 @@
#! /usr/bin/python2
from itertools import groupby, chain
from sys import stdout
from functools import partial
from json import dumps

def gensymer():
    """Return a closure that generates fresh node ids ("0", "1", "2", ...)"""
    n = [0]
    def inner():
        result = str(n[0])
        n[0] += 1
        return result
    return inner

gensym = gensymer()

def printTrie(graph, prev, trie, weight):
    """Walk the trie, adding its nodes and weighted edges to a graphviz graph"""
    new_node = gensym()
    graph.node(new_node, "%s" % trie.letter)
    graph.edge(prev, new_node, label="%.2f" % weight)
    if not trie.children:
        return
    for child, weight in zip(trie.children, trie.ws):
        printTrie(graph, new_node, child, weight)

class Trie(object):
    def __init__(self, letter, children, ws):
        self.letter = letter
        self.children = children
        self.ws = ws

def probweight(suffixes):
    """Normalize the suffix counts into probabilities"""
    weights = [float(s["value"]) for s in suffixes]
    s = float(sum(weights))
    ws = [w/s for w in weights]
    return ws

def buildtrie(trie, suffixes):
    """
    Build a trie, also known as a prefix tree, of all the possible completions
    """
    trie.children = []
    for letter, suffs in suffixes:
        ped = partition(suffs)
        if any(map(lambda p: p[0], ped)):
            # check if there are any children
            trie.children.append(buildtrie(Trie(letter, [], probweight(suffs)), ped))
        else:
            # we've reached the end of this word so just include the final letter
            # [1] = there is a probability of 1 of reaching this single leaf node,
            # since it is the only possible completion here
            trie.children.append(Trie(letter, [], [1]))
    return trie

def keyf(x):
    if not x["key"]:
        return ""
    return x["key"][0]

def tails(words):
    for word in words:
        yield {
            "key" : word["key"][1:],
            "value" : word["value"]
        }

def partition(words):
    """
    Partition the words into different prefixes based on the first character
    """
    groups = [
        (g[0], list(tails(g[1])))
        for g in groupby(
            sorted(words, key=keyf),
            key=keyf)
    ]
    return groups

def flatten_helper(letter, trie):
    return ([letter + child.letter for
             child in trie.children], trie.children)

def flatten(trie):
    if not trie.children:
        return trie.letter
    prefixes, suffixes = flatten_helper(trie.letter, trie)
    return [flatten(Trie(p, s2.children, s2.ws)) for p, s2 in zip(prefixes, suffixes)]

def flattenlist(xs):
    locs = []
    for x in xs:
        if not isinstance(x, list):
            locs.append(x)
        else:
            locs.extend(flattenlist(x))
    return locs

def matchc(trie, prefix):
    c = prefix[0] if len(prefix) > 1 else prefix
    return [ch for ch in trie.children if ch.letter == c]

def match(trie, word):
    if not word:
        return []
    m = matchc(trie, word[0])
    if not m:
        return []
    else:
        return [m[0]] + match(m[0], word[1:])

def complete(trie, word):
    m = match(trie, word)
    if len(word) != len(m):
        return False
    completions = [word+x[1:] for x in flattenlist(flatten(m[-1]))]
    if len(completions) > 10:
        return dumps(completions[0:10])
    return dumps(completions)

def sortTrie(trie):
    """
    Sort the children of each node in descending order
    of the probability that each child would be the completion
    of whatever that word is
    """
    if not trie.children:
        return
    sortedChilds = sorted(zip(trie.children, trie.ws), key=lambda x: x[1], reverse=True)
    trie.children = [x[0] for x in sortedChilds]
    trie.ws = [x[1] for x in sortedChilds]
    for child in trie.children:
        sortTrie(child)

def toTrie(words):
    for word in words:
        word["key"] = word["key"].lower()
    trie = buildtrie(Trie("", [], [1]), partition(words))
    trie.ws = [1]*len(trie.children)
    sortTrie(trie)
    return trie

def testkey(w):
    return {
        "key" : w,
        "value" : "1"
    }
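
In the same commented-example style, a sketch of building and querying the trie; the word list is hypothetical, and complete returns its completions as a JSON string:

#words = [testkey(w) for w in ["help", "hello", "held", "cat"]]
#trie = toTrie(words)
#print complete(trie, "hel")   # a JSON list, e.g. ["held", "hello", "help"]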

src/search.py (+237)

@@ -0,0 +1,237 @@
#! /usr/bin/python2
import elasticsearch
from elasticsearch_dsl import FacetedSearch, Search, Q
from elasticsearch_dsl.aggs import Terms, DateHistogram
from sys import exit, stderr
from json import dumps, loads
from itertools import chain, imap
from hashlib import sha1
from textbookExceptions import UnIndexable
from mcmaster.classes import allCourses

# Generic instance of elasticsearch right now
es = elasticsearch.Elasticsearch()

def summarize(text):
    splitted = text.split(" ")
    if len(splitted) > 4:
        return " ".join(splitted[0:4]) + ".."
    return text

def sectionToJSON(section):
    return {
        "prof" : section.prof,
        "sem" : section.sem,
        "day" : section.day
    }

def classToJSON(clss):
    return {
        "title" : clss.title,
        "sections" : map(sectionToJSON, clss.sections),
        "dept" : clss.dept,
        "code" : clss.code,
        "books" : list(clss.books) if clss.books else []
    }

def truncate(docid):
    """
    Truncate a document id to 12 digits
    The document ID should be based on a
    hash of unique identifiers
    """
    return int(str(docid)[0:12])

def hashsec(course):
    """
    Hash a course into a usable id
    """
    if not course["code"]:
        code = ""
    else:
        code = course["code"]
    if not course["title"]:
        title = ""
    else:
        title = course["title"]
    if not course["sections"]:
        # no sections were scraped; fall back to a blank semester
        course["sections"] = [{"sem" : ""}]
    if not (code or title):
        raise UnIndexable(course)
    h = sha1()
    h.update(code + title + course["sections"][0]["sem"])
    return int(h.hexdigest(), 16)

def createIndex(name):
    """
    Create a new index in elasticsearch
    An index is like a schema in a regular database
    """
    indices = elasticsearch.client.IndicesClient(es)
    print indices.create(name)
    with open("./course.json", "r") as mapping:
        print indices.put_mapping("course", loads(mapping.read()), name)

def indexListing(course):
    """
    Index a specific course in the database (using the courses index)
    example,
    {
      'books': [],
      'dept': 'COLLAB',
      'code': '2C03',
      'sections': [
        {
          'prof': 'Lisa Pender',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Mo'
        },
        {
          'prof': 'Staff',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Th'
        }
      ],
      'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    courseID = hashsec(course)
    print es.index(index="oersearch",
                   doc_type="course",
                   id=courseID,
                   body=course)
    # For every course we index, we also create a resource for it.
    # This should be an idempotent operation because we're putting it in couchdb,
    # and we're using the id obtained from the hash function, so it should just
    # update the document; no need to delete anything
    #try:
        #courseDept = course[0]["title"].strip().split(" ")[0].strip()
        #courseCode = course[0]["title"].strip().split(" ")[1].strip()
        #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
        #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
    #except:
        #print "Couldn't create the resource associated with %s" % course

def termSearch(field):
    """
    Make a term search (exact match)
    """
    def t(term):
        q = Q("term",
              **{
                  "sections."+field : term
              })
        return q
    return t

def search(field):
    """
    Make a match search
    """
    def s(term):
        q = Q("match",
              **{
                  field : term
              })
        return q
    return s

def join(x, y):
    """
    Join two queries
    """
    return x & y

def filterSections(secs):
    """
    Get rid of tutorial sections,
    because they almost always have "Staff" as the instructor
    This is just a heuristic, of course
    """
    filtered = [s for s in secs.sections if "Staff" not in s.prof]
    if len(filtered) > 0:
        return filtered
    return False

def searchTerms(terms):
    """
    Run a search for courses
    """
    # A list of all the queries we want to run
    qs = [searchers[field](term) for
          field, term in
          terms.iteritems() if
          term and searchers.has_key(field)]
    if not qs:
        # No queries = no results
        return dumps([])
    # reduce joins all of the queries into one query
    # It will search for the conjunction of all of them,
    # so it cares about each query equally
    q = reduce(join, qs)
    s = (Search(using=es, index="oersearch")
         .query(q))[0:100] # only return up to 100 results for now
    results = s.execute()
    filtered = [
        (secs, filterSections(secs)[0].to_dict()) # get rid of tutorials
        for secs in results
        if filterSections(secs)
    ]
    results = []
    for obj, secs in filtered:
        # Add the truncated course id
        # This is used to point to the resource page for that course
        secs["id"] = truncate(obj.meta.id)
        secs["title"] = obj.title
        if obj["dept"] not in secs["title"]:
            secs["dept"] = obj.dept
        if obj.books:
            secs["books"] = [
                {
                    "booktitle" : summarize(book[0].encode("ASCII")),
                    "bookauthor" : book[1].encode("ASCII"),
                    "bookprice" : book[2].encode("ASCII")
                }
                for book in obj.books
            ]
        else:
            secs["books"] = ""
        results.append(secs)
    return dumps(results)

searchers = {
    "title" : search("title"),
    "loc" : search("loc"),
    "time" : search("time"),
    "prof" : search("prof"),
    "day" : search("day"),
}

#print searchTerms({"title" : "PHILOS"})

#for c in imap(classToJSON, allCourses()):
    #try:
        #print indexListing(c)
    #except UnIndexable as e:
        #print e.reason

src/textbookExceptions.py (+24)

@@ -0,0 +1,24 @@
#! /usr/bin/python2
class UnIndexable(Exception):
    def __init__(self, course):
        self.course = course

    @property
    def reason(self):
        course = self.course
        message = ""
        if not course["code"] and not course["title"]:
            message = "there was no course code and no title defined"
        elif not course["code"]:
            message = "there was no course code defined"
        elif not course["title"]:
            message = "there was no course title defined"
        elif not course["sections"]:
            message = "there were no sections defined"
        return """
        There was a problem with indexing this course.
        %s
        There could be several reasons why, my best guess is that %s
        We need at least the course code, title, and one or more sections to index
        """ % (course, message)

src/visualize.py (+97)

@@ -0,0 +1,97 @@
#! /usr/bin/python2
from json import loads, load
from re import sub, split
from itertools import groupby, chain
from numpy import mean
from operator import attrgetter
import pygal
import csv

class Textbook(object):
    def __init__(self, dept, code, title, author, price):
        self.dept = dept
        self.code = code
        self.title = title
        self.author = author
        self.price = float(price)

    def __repr__(self):
        return "Dept = %s, Code = %s, %s by %s, costs $%s" % (self.dept,
                                                              self.code,
                                                              self.title,
                                                              self.author,
                                                              self.price)

def courses():
    with open("./books.csv", "r") as books:
        booksreader = csv.reader(books)
        for row in booksreader:
            # assumes the CSV columns are dept, code, title, author, price
            yield Textbook(*row)

def groupDept(courselist):
    sortedCourses = sorted(courselist, key=attrgetter("dept"))
    for course in groupby(sortedCourses, attrgetter("dept")):
        yield course[0], list(course[1])

def meanPrice(books):
    return mean([book.price for book in books])

# Questions:
# mean cost per department
# mean cost per faculty
# mean difference between book store copies and other copies per dept and faculty
# number of overlapping books per faculty, do eng students benefit from that?
# maybe a survey for students to see how often they buy books from other sources
# correlate with how much they could be saving?

facultyDesc = {
    "hum" : "Humanities",
    "bus" : "Business",
    "hlth" : "Health Science",
    "eng" : "Engineering",
    "sci" : "Science",
    "socsci" : "Social Sciences",
    "artsci" : "Arts & Sciences",
    "meld" : "MELD"
}

faculties = load(open("./faculties.json"))

def categorize(dept):
    # map a department to its faculty description
    return facultyDesc.get(faculties.get(dept, False), False)

def byFaculty():
    for dept, books in groupDept(courses()):
        yield (categorize(dept), dept, books)

def meanFacultyCosts():
    byfac = list(byFaculty())
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by faculty"
    sortedFacs = sorted(byfac, key=lambda x: x[0])
    for fac, group in groupby(sortedFacs, lambda x: x[0]):
        # pool the books of every department in this faculty
        # (the original only counted the first department per faculty)
        books = list(chain(*[g[2] for g in group]))
        graph.add(fac, meanPrice(books))
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render(transpose=True)

def meanCosts():
    cs = groupDept(courses())
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by department"
    for c in cs:
        dept, books = c
        graph.add(dept, meanPrice(books))
    #graph.render_to_file("./test_graph.svg")
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render_table(style=True, transpose=True)

for x in courses():
    print x
#print meanCosts()
#print meanFacultyCosts()
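
Following the render_to_file hint in meanCosts, a sketch of writing the rendered table out for the website to serve (the filename is hypothetical):

#with open("costs.html", "w") as out:
#    out.write(meanCosts())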

src/website.py (+148)

@@ -0,0 +1,148 @@
#! /usr/bin/python2
from functools import partial
from couchdb import ResourceConflict
from flask import Flask, render_template, flash, request, send_from_directory
from flask_bootstrap import Bootstrap
from flask_appconfig import AppConfig
from urllib import quote, unquote
from json import dumps, loads
from werkzeug.contrib.cache import MemcachedCache
import os

from search import searchTerms
from openlibrary import bookUrls
from archive import searchIA

cache = MemcachedCache(['127.0.0.1:11211'])

# Assumption: ``completers'' should map field names to completion
# functions (e.g. tries built with predictions.toTrie); it is not
# defined anywhere in this commit, so an empty placeholder is used
# and predict() degrades gracefully via the KeyError handler.
completers = {}

def predict(fieldtype, term):
    print fieldtype
    print term
    if not term:
        return "[]"
    else:
        try:
            cs = completers[fieldtype](term.lower())
        except KeyError:
            return "[]"
        if cs:
            return cs
        return "[]"

def predictor(fieldtype):
    def inner(request):
        params = dict(request.args.items())
        return predict(fieldtype, params["term"])
    return inner

# Per-field predictors; each closes over its field type.
# Assumption: these definitions are not shown in the commit, but the
# route handlers below call them by these names.
predictbuild = predictor("building")
predictloc = predictor("loc")
predictday = predictor("day")
predictdept = predictor("dept")
predicttitle = predictor("title")

def cacheit(key, thunk):
    """
    Tries to find a cached version of ``key''
    If there is no cached version then it will
    evaluate thunk (which must be a generator)
    and cache that, then return the result
    """
    cached = cache.get(quote(key))
    if cached is None:
        result = list(thunk())
        cache.set(quote(key), result)
        return result
    return cached

def ClassSearch(configfile=None):
    defaults = {"Day", "Building", "Exact Location", "Department"}
    app = Flask(__name__)
    AppConfig(app, configfile) # Flask-Appconfig is not necessary, but
                               # highly recommended =)
                               # https://github.com/mbr/flask-appconfig
    Bootstrap(app)
    app.config["scripts"] = "/home/wes/MGOAL/scripts"
    app.config["styles"] = "/home/wes/MGOAL/styles"

    @app.route('/favicon.ico')
    def favicon():
        # send_from_directory takes the directory, not the full file path
        return send_from_directory("/srv/http/goal",
                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')

    @app.route("/buildpred", methods=("GET", "POST"))
    def buildpred():
        return predictbuild(request)

    @app.route("/locpred", methods=("GET", "POST"))
    def locpred():
        return predictloc(request)

    @app.route("/daypred", methods=("GET", "POST"))
    def daypred():
        return predictday(request)

    @app.route("/deptpred", methods=("GET", "POST"))
    def deptpred():
        return predictdept(request)

    @app.route("/titlepred", methods=("GET", "POST"))
    def titlepred():
        return predicttitle(request)

    @app.route("/", methods=("GET", "POST"))
    def index():
        return render_template("search.html")

    @app.route("/fc", methods=("GET", "POST"))
    def fc():
        """ Filter Courses """
        print "trying to get courses"
        params = dict(request.args.items())
        # items() returns a copy in Python 2, so deleting
        # keys while iterating is safe here
        for key, val in params.items():
            if val in defaults:
                del params[key]
        results = searchTerms(params)
        return results

    @app.route("/resources", methods=("GET", "POST"))
    def resources():
        """ Get Resources """
        params = loads(dict(request.args.items())["data"])
        print params
        author = params["author"]
        title = params["title"]
        if ("No Textbooks" in title or
            "No Adoption" in title):
            return dumps("false")
        # Cache the result of the Open Library search
        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
        print openlib
        # Cache the result of an Internet Archive search
        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
        print iarchive
        if not (any(openlib) or any(iarchive)):
            # We literally could not find ANYTHING
            return dumps("false")
        return dumps({
            "iarchive" : iarchive,
            "openlib" : openlib
        })

    @app.route("/scripts/<filename>")
    def send_script(filename):
        return send_from_directory(app.config["scripts"], filename)

    @app.route("/styles/<filename>")
    def send_style(filename):
        return send_from_directory(app.config["styles"], filename)

    return app

if __name__ == "__main__":
    ClassSearch().run(port=8001, debug=True)
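
A possible smoke test once the app is running locally, hitting the /fc route with the requests library (the query value is hypothetical; field names come from search.searchers):

#import requests
#print requests.get("http://localhost:8001/fc", params={"title" : "PHILOS"}).text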