From 28abba89226c9a5e1747d4a65a3bc5c8a730790b Mon Sep 17 00:00:00 2001
From: wes
Date: Mon, 20 Jun 2016 04:15:41 -0400
Subject: [PATCH] move files to src/ directory

---
 src/archive.py            |  34 ++++++
 src/database.py           |  62 ++++++++++
 src/goasearch.py          |  14 +++
 src/openlibrary.py        |  24 ++++
 src/predictions.py        | 153 ++++++++++++++++++++++++
 src/search.py             | 237 ++++++++++++++++++++++++++++++++++++++
 src/textbookExceptions.py |  24 ++++
 src/visualize.py          |  97 ++++++++++++++++
 src/website.py            | 148 ++++++++++++++++++++++++
 9 files changed, 793 insertions(+)
 create mode 100755 src/archive.py
 create mode 100755 src/database.py
 create mode 100755 src/goasearch.py
 create mode 100755 src/openlibrary.py
 create mode 100755 src/predictions.py
 create mode 100755 src/search.py
 create mode 100644 src/textbookExceptions.py
 create mode 100755 src/visualize.py
 create mode 100755 src/website.py

diff --git a/src/archive.py b/src/archive.py
new file mode 100755
index 0000000..73fcde7
--- /dev/null
+++ b/src/archive.py
@@ -0,0 +1,34 @@
+#! /usr/bin/python2
+
+from urllib import quote
+from json import loads, dumps
+
+import requests as req
+
+searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw"
+
+def searchIA(title, author):
+    """
+    Do a search on The Internet Archive for a book
+    """
+    print "running a search"
+    requrl = searchUrl.format(quote(title + " " + author))
+    try:
+        # strip the "callback(...)" JSONP wrapper before parsing
+        results = loads(req.get(requrl).text[9:][0:-1])
+    except ValueError:
+        return []
+
+    # numFound is the actual hit count; params["rows"] is only the
+    # requested page size (always 50 here), so checking it could never
+    # detect an empty result set
+    numfound = results["response"]["numFound"]
+    if numfound < 1:
+        print "Couldn't find results for %s %s" % (title, author)
+        return []
+    docs = results["response"]["docs"]
+    urls = []
+    for result in docs[0:3]:
+        urls.append("https://archive.org/details/%s" % result["identifier"])
+    return urls
+
+
+# Example: search for David Hume's Enquiry Concerning Human Understanding
+# (note the signature is searchIA(title, author))
+#for url in searchIA("Enquiry Concerning Human Understanding", "Hume"):
+    #print url
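+
+# The endpoint returns JSONP, i.e. JSON wrapped in "callback(...)"; the
+# text[9:][0:-1] slicing above drops len("callback(") == 9 leading
+# characters and the trailing ")". A sketch of the same request without
+# JSONP (assumption: omitting the callback parameter makes the endpoint
+# return bare JSON that requests can parse directly):
+#
+#   requrl = searchUrl.format(quote(title + " " + author)).replace("&callback=callback", "")
+#   results = req.get(requrl).json()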
diff --git a/src/database.py b/src/database.py
new file mode 100755
index 0000000..a19272c
--- /dev/null
+++ b/src/database.py
@@ -0,0 +1,62 @@
+#! /usr/bin/python2
+
+from couchdb import Server, ResourceConflict
+
+# localdb and ResourceConflict were used below without being defined or
+# imported anywhere; a local CouchDB instance is assumed here, and the
+# database name "courses" is a placeholder
+localdb = Server()["courses"]
+
+def truncate(docid):
+    """
+    Truncate a document id to 12 digits
+    The document ID should be based on a
+    hash of unique identifiers
+    """
+    return int(str(docid)[0:12])
+
+def createResource(textbookInfo, course, dept, coursecode, docid):
+    """
+    Create a document associated with a course
+    This document contains any/all resources associated
+    with that course
+
+    example,
+        {
+         'books': [],
+         'dept': 'COLLAB',
+         'code': '2C03',
+         'sections': [
+             {
+              'prof': 'Lisa Pender',
+              'sem': '2015/09/08 - 2015/12/08',
+              'day': 'Mo'
+             },
+             {
+              'prof': 'Staff',
+              'sem': '2015/09/08 - 2015/12/08',
+              'day': 'Th'
+             }
+         ],
+         'title': 'COLLAB 2C03 - Sociology I'
+        }
+    """
+    textbooks = textbookInfo(dept.strip(), coursecode.strip())
+
+    # We truncate the id so we can have nicer looking URLs
+    # Since the id will be used to point to the resource page for that course
+    _id = str(truncate(docid))
+
+    fields = {
+        "_id" : _id,
+        "textbooks" : textbooks,
+        "coursetitle" : "%s %s" % (dept.strip(), coursecode.strip()),
+        "courseinfo" : course
+        #"Syllabus" : "blah"
+    }
+    try:
+        revisions = list(localdb.revisions(_id))
+        if not revisions:
+            return localdb.save(fields)
+        else:
+            rev = dict(revisions[0])["_rev"]
+            fields["_rev"] = rev
+            return localdb.save(fields)
+    except ResourceConflict:
+        print "Resource for %s already exists, not creating a new one" % (docid)
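+
+# Usage sketch (textbookInfo here is a stand-in for any function that
+# returns the textbook list for a dept/course-code pair; all values are
+# illustrative):
+#
+#   def textbookInfo(dept, code):
+#       return [("Sociology: A Brief Introduction", "Schaefer", "89.95")]
+#
+#   createResource(textbookInfo, {"title" : "COLLAB 2C03 - Sociology I"},
+#                  "COLLAB", "2C03", 12345678901234567890)
+#   # stores a document with _id "123456789012"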
diff --git a/src/goasearch.py b/src/goasearch.py
new file mode 100755
index 0000000..3dca7eb
--- /dev/null
+++ b/src/goasearch.py
@@ -0,0 +1,14 @@
+#! /usr/bin/python2
+
+# predictive data
+# switch to elasticsearch's prediction
+
+
+import database
+import predictions
+
+class GOASearch(object):
+    def __init__(self):
+        # __init__ must return None; the original "return self" here
+        # would raise a TypeError as soon as GOASearch() is instantiated
+        pass
+
diff --git a/src/openlibrary.py b/src/openlibrary.py
new file mode 100755
index 0000000..d558c21
--- /dev/null
+++ b/src/openlibrary.py
@@ -0,0 +1,24 @@
+#! /usr/bin/python2
+
+from urllib import quote
+from json import loads, dumps
+
+import requests as req
+
+#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s"
+searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s'
+
+def bookUrls(title, author):
+    print title, author
+    # drop any subtitle, since subtitles tend to hurt the matching
+    if ":" in title:
+        title = title.split(":")[0]
+    requrl = searchurl % (quote(author), quote(title))
+    results = loads(req.get(requrl).text)
+    for result in results["docs"][0:2]:
+        if "edition_key" in result:
+            yield "https://openlibrary.org/books/%s" % result["edition_key"][0]
+
+# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle'
+
+#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"):
+    #print book
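+
+# bookUrls is a generator; to collect the results, e.g.:
+#
+#   urls = list(bookUrls("Philosophy Of Physics", "Tim Maudlin"))
+#
+# which yields at most two edition URLs of the form
+#   https://openlibrary.org/books/<edition_key>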
diff --git a/src/predictions.py b/src/predictions.py
new file mode 100755
index 0000000..b770a0b
--- /dev/null
+++ b/src/predictions.py
@@ -0,0 +1,153 @@
+#! /usr/bin/python2
+
+from itertools import groupby
+from json import dumps
+
+def gensymer():
+    """
+    Return a function that generates a fresh numeric label on each call
+    """
+    n = [0]
+    def inner():
+        result = str(n[0])
+        n[0] += 1
+        return result
+    return inner
+
+gensym = gensymer()
+
+def printTrie(graph, prev, trie, weight):
+    """
+    Render the trie into a graphviz graph, labelling edges with weights
+    """
+    new_node = str(gensym())
+    graph.node(new_node, "%s" % trie.letter)
+    graph.edge(prev, new_node, label="%.2f" % weight)
+    if not trie.children:
+        return
+    for child, weight in zip(trie.children, trie.ws):
+        printTrie(graph, new_node, child, weight)
+
+
+class Trie(object):
+    def __init__(self, letter, children, ws):
+        self.letter = letter
+        self.children = children
+        self.ws = ws
+
+def probweight(suffixes):
+    weights = [float(s["value"]) for s in suffixes]
+    s = float(sum(weights))
+    ws = [w/s for w in weights]
+    return ws
+
+def buildtrie(trie, suffixes):
+    """
+    Build a trie, also known as a prefix tree, of all the possible completions
+    """
+    trie.children = []
+    for letter, suffs in suffixes:
+        ped = partition(suffs)
+        if any(map(lambda p: p[0], ped)):
+            # there are children, so recurse (reusing ped rather than
+            # recomputing partition(suffs))
+            trie.children.append(buildtrie(Trie(letter, [], probweight(suffs)), ped))
+        else:
+            # we've reached the end of this word so just include the final letter
+            # [1] = there is a probability of 1 of reaching this single leaf node,
+            # since it is the only possible completion here
+            trie.children.append(Trie(letter, [], [1]))
+    return trie
+
+
+def keyf(x):
+    if not x["key"]:
+        return ""
+    return x["key"][0]
+
+def tails(words):
+    for word in words:
+        yield {
+            "key" : word["key"][1:],
+            "value" : word["value"]
+        }
+
+def partition(words):
+    """
+    Partition the words into different prefixes based on the first character
+    """
+    groups = [
+        (g[0], list(tails(g[1])))
+        for g in groupby(
+            sorted(words, key=keyf),
+            key=keyf)
+    ]
+    return groups
+
+
+def flatten_helper(letter, trie):
+    return ([letter + child.letter for
+             child in trie.children], trie.children)
+
+def flatten(trie):
+    if not trie.children:
+        return trie.letter
+    prefixes, suffixes = flatten_helper(trie.letter, trie)
+    return [flatten(Trie(p, s2.children, s2.ws)) for p, s2 in zip(prefixes, suffixes)]
+
+def flattenlist(xs):
+    locs = []
+    for x in xs:
+        if not isinstance(x, list):
+            locs.append(x)
+        else:
+            locs.extend(flattenlist(x))
+    return locs
+
+def matchc(trie, prefix):
+    # prefix is always a single character here, so compare against its
+    # first character directly
+    c = prefix[0] if prefix else None
+    return [ch for ch in trie.children if ch.letter == c]
+
+def match(trie, word):
+    if not word:
+        return []
+    m = matchc(trie, word[0])
+    if not m:
+        return []
+    else:
+        return [m[0]] + match(m[0], word[1:])
+
+def complete(trie, word):
+    m = match(trie, word)
+    # guard against an empty match (e.g. an empty prefix), which would
+    # otherwise make m[-1] raise an IndexError below
+    if not m or len(word) != len(m):
+        return False
+    completions = [word+x[1:] for x in flattenlist(flatten(m[-1]))]
+    if len(completions) > 10:
+        return dumps(completions[0:10])
+    return dumps(completions)
+
+def sortTrie(trie):
+    """
+    Sort the children of each node in descending order
+    of the probability that each child would be the completion
+    of whatever that word is
+    """
+    if not trie.children:
+        return
+    sortedChilds = sorted(zip(trie.children, trie.ws), key=lambda x: x[1], reverse=True)
+    trie.children = [x[0] for x in sortedChilds]
+    trie.ws = [x[1] for x in sortedChilds]
+    for child in trie.children:
+        sortTrie(child)
+
+def toTrie(words):
+    for word in words:
+        word["key"] = word["key"].lower()
+    trie = buildtrie(Trie("", [], [1]), partition(words))
+    trie.ws = [1]*len(trie.children)
+    sortTrie(trie)
+    return trie
+
+def testkey(w):
+    return {
+        "key" : w,
+        "value" : "1"
+    }
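+
+# Usage sketch: build a trie over a small vocabulary and ask for the
+# completions of a prefix (all weights equal, via testkey; output shown
+# is illustrative):
+#
+#   words = map(testkey, ["philosophy", "physics", "psychology"])
+#   trie = toTrie(words)
+#   print complete(trie, "ph")
+#   # => '["philosophy", "physics"]'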
diff --git a/src/search.py b/src/search.py
new file mode 100755
index 0000000..777222f
--- /dev/null
+++ b/src/search.py
@@ -0,0 +1,237 @@
+#! /usr/bin/python2
+
+import elasticsearch
+
+from elasticsearch_dsl import FacetedSearch, Search, Q
+from elasticsearch_dsl.aggs import Terms, DateHistogram
+from sys import exit, stderr
+from json import dumps, loads
+from itertools import chain, imap
+
+from hashlib import sha1
+
+from textbookExceptions import UnIndexable
+
+from mcmaster.classes import allCourses
+
+# Generic instance of elasticsearch right now
+es = elasticsearch.Elasticsearch()
+
+def summarize(text):
+    splitted = text.split(" ")
+    if len(splitted) > 4:
+        return " ".join(splitted[0:4]) + ".."
+    return text
+
+def sectionToJSON(section):
+    return {
+        "prof" : section.prof,
+        "sem" : section.sem,
+        "day" : section.day
+    }
+
+def classToJSON(clss):
+    return {
+        "title" : clss.title,
+        "sections" : map(sectionToJSON, clss.sections),
+        "dept" : clss.dept,
+        "code" : clss.code,
+        "books" : list(clss.books) if clss.books else []
+    }
+
+
+def truncate(docid):
+    """
+    Truncate a document id to 12 digits
+    The document ID should be based on a
+    hash of unique identifiers
+    """
+    return int(str(docid)[0:12])
+
+def hashsec(course):
+    """
+    Hash a course into a usable id
+    """
+    if not course["code"]:
+        code = ""
+    else:
+        code = course["code"]
+    if not course["title"]:
+        title = ""
+    else:
+        title = course["title"]
+
+    if not course["sections"]:
+        # assigning to course["sections"][0] on an empty list would raise
+        # an IndexError, and the hash below needs a "sem" key to exist
+        course["sections"] = [{"sem" : ""}]
+
+    if not (code or title):
+        raise UnIndexable(course)
+
+    h = sha1()
+    h.update(code + title + course["sections"][0]["sem"])
+    return int(h.hexdigest(), 16)
+
+def createIndex(name):
+    """
+    Create a new index in elasticsearch
+    An index is like a schema in a regular database
+    """
+    indices = elasticsearch.client.IndicesClient(es)
+
+    print indices.create(name)
+    with open("./course.json", "r") as mapping:
+        print indices.put_mapping("course", loads(mapping.read()), name)
+
+def indexListing(course):
+    """
+    Index a specific course in the database (using the courses index)
+    example,
+        {
+         'books': [],
+         'dept': 'COLLAB',
+         'code': '2C03',
+         'sections': [
+             {
+              'prof': 'Lisa Pender',
+              'sem': '2015/09/08 - 2015/12/08',
+              'day': 'Mo'
+             },
+             {
+              'prof': 'Staff',
+              'sem': '2015/09/08 - 2015/12/08',
+              'day': 'Th'
+             }
+         ],
+         'title': 'COLLAB 2C03 - Sociology I'
+        }
+    """
+    courseID = hashsec(course)
+    print es.index(index="oersearch",
+                   doc_type="course",
+                   id=courseID,
+                   body=course)
+
+    # For every course we index, we also create a resource for it
+    # This should be an idempotent operation because we're putting it in couchdb
+    # And we're using the id obtained from the hash function, so it should just update the document
+    # no need to delete anything
+    #try:
+        #courseDept = course[0]["title"].strip().split(" ")[0].strip()
+        #courseCode = course[0]["title"].strip().split(" ")[1].strip()
+        #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
+        #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
+    #except:
+        #print "Couldn't create the resource associated with %s" % course
+
+def termSearch(field):
+    """
+    Make a term search (exact match)
+    """
+    def t(term):
+        q = Q("term",
+              **{
+                  "sections."+field : term
+              })
+        return q
+    return t
+
+def search(field):
+    """
+    Make a match search
+    """
+    def s(term):
+        q = Q("match",
+              **{
+                  field : term
+              })
+        return q
+    return s
+
+def join(x, y):
+    """
+    Join two queries
+    """
+    return x & y
+
+def filterSections(secs):
+    """
+    Get rid of tutorial sections
+    because they almost always have "Staff" as the instructor
+    This is just a heuristic of course
+    """
+    filtered = [s for s in secs.sections if "Staff" not in s.prof]
+    if len(filtered) > 0:
+        return filtered
+    return False
+
+def searchTerms(terms):
+    """
+    Run a search for courses
+    """
+
+    # A list of all the queries we want to run
+    qs = [searchers[field](term) for
+          field, term in
+          terms.iteritems() if
+          term and field in searchers]
+
+    if not qs:
+        # No queries = no results
+        return dumps([])
+
+    # reduce joins all of the queries into one query
+    # It will search for the conjunction of all of them
+    # So that means it cares about each query equally
+    q = reduce(join, qs)
+
+    s = (Search(using=es, index="oersearch")
+         .query(q))[0:100] # only return up to 100 results for now
+
+    results = s.execute()
+
+    filtered = [
+        (secs, filterSections(secs)[0].to_dict()) # get rid of tutorials
+        for secs in results
+        if filterSections(secs)
+    ]
+    results = []
+    for obj, secs in filtered:
+        # Add the truncated course id
+        # This is used to point to the resource page for that course
+        secs["id"] = truncate(obj.meta.id)
+        secs["title"] = obj.title
+        if obj["dept"] not in secs["title"]:
+            secs["dept"] = obj.dept
+        if obj.books:
+            secs["books"] = [
+                {
+                    "booktitle" : summarize(book[0].encode("ASCII")),
+                    "bookauthor" : book[1].encode("ASCII"),
+                    "bookprice" : book[2].encode("ASCII")
+                }
+                for book in obj.books
+            ]
+        else:
+            secs["books"] = ""
+        results.append(secs)
+
+    return dumps(results)
+
+
+searchers = {
+    "title" : search("title"),
+    "loc" : search("loc"),
+    "time" : search("time"),
+    "prof" : search("prof"),
+    "day" : search("day"),
+}
+
+#print searchTerms({"title" : "PHILOS"})
+
+#for c in imap(classToJSON, allCourses()):
+    #try:
+        #print indexListing(c)
+    #except UnIndexable as e:
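+
+# Query-composition sketch: search and termSearch are curried, so a
+# combined query can also be built by hand (field/term values here are
+# illustrative):
+#
+#   q = join(search("title")("PHILOS"), termSearch("prof")("Lisa Pender"))
+#   hits = Search(using=es, index="oersearch").query(q)[0:10].execute()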
diff --git a/src/textbookExceptions.py b/src/textbookExceptions.py
new file mode 100644
index 0000000..999ff3e
--- /dev/null
+++ b/src/textbookExceptions.py
@@ -0,0 +1,24 @@
+#! /usr/bin/python2
+
+class UnIndexable(Exception):
+    def __init__(self, course):
+        self.course = course
+
+    @property
+    def reason(self):
+        course = self.course
+        # default message, in case none of the checks below match;
+        # the checks use elif so the most specific diagnosis wins
+        # instead of being overwritten by a later one
+        message = "of an unknown problem"
+        if not course["code"] and not course["title"]:
+            message = "there was no course code and no title defined"
+        elif not course["code"]:
+            message = "there was no course code defined"
+        elif not course["title"]:
+            message = "there was no course title defined"
+        elif not course["sections"]:
+            message = "there were no sections defined"
+        return """
+        There was a problem with indexing this course.
+        %s
+        There could be several reasons why; my best guess is that %s
+        We need at least the course code, title, and one or more sections to index.
+        """ % (course, message)
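+
+# e.g., catching the exception and printing the diagnostic:
+#
+#   try:
+#       raise UnIndexable({"code" : "", "title" : "", "sections" : []})
+#   except UnIndexable as e:
+#       print e.reason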
diff --git a/src/visualize.py b/src/visualize.py
new file mode 100755
index 0000000..b46a67d
--- /dev/null
+++ b/src/visualize.py
@@ -0,0 +1,97 @@
+#! /usr/bin/python2
+
+from json import load
+from itertools import groupby, chain
+from numpy import mean
+from operator import attrgetter
+
+import pygal
+import csv
+
+class Textbook(object):
+    def __init__(self, dept, code, title, author, price):
+        self.dept = dept
+        self.code = code
+        self.title = title
+        self.author = author
+        self.price = float(price)
+
+    def __repr__(self):
+        return "Dept = %s, Code = %s, %s by %s, costs $%s" % (self.dept,
+                                                              self.code,
+                                                              self.title,
+                                                              self.author,
+                                                              self.price)
+
+
+def courses():
+    with open("./books.csv", "r") as books:
+        booksreader = csv.reader(books)
+        for row in booksreader:
+            # the rest of the module expects Textbook objects, not raw
+            # csv rows; assumes columns are dept,code,title,author,price
+            yield Textbook(*row)
+
+
+def groupDept(courselist):
+    sortedCourses = sorted(courselist, key=attrgetter("dept"))
+    for course in groupby(sortedCourses, attrgetter("dept")):
+        yield course[0], list(course[1])
+
+def meanPrice(books):
+    return mean([book.price for book in books])
+
+# Questions,
+# mean cost per department
+# mean cost per faculty
+# mean difference between book store copies and other copies per dept and faculty
+# number of overlapping books per faculty, do eng students benefit from that?
+
+# maybe a survey for students to see how often they buy books from other sources
+# correlate with how much they could be saving?
+
+facultyDesc = {
+    "hum" : "Humanities",
+    "bus" : "Business",
+    "hlth" : "Health Science",
+    "eng" : "Engineering",
+    "sci" : "Science",
+    "socsci" : "Social Sciences",
+    "artsci" : "Arts & Sciences",
+    "meld" : "MELD"
+}
+
+faculties = load(open("./faculties.json"))
+
+def categorize(dept):
+    # map a department to its faculty description
+    return facultyDesc.get(faculties.get(dept, False), False)
+
+def byFaculty():
+    for dept, books in groupDept(courses()):
+        yield (categorize(dept), dept, books)
+
+def meanFacultyCosts():
+    byfac = list(byFaculty())
+    graph = pygal.Bar()
+    graph.title = "Mean textbook cost by faculty"
+    sortedFacs = sorted(byfac, key=lambda x: x[0])
+    for fac in groupby(sortedFacs, lambda x: x[0]):
+        # pool the books from every department in the faculty, rather
+        # than only the first department's, before taking the mean
+        books = list(chain(*[group[2] for group in fac[1]]))
+        graph.add(fac[0], meanPrice(books))
+    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
+    return graph.render(transpose=True)
+
+def meanCosts():
+    cs = groupDept(courses())
+    graph = pygal.Bar()
+    graph.title = "Mean textbook cost by department"
+    for c in cs:
+        dept, books = c
+        graph.add(dept, meanPrice(books))
+    #graph.render_to_file("./test_graph.svg")
+    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
+    return graph.render_table(style=True, transpose=True)
+
+for x in courses():
+    print x
+#print meanCosts()
+#print meanFacultyCosts()
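+
+# books.csv is assumed to contain one book per row, e.g.:
+#
+#   PHILOS,2B03,The Republic,Plato,15.99
+#
+# which courses() turns into:
+#   Dept = PHILOS, Code = 2B03, The Republic by Plato, costs $15.99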
diff --git a/src/website.py b/src/website.py
new file mode 100755
index 0000000..1fc9374
--- /dev/null
+++ b/src/website.py
@@ -0,0 +1,148 @@
+#! /usr/bin/python2
+
+from functools import partial
+from couchdb import ResourceConflict
+
+from flask import Flask, render_template, flash, request, send_from_directory
+from flask_bootstrap import Bootstrap
+from flask_appconfig import AppConfig
+from urllib import quote, unquote
+from search import searchTerms
+
+from openlibrary import bookUrls
+
+from archive import searchIA
+from json import dumps, loads
+
+from werkzeug.contrib.cache import MemcachedCache
+cache = MemcachedCache(['127.0.0.1:11211'])
+
+import os
+
+# completers maps a field type to its completion function; it was
+# referenced below without ever being defined, so an empty placeholder
+# is assumed here (e.g. built from predictions.toTrie/complete)
+completers = {}
+
+def predict(fieldtype, term):
+    print fieldtype
+    print term
+    if not term:
+        return "[]"
+    else:
+        try:
+            cs = completers[fieldtype](term.lower())
+        except KeyError:
+            return "[]"
+        if cs:
+            return cs
+        return "[]"
+
+def predictor(fieldtype):
+    def inner(request):
+        params = dict(request.args.items())
+        return predict(fieldtype, params["term"])
+    return inner
+
+# the predict* handlers used by the routes below were also undefined;
+# the field-type names given here are assumptions
+predictbuild = predictor("building")
+predictloc = predictor("loc")
+predictday = predictor("day")
+predictdept = predictor("dept")
+predicttitle = predictor("title")
+
+def cacheit(key, thunk):
+    """
+    Tries to find a cached version of ``key''
+    If there is no cached version then it will
+    evaluate thunk (which must be a generator)
+    and cache that, then return the result
+    """
+    cached = cache.get(quote(key))
+    if cached is None:
+        result = list(thunk())
+        cache.set(quote(key), result)
+        return result
+    return cached
+
+def ClassSearch(configfile=None):
+    defaults = {"Day", "Building", "Exact Location", "Department"}
+    app = Flask(__name__)
+    AppConfig(app, configfile) # Flask-Appconfig is not necessary, but
+                               # highly recommend =)
+                               # https://github.com/mbr/flask-appconfig
+    Bootstrap(app)
+
+    app.config["scripts"] = "/home/wes/MGOAL/scripts"
+    app.config["styles"] = "/home/wes/MGOAL/styles"
+
+    @app.route('/favicon.ico')
+    def favicon():
+        return send_from_directory("/srv/http/goal/favicon.ico",
+                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')
+
+    @app.route("/buildpred", methods=("GET", "POST"))
+    def buildpred():
+        return predictbuild(request)
+
+    @app.route("/locpred", methods=("GET", "POST"))
+    def locpred():
+        return predictloc(request)
+
+    @app.route("/daypred", methods=("GET", "POST"))
+    def daypred():
+        return predictday(request)
+
+    @app.route("/deptpred", methods=("GET", "POST"))
+    def deptpred():
+        return predictdept(request)
+
+    @app.route("/titlepred", methods=("GET", "POST"))
+    def titlepred():
+        return predicttitle(request)
+
+    @app.route("/", methods=("GET", "POST"))
+    def index():
+        return render_template("search.html")
+
+    @app.route("/fc", methods=("GET", "POST"))
+    def fc():
+        """ Filter Courses """
+        print "trying to get courses"
+        params = dict(request.args.items())
+        # iterate over a copy of the items; deleting from the dict while
+        # iterating over iteritems() would raise a RuntimeError
+        for key, val in params.items():
+            if val in defaults:
+                del params[key]
+        results = searchTerms(params)
+        return results
+
+    @app.route("/resources", methods=("GET", "POST"))
+    def resources():
+        """ Get Resources """
+        notRequired = False
+        params = loads(dict(request.args.items())["data"])
+        print params
+        author = params["author"]
+        title = params["title"]
+
+        if ("No Textbooks" in title or
+            "No Adoption" in title):
+            return dumps("false")
+
+        # Cache the result of the open library search
+        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
+        print openlib
+
+        # cache the result of an internet archive search
+        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
+        print iarchive
+
+        if not (any(openlib) or any(iarchive)):
+            # We literally could not find ANYTHING
+            return dumps("false")
+
+        return dumps({
+            "iarchive" : iarchive,
+            "openlib" : openlib
+        })
+
+    # the <path:filename> converters were missing from these two routes,
+    # so Flask would never pass a filename argument to the handlers
+    @app.route("/scripts/<path:filename>")
+    def send_script(filename):
+        return send_from_directory(app.config["scripts"], filename)
+
+    @app.route("/styles/<path:filename>")
+    def send_style(filename):
+        return send_from_directory(app.config["styles"], filename)
+
+    return app
+
+if __name__ == "__main__":
+    ClassSearch().run(port=8001, debug=True)
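+
+# Wiring sketch for the completers placeholder above (assumption: one
+# trie per field, built offline from the course data), e.g.:
+#
+#   import predictions
+#   titles = map(predictions.testkey, ["philos 2b03", "physics 1a03"])
+#   completers["title"] = partial(predictions.complete, predictions.toTrie(titles))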