
move files to src/ directory

master · wes · 9 years ago · commit 28abba8922
  1. src/archive.py (+34)
  2. src/database.py (+62)
  3. src/goasearch.py (+14)
  4. src/openlibrary.py (+24)
  5. src/predictions.py (+153)
  6. src/search.py (+237)
  7. src/textbookExceptions.py (+24)
  8. src/visualize.py (+97)
  9. src/website.py (+148)

src/archive.py (+34)

@@ -0,0 +1,34 @@
#! /usr/bin/python2
from urllib import quote
from json import loads, dumps

import requests as req

searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw"

def searchIA(title, author):
    """
    Do a search on The Internet Archive for a book
    """
    print "running a search"
    requrl = searchUrl.format(quote(title + " " + author))
    try:
        # strip the "callback(" prefix and the trailing ")" to get bare JSON
        results = loads(req.get(requrl).text[9:][0:-1])
    except ValueError:
        return []
    rownum = int(results["responseHeader"]["params"]["rows"])
    if rownum < 1:
        print "Couldn't find results for %s %s" % (title, author)
        return []
    docs = results["response"]["docs"]
    urls = []
    for result in docs[0:3]:
        urls.append("https://archive.org/details/%s" % result["identifier"])
    return urls

# Example: search for David Hume's Enquiry Concerning Human Understanding
#for url in searchIA("Enquiry Concerning Human Understanding", "Hume"):
#    print url

src/database.py (+62)

@@ -0,0 +1,62 @@
#! /usr/bin/python2
from sys import argv
from hashlib import sha1

# Assumed wiring (not shown in this commit): the resource documents live
# in a local CouchDB instance; ResourceConflict is couchdb's conflict error.
from couchdb import Server, ResourceConflict
localdb = Server()["resources"]  # hypothetical database name

def truncate(docid):
    """
    Truncate a document id to 12 digits
    The document ID should be based on a
    hash of unique identifiers
    """
    return int(str(docid)[0:12])

def createResource(textbookInfo, course, dept, coursecode, docid):
    """
    Create a document associated with a course
    This document contains any/all resources associated
    with that course
    example,
    {
      'books': [],
      'dept': 'COLLAB',
      'code': '2C03',
      'sections': [
        {
          'prof': 'Lisa Pender',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Mo'
        },
        {
          'prof': 'Staff',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Th'
        }
      ],
      'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    textbooks = textbookInfo(dept.strip(), coursecode.strip())
    # We truncate the id so we can have nicer looking URLs,
    # since the id will be used to point to the resource page for that course
    _id = str(truncate(docid))
    fields = {
        "_id" : _id,
        "textbooks" : textbooks,
        "coursetitle" : "%s %s" % (dept.strip(), coursecode.strip()),
        "courseinfo" : course
        #"Syllabus" : "blah"
    }
    try:
        revisions = list(localdb.revisions(_id))
        if not revisions:
            return localdb.save(fields)
        else:
            # update the existing document in place, keeping its revision
            rev = dict(revisions[0])["_rev"]
            fields["_rev"] = rev
            return localdb.save(fields)
    except ResourceConflict:
        print "Resource for %s already exists, not creating a new one" % docid

src/goasearch.py (+14)

@@ -0,0 +1,14 @@
#! /usr/bin/python2
# predictive data
# TODO: switch to elasticsearch's prediction
import database
import predictions

class GOASearch(object):
    def __init__(self):
        # __init__ must return None; returning self raises a TypeError
        pass

src/openlibrary.py (+24)

@@ -0,0 +1,24 @@
#! /usr/bin/python2
from urllib import quote
from json import loads, dumps

import requests as req

#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s"
searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s'

def bookUrls(title, author):
    print title, author
    # drop any subtitle; Open Library matches better on the main title
    if ":" in title:
        title = title.split(":")[0]
    requrl = searchurl % (quote(author), quote(title))
    results = loads(req.get(requrl).text)
    for result in results["docs"][0:2]:
        if result.has_key("edition_key"):
            yield "https://openlibrary.org/books/%s" % result["edition_key"][0]

# Example query:
# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle'
#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"):
#    print book

src/predictions.py (+153)

@@ -0,0 +1,153 @@
#! /usr/bin/python2
from itertools import groupby, chain
from sys import stdout
from functools import partial
from json import dumps

def gensymer():
    """Return a closure that generates fresh node ids ("0", "1", "2", ...)"""
    n = [0]
    def inner():
        result = str(n[0])
        n[0] += 1
        return result
    return inner

gensym = gensymer()

def printTrie(graph, prev, trie, weight):
    """Walk the trie, adding its nodes and weighted edges to a graphviz graph"""
    new_node = gensym()
    graph.node(new_node, "%s" % trie.letter)
    graph.edge(prev, new_node, label="%.2f" % weight)
    if not trie.children:
        return
    for child, weight in zip(trie.children, trie.ws):
        printTrie(graph, new_node, child, weight)

class Trie(object):
    def __init__(self, letter, children, ws):
        self.letter = letter
        self.children = children
        self.ws = ws

def probweight(suffixes):
    """Normalize the suffix counts into probabilities"""
    weights = [float(s["value"]) for s in suffixes]
    s = float(sum(weights))
    ws = [w/s for w in weights]
    return ws

def buildtrie(trie, suffixes):
    """
    Build a trie, also known as a prefix tree, of all the possible completions
    """
    trie.children = []
    for letter, suffs in suffixes:
        ped = partition(suffs)
        if any(map(lambda p: p[0], ped)):
            # check if there are any children
            trie.children.append(buildtrie(Trie(letter, [], probweight(suffs)), ped))
        else:
            # we've reached the end of this word so just include the final letter
            # [1] = there is a probability of 1 of reaching this single leaf node,
            # since it is the only possible completion here
            trie.children.append(Trie(letter, [], [1]))
    return trie

def keyf(x):
    if not x["key"]:
        return ""
    return x["key"][0]

def tails(words):
    for word in words:
        yield {
            "key" : word["key"][1:],
            "value" : word["value"]
        }

def partition(words):
    """
    Partition the words into different prefixes based on the first character
    """
    groups = [
        (g[0], list(tails(g[1])))
        for g in groupby(
            sorted(words, key=keyf),
            key=keyf)
    ]
    return groups

def flatten_helper(letter, trie):
    return ([letter + child.letter for
             child in trie.children], trie.children)

def flatten(trie):
    if not trie.children:
        return trie.letter
    prefixes, suffixes = flatten_helper(trie.letter, trie)
    return [flatten(Trie(p, s2.children, s2.ws)) for p, s2 in zip(prefixes, suffixes)]

def flattenlist(xs):
    locs = []
    for x in xs:
        if not isinstance(x, list):
            locs.append(x)
        else:
            locs.extend(flattenlist(x))
    return locs

def matchc(trie, prefix):
    c = prefix[0] if len(prefix) > 1 else prefix
    return [ch for ch in trie.children if ch.letter == c]

def match(trie, word):
    if not word:
        return []
    m = matchc(trie, word[0])
    if not m:
        return []
    else:
        return [m[0]] + match(m[0], word[1:])

def complete(trie, word):
    m = match(trie, word)
    if len(word) != len(m):
        return False
    completions = [word+x[1:] for x in flattenlist(flatten(m[-1]))]
    if len(completions) > 10:
        return dumps(completions[0:10])
    return dumps(completions)

def sortTrie(trie):
    """
    Sort the children of each node in descending order
    of the probability that each child would be the completion
    of whatever that word is
    """
    if not trie.children:
        return
    sortedChilds = sorted(zip(trie.children, trie.ws), key=lambda x: x[1], reverse=True)
    trie.children = [x[0] for x in sortedChilds]
    trie.ws = [x[1] for x in sortedChilds]
    for child in trie.children:
        sortTrie(child)

def toTrie(words):
    for word in words:
        word["key"] = word["key"].lower()
    trie = buildtrie(Trie("", [], [1]), partition(words))
    trie.ws = [1]*len(trie.children)
    sortTrie(trie)
    return trie

def testkey(w):
    return {
        "key" : w,
        "value" : "1"
    }
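
In the same commented-example style, a sketch of building and querying the trie; the word list is hypothetical, and complete returns its completions as a JSON string:

#words = [testkey(w) for w in ["help", "hello", "held", "cat"]]
#trie = toTrie(words)
#print complete(trie, "hel")   # a JSON list, e.g. ["held", "hello", "help"]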

src/search.py (+237)

@@ -0,0 +1,237 @@
#! /usr/bin/python2
import elasticsearch
from elasticsearch_dsl import FacetedSearch, Search, Q
from elasticsearch_dsl.aggs import Terms, DateHistogram
from sys import exit, stderr
from json import dumps, loads
from itertools import chain, imap
from hashlib import sha1
from textbookExceptions import UnIndexable
from mcmaster.classes import allCourses

# Generic instance of elasticsearch right now
es = elasticsearch.Elasticsearch()

def summarize(text):
    splitted = text.split(" ")
    if len(splitted) > 4:
        return " ".join(splitted[0:4]) + ".."
    return text

def sectionToJSON(section):
    return {
        "prof" : section.prof,
        "sem" : section.sem,
        "day" : section.day
    }

def classToJSON(clss):
    return {
        "title" : clss.title,
        "sections" : map(sectionToJSON, clss.sections),
        "dept" : clss.dept,
        "code" : clss.code,
        "books" : list(clss.books) if clss.books else []
    }

def truncate(docid):
    """
    Truncate a document id to 12 digits
    The document ID should be based on a
    hash of unique identifiers
    """
    return int(str(docid)[0:12])

def hashsec(course):
    """
    Hash a course into a usable id
    """
    if not course["code"]:
        code = ""
    else:
        code = course["code"]
    if not course["title"]:
        title = ""
    else:
        title = course["title"]
    if not course["sections"]:
        # no sections were scraped; fall back to a blank semester
        course["sections"] = [{"sem" : ""}]
    if not (code or title):
        raise UnIndexable(course)
    h = sha1()
    h.update(code + title + course["sections"][0]["sem"])
    return int(h.hexdigest(), 16)

def createIndex(name):
    """
    Create a new index in elasticsearch
    An index is like a schema in a regular database
    """
    indices = elasticsearch.client.IndicesClient(es)
    print indices.create(name)
    with open("./course.json", "r") as mapping:
        print indices.put_mapping("course", loads(mapping.read()), name)

def indexListing(course):
    """
    Index a specific course in the database (using the courses index)
    example,
    {
      'books': [],
      'dept': 'COLLAB',
      'code': '2C03',
      'sections': [
        {
          'prof': 'Lisa Pender',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Mo'
        },
        {
          'prof': 'Staff',
          'sem': '2015/09/08 - 2015/12/08',
          'day': 'Th'
        }
      ],
      'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    courseID = hashsec(course)
    print es.index(index="oersearch",
                   doc_type="course",
                   id=courseID,
                   body=course)
    # For every course we index, we also create a resource for it.
    # This should be an idempotent operation because we're putting it in couchdb,
    # and we're using the id obtained from the hash function, so it should just
    # update the document; no need to delete anything
    #try:
        #courseDept = course[0]["title"].strip().split(" ")[0].strip()
        #courseCode = course[0]["title"].strip().split(" ")[1].strip()
        #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
        #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
    #except:
        #print "Couldn't create the resource associated with %s" % course

def termSearch(field):
    """
    Make a term search (exact match)
    """
    def t(term):
        q = Q("term",
              **{
                  "sections."+field : term
              })
        return q
    return t

def search(field):
    """
    Make a match search
    """
    def s(term):
        q = Q("match",
              **{
                  field : term
              })
        return q
    return s

def join(x, y):
    """
    Join two queries
    """
    return x & y

def filterSections(secs):
    """
    Get rid of tutorial sections,
    because they almost always have "Staff" as the instructor
    This is just a heuristic, of course
    """
    filtered = [s for s in secs.sections if "Staff" not in s.prof]
    if len(filtered) > 0:
        return filtered
    return False

def searchTerms(terms):
    """
    Run a search for courses
    """
    # A list of all the queries we want to run
    qs = [searchers[field](term) for
          field, term in
          terms.iteritems() if
          term and searchers.has_key(field)]
    if not qs:
        # No queries = no results
        return dumps([])
    # reduce joins all of the queries into one query
    # It will search for the conjunction of all of them,
    # so it cares about each query equally
    q = reduce(join, qs)
    s = (Search(using=es, index="oersearch")
         .query(q))[0:100] # only return up to 100 results for now
    results = s.execute()
    filtered = [
        (secs, filterSections(secs)[0].to_dict()) # get rid of tutorials
        for secs in results
        if filterSections(secs)
    ]
    results = []
    for obj, secs in filtered:
        # Add the truncated course id
        # This is used to point to the resource page for that course
        secs["id"] = truncate(obj.meta.id)
        secs["title"] = obj.title
        if obj["dept"] not in secs["title"]:
            secs["dept"] = obj.dept
        if obj.books:
            secs["books"] = [
                {
                    "booktitle" : summarize(book[0].encode("ASCII")),
                    "bookauthor" : book[1].encode("ASCII"),
                    "bookprice" : book[2].encode("ASCII")
                }
                for book in obj.books
            ]
        else:
            secs["books"] = ""
        results.append(secs)
    return dumps(results)

searchers = {
    "title" : search("title"),
    "loc" : search("loc"),
    "time" : search("time"),
    "prof" : search("prof"),
    "day" : search("day"),
}

#print searchTerms({"title" : "PHILOS"})

#for c in imap(classToJSON, allCourses()):
    #try:
        #print indexListing(c)
    #except UnIndexable as e:
        #print e.reason

src/textbookExceptions.py (+24)

@@ -0,0 +1,24 @@
#! /usr/bin/python2
class UnIndexable(Exception):
    def __init__(self, course):
        self.course = course

    @property
    def reason(self):
        course = self.course
        message = ""
        if not course["code"] and not course["title"]:
            message = "there was no course code and no title defined"
        elif not course["code"]:
            message = "there was no course code defined"
        elif not course["title"]:
            message = "there was no course title defined"
        elif not course["sections"]:
            message = "there were no sections defined"
        return """
        There was a problem with indexing this course.
        %s
        There could be several reasons why, my best guess is that %s
        We need at least the course code, title, and one or more sections to index
        """ % (course, message)

src/visualize.py (+97)

@@ -0,0 +1,97 @@
#! /usr/bin/python2
from json import loads, load
from re import sub, split
from itertools import groupby, chain
from numpy import mean
from operator import attrgetter
import pygal
import csv

class Textbook(object):
    def __init__(self, dept, code, title, author, price):
        self.dept = dept
        self.code = code
        self.title = title
        self.author = author
        self.price = float(price)

    def __repr__(self):
        return "Dept = %s, Code = %s, %s by %s, costs $%s" % (self.dept,
                                                              self.code,
                                                              self.title,
                                                              self.author,
                                                              self.price)

def courses():
    with open("./books.csv", "r") as books:
        booksreader = csv.reader(books)
        for row in booksreader:
            # assumes the CSV columns are dept, code, title, author, price
            yield Textbook(*row)

def groupDept(courselist):
    sortedCourses = sorted(courselist, key=attrgetter("dept"))
    for course in groupby(sortedCourses, attrgetter("dept")):
        yield course[0], list(course[1])

def meanPrice(books):
    return mean([book.price for book in books])

# Questions:
# mean cost per department
# mean cost per faculty
# mean difference between book store copies and other copies per dept and faculty
# number of overlapping books per faculty, do eng students benefit from that?
# maybe a survey for students to see how often they buy books from other sources
# correlate with how much they could be saving?

facultyDesc = {
    "hum" : "Humanities",
    "bus" : "Business",
    "hlth" : "Health Science",
    "eng" : "Engineering",
    "sci" : "Science",
    "socsci" : "Social Sciences",
    "artsci" : "Arts & Sciences",
    "meld" : "MELD"
}

faculties = load(open("./faculties.json"))

def categorize(dept):
    # map a department to its faculty description
    return facultyDesc.get(faculties.get(dept, False), False)

def byFaculty():
    for dept, books in groupDept(courses()):
        yield (categorize(dept), dept, books)

def meanFacultyCosts():
    byfac = list(byFaculty())
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by faculty"
    sortedFacs = sorted(byfac, key=lambda x: x[0])
    for fac, group in groupby(sortedFacs, lambda x: x[0]):
        # pool the books of every department in this faculty
        # (the original only counted the first department per faculty)
        books = list(chain(*[g[2] for g in group]))
        graph.add(fac, meanPrice(books))
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render(transpose=True)

def meanCosts():
    cs = groupDept(courses())
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by department"
    for c in cs:
        dept, books = c
        graph.add(dept, meanPrice(books))
    #graph.render_to_file("./test_graph.svg")
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render_table(style=True, transpose=True)

for x in courses():
    print x
#print meanCosts()
#print meanFacultyCosts()
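
Following the render_to_file hint in meanCosts, a sketch of writing the rendered table out for the website to serve (the filename is hypothetical):

#with open("costs.html", "w") as out:
#    out.write(meanCosts())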

src/website.py (+148)

@@ -0,0 +1,148 @@
#! /usr/bin/python2
from functools import partial
from couchdb import ResourceConflict
from flask import Flask, render_template, flash, request, send_from_directory
from flask_bootstrap import Bootstrap
from flask_appconfig import AppConfig
from urllib import quote, unquote
from json import dumps, loads
from werkzeug.contrib.cache import MemcachedCache
import os

from search import searchTerms
from openlibrary import bookUrls
from archive import searchIA

cache = MemcachedCache(['127.0.0.1:11211'])

# Assumption: ``completers'' should map field names to completion
# functions (e.g. tries built with predictions.toTrie); it is not
# defined anywhere in this commit, so an empty placeholder is used
# and predict() degrades gracefully via the KeyError handler.
completers = {}

def predict(fieldtype, term):
    print fieldtype
    print term
    if not term:
        return "[]"
    else:
        try:
            cs = completers[fieldtype](term.lower())
        except KeyError:
            return "[]"
        if cs:
            return cs
        return "[]"

def predictor(fieldtype):
    def inner(request):
        params = dict(request.args.items())
        return predict(fieldtype, params["term"])
    return inner

# Per-field predictors; each closes over its field type.
# Assumption: these definitions are not shown in the commit, but the
# route handlers below call them by these names.
predictbuild = predictor("building")
predictloc = predictor("loc")
predictday = predictor("day")
predictdept = predictor("dept")
predicttitle = predictor("title")

def cacheit(key, thunk):
    """
    Tries to find a cached version of ``key''
    If there is no cached version then it will
    evaluate thunk (which must be a generator)
    and cache that, then return the result
    """
    cached = cache.get(quote(key))
    if cached is None:
        result = list(thunk())
        cache.set(quote(key), result)
        return result
    return cached

def ClassSearch(configfile=None):
    defaults = {"Day", "Building", "Exact Location", "Department"}
    app = Flask(__name__)
    AppConfig(app, configfile) # Flask-Appconfig is not necessary, but
                               # highly recommended =)
                               # https://github.com/mbr/flask-appconfig
    Bootstrap(app)
    app.config["scripts"] = "/home/wes/MGOAL/scripts"
    app.config["styles"] = "/home/wes/MGOAL/styles"

    @app.route('/favicon.ico')
    def favicon():
        # send_from_directory takes the directory, not the full file path
        return send_from_directory("/srv/http/goal",
                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')

    @app.route("/buildpred", methods=("GET", "POST"))
    def buildpred():
        return predictbuild(request)

    @app.route("/locpred", methods=("GET", "POST"))
    def locpred():
        return predictloc(request)

    @app.route("/daypred", methods=("GET", "POST"))
    def daypred():
        return predictday(request)

    @app.route("/deptpred", methods=("GET", "POST"))
    def deptpred():
        return predictdept(request)

    @app.route("/titlepred", methods=("GET", "POST"))
    def titlepred():
        return predicttitle(request)

    @app.route("/", methods=("GET", "POST"))
    def index():
        return render_template("search.html")

    @app.route("/fc", methods=("GET", "POST"))
    def fc():
        """ Filter Courses """
        print "trying to get courses"
        params = dict(request.args.items())
        # items() returns a copy in Python 2, so deleting
        # keys while iterating is safe here
        for key, val in params.items():
            if val in defaults:
                del params[key]
        results = searchTerms(params)
        return results

    @app.route("/resources", methods=("GET", "POST"))
    def resources():
        """ Get Resources """
        params = loads(dict(request.args.items())["data"])
        print params
        author = params["author"]
        title = params["title"]
        if ("No Textbooks" in title or
            "No Adoption" in title):
            return dumps("false")
        # Cache the result of the Open Library search
        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
        print openlib
        # Cache the result of an Internet Archive search
        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
        print iarchive
        if not (any(openlib) or any(iarchive)):
            # We literally could not find ANYTHING
            return dumps("false")
        return dumps({
            "iarchive" : iarchive,
            "openlib" : openlib
        })

    @app.route("/scripts/<filename>")
    def send_script(filename):
        return send_from_directory(app.config["scripts"], filename)

    @app.route("/styles/<filename>")
    def send_style(filename):
        return send_from_directory(app.config["styles"], filename)

    return app

if __name__ == "__main__":
    ClassSearch().run(port=8001, debug=True)
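
A possible smoke test once the app is running locally, hitting the /fc route with the requests library (the query value is hypothetical; field names come from search.searchers):

#import requests
#print requests.get("http://localhost:8001/fc", params={"title" : "PHILOS"}).text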