From 52432f4c0c95844c7b74053357a6543195b9971a Mon Sep 17 00:00:00 2001
From: wes
Date: Mon, 20 Jun 2016 04:24:25 -0400
Subject: [PATCH] test

---
 archive.py            |  34 ------
 database.py           |  62 -----------
 goasearch.py          |  14 ---
 openlibrary.py        |  24 -----
 predictions.py        | 153 ---------------------------
 search.py             | 237 ------------------------------------
 textbookExceptions.py |  24 -----
 visualize.py          |  97 -----------------
 website.py            | 148 --------------------------
 9 files changed, 793 deletions(-)
 delete mode 100755 archive.py
 delete mode 100755 database.py
 delete mode 100755 goasearch.py
 delete mode 100755 openlibrary.py
 delete mode 100755 predictions.py
 delete mode 100755 search.py
 delete mode 100644 textbookExceptions.py
 delete mode 100755 visualize.py
 delete mode 100755 website.py

diff --git a/archive.py b/archive.py
deleted file mode 100755
index 73fcde7..0000000
--- a/archive.py
+++ /dev/null
@@ -1,34 +0,0 @@
-#! /usr/bin/python2
-
-from urllib import quote
-from json import loads, dumps
-
-import requests as req
-
-searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw"
-
-def searchIA(title, author):
-    """
-    Do a search on The Internet Archive for a book
-    """
-    print "running a search"
-    requrl = searchUrl.format(quote(title + " " + author))
-    try:
-        results = loads(req.get(requrl).text[9:][0:-1])
-    except ValueError:
-        return []
-
-    rownum = results["responseHeader"]["params"]["rows"]
-    if rownum < 1:
-        print "Couldn't find results for %s %s" % (title, author)
-        return []
-    docs = results["response"]["docs"]
-    urls = []
-    for result in results["response"]["docs"][0:3]:
-        urls.append("https://archive.org/details/%s" % result["identifier"])
-    return urls
-
-
-# Example, search for David Hume's Enquiry Concerning Human Understanding
-#for url in searchIA("Hume", "Enquiry Concerning Human Understanding"):
-    #print url
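The [9:][0:-1] slice in searchIA strips a JSONP wrapper: because the query string pins
callback=callback, archive.org responds with callback(...) rather than bare JSON. A
minimal usage sketch, assuming archive.py is importable and archive.org is reachable
(note the signature takes title first, then author, while the commented example above
passes them the other way around):

    from archive import searchIA

    for url in searchIA("An Enquiry Concerning Human Understanding", "David Hume"):
        print url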
diff --git a/database.py b/database.py
deleted file mode 100755
index a19272c..0000000
--- a/database.py
+++ /dev/null
@@ -1,62 +0,0 @@
-#! /usr/bin/python2
-
-from sys import argv
-from hashlib import sha1
-
-def truncate(docid):
-    """
-    Truncate a document id to 12 digits
-    The document ID should be based on a
-    hash of unique identifiers
-    """
-    return int(str(docid)[0:12])
-
-def createResource(textbookInfo, course, dept, coursecode, docid):
-    """
-    Create a document associated with a course
-    This document contains any/all resources associated
-    with that course
-
-    example,
-    {
-        'books': [],
-        'dept': 'COLLAB',
-        'code': '2C03',
-        'sections': [
-            {
-                'prof': 'Lisa Pender',
-                'sem': '2015/09/08 - 2015/12/08',
-                'day': 'Mo'
-            },
-            {
-                'prof': 'Staff',
-                'sem': '2015/09/08 - 2015/12/08',
-                'day': 'Th'
-            }
-        ],
-        'title': 'COLLAB 2C03 - Sociology I'
-    }
-    """
-    textbooks = textbookInfo(dept.strip(), coursecode.strip())
-
-    # We truncate the id so we can have nicer looking URLs
-    # Since the id will be used to point to the resource page for that course
-    _id = str(truncate(docid))
-
-    fields = {
-        "_id" : _id,
-        "textbooks" : textbooks,
-        "coursetitle" : "%s %s" % (dept.strip(), coursecode.strip()),
-        "courseinfo" : course
-        #"Syllabus" : "blah"
-    }
-    try:
-        revisions = list(localdb.revisions(_id))
-        if not revisions:
-            return localdb.save(fields)
-        else:
-            rev = dict(revisions[0])["_rev"]
-            fields["_rev"] = rev
-            return localdb.save(fields)
-    except ResourceConflict:
-        print "Resource for %s already exists, not creating a new one" % (docid)
diff --git a/goasearch.py b/goasearch.py
deleted file mode 100755
index 3dca7eb..0000000
--- a/goasearch.py
+++ /dev/null
@@ -1,14 +0,0 @@
-#! /usr/bin/python2
-
-# predictive data
-# switch to elasticsearch's prediction
-
-
-
-import database
-import predictions
-
-class GOASearch(object):
-    def __init__(self):
-        return self
-
diff --git a/openlibrary.py b/openlibrary.py
deleted file mode 100755
index d558c21..0000000
--- a/openlibrary.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#! /usr/bin/python2
-
-from urllib import quote
-from json import loads, dumps
-
-import requests as req
-
-#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s"
-searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s'
-
-def bookUrls(title, author):
-    print title, author
-    if ":" in title:
-        title = title.split(":")[0]
-    requrl = searchurl % (quote(author), quote(title))
-    results = loads(req.get(requrl).text)
-    for result in results["docs"][0:2]:
-        if result.has_key("edition_key"):
-            yield "https://openlibrary.org/books/%s" % result["edition_key"][0]
-
-# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle'
-
-#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"):
    #print book
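createResource in database.py uses localdb and catches ResourceConflict without importing
or defining either, so the module as deleted would not run on its own. A minimal sketch
of the wiring it appears to assume (the server URL and database name are guesses, not
taken from this patch):

    from couchdb import Server, ResourceConflict

    couch = Server("http://127.0.0.1:5984/")  # assumed local CouchDB instance
    localdb = couch["courses"]                # assumed database name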
diff --git a/predictions.py b/predictions.py
deleted file mode 100755
index b770a0b..0000000
--- a/predictions.py
+++ /dev/null
@@ -1,153 +0,0 @@
-##! /usr/bin/python2
-from itertools import groupby, chain
-from sys import stdout
-from functools import partial
-from json import dumps
-
-def gensymer():
-    n = [0]
-    def inner():
-        result = str(n[0])
-        n[0] += 1
-        return result
-    return inner
-
-gensym = gensymer()
-
-def printTrie(graph, prev, trie, weight):
-    new_node = str(gensym())
-    graph.node(new_node, "%s" % trie.letter)
-    graph.edge(prev, new_node, label="%.2f" % weight)
-    if not trie.children:
-        return
-    for child, weight in zip(trie.children, trie.ws):
-        printTrie(graph, new_node, child, weight)
-
-
-class Trie(object):
-    def __init__(self, letter, children, ws):
-        self.letter = letter
-        self.children = children
-        self.ws = ws
-
-def probweight(suffixes):
-    weights = [float(s["value"]) for s in suffixes]
-    s = float(sum(weights))
-    ws = [w/s for w in weights]
-    return ws
-
-def buildtrie(trie, suffixes):
-    """
-    Build a trie, also known as a prefix tree, of all the possible completions
-    """
-    trie.children = []
-    for letter, suffs in suffixes:
-        ped = partition(suffs)
-        if any(map(lambda p: p[0], ped)):
-            # check if there are any children
-            trie.children.append(buildtrie(Trie(letter, [], probweight(suffs)), partition(suffs)))
-        else:
-            # we've reached the end of this word so just include the final letter
-            # [1] = there is a probability of 1 of reaching this single leaf node,
-            # since it is the only possible completion here
-            trie.children.append(Trie(letter, [], [1]))
-    return trie
-
-
-def keyf(x):
-    if not x["key"]:
-        return ""
-    return x["key"][0]
-
-def tails(words):
-    for word in words:
-        yield {
-            "key" : word["key"][1:],
-            "value" : word["value"]
-        }
-
-def partition(words):
-    """
-    Partition the words into different prefixes based on the first character
-    """
-    groups = [
-        (g[0], list(tails(g[1])))
-        for g in groupby(
-            sorted(words, key=keyf),
-            key=keyf)
-    ]
-    return groups
-
-
-def flatten_helper(letter, trie):
-    return ([letter + child.letter for
-             child in trie.children], trie.children)
-
-def flatten(trie):
-    if not trie.children:
-        return trie.letter
-    prefixes, suffixes = flatten_helper(trie.letter, trie)
-    return [flatten(Trie(p, s2.children, s2.ws)) for p, s2 in zip(prefixes, suffixes)]
-
-def flattenlist(xs):
-    locs = []
-    for x in xs:
-        if not isinstance(x, list):
-            locs.append(x)
-        else:
-            locs.extend(flattenlist(x))
-    return locs
-
-def matchc(trie, prefix):
-    c = None
-    if len(prefix) > 1:
-        c = prefix[0]
-    else:
-        c = prefix
-    return [ch for ch in trie.children if ch.letter == c]
-
-def match(trie, word):
-    if not word:
-        return []
-    m = matchc(trie, word[0])
-    if not m:
-        return []
-    else:
-        return [m[0]] + match(m[0], word[1:])
-
-def complete(trie, word):
-    m = match(trie, word)
-    if len(word) != len(m):
-        return False
-    completions = [word+x[1:] for x in flattenlist(flatten(m[-1]))]
-    if len(completions) > 10:
-        return dumps(completions[0:10])
-    return dumps(completions)
-
-def sortTrie(trie):
-    """
-    Sort the children of each node in descending order
-    of the probability that each child would be the completion
-    of whatever that word is
-    """
-    if not trie.children:
-        return
-    sortedChilds = sorted(zip(trie.children, trie.ws), key=lambda x: x[1], reverse=True)
-    trie.children = [x[0] for x in sortedChilds]
-    trie.ws = [x[1] for x in sortedChilds]
-    for child in trie.children:
-        sortTrie(child)
-
-def toTrie(words):
-    for word in words:
-        word["key"] = word["key"].lower()
-    trie = buildtrie(Trie("", [], [1]), partition(words))
-    trie.ws = [1]*len(trie.children)
-    sortTrie(trie)
-    return trie
-
-def testkey(w):
-    return {
-        "key" : w,
-        "value" : "1"
-    }
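toTrie expects a list of {"key": ..., "value": ...} dicts (testkey wraps a word with a
weight of "1"), builds the weighted prefix tree, and complete then returns up to ten
completions as a JSON string. A minimal sketch of that flow:

    from predictions import toTrie, complete, testkey

    # Hypothetical word list; "value" weights would normally come from usage counts.
    trie = toTrie(map(testkey, ["philosophy", "physics", "phonetics"]))
    print complete(trie, "ph")  # -> ["philosophy", "phonetics", "physics"] as JSON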
diff --git a/search.py b/search.py
deleted file mode 100755
index 777222f..0000000
--- a/search.py
+++ /dev/null
@@ -1,237 +0,0 @@
-#! /usr/bin/python2
-
-import elasticsearch
-
-from elasticsearch_dsl import FacetedSearch, Search, Q
-from elasticsearch_dsl.aggs import Terms, DateHistogram
-from sys import exit, stderr
-from json import dumps, loads
-from itertools import chain, imap
-
-from hashlib import sha1
-
-from textbookExceptions import UnIndexable
-
-from mcmaster.classes import allCourses
-
-# Generic instance of elasticsearch right now
-es = elasticsearch.Elasticsearch()
-
-def summarize(text):
-    splitted = text.split(" ")
-    if len(splitted) > 4:
-        return " ".join(splitted[0:4]) + ".."
-    return text
-
-def sectionToJSON(section):
-    return {
-        "prof" : section.prof,
-        "sem" : section.sem,
-        "day" : section.day
-    }
-
-def classToJSON(clss):
-    return {
-        "title" : clss.title,
-        "sections" : map(sectionToJSON, clss.sections),
-        "dept" : clss.dept,
-        "code" : clss.code,
-        "books" : list(clss.books) if clss.books else []
-    }
-
-
-def truncate(docid):
-    """
-    Truncate a document id to 12 digits
-    The document ID should be based on a
-    hash of unique identifiers
-    """
-    return int(str(docid)[0:12])
-
-def hashsec(course):
-    """
-    Hash a course into a usable id
-    """
-    if not course["code"]:
-        code = ""
-    else:
-        code = course["code"]
-    if not course["title"]:
-        title = ""
-    else:
-        title = course["title"]
-
-    if not course["sections"] or len(course["sections"]) < 1:
-        course["sections"][0] = ""
-
-    if not (code or title):
-        raise UnIndexable(course)
-
-    h = sha1()
-    h.update(code + title + course["sections"][0]["sem"])
-    return int(h.hexdigest(), 16)
-
-def createIndex(name):
-    """
-    This creates a new index in elasticsearch
-    An index is like a schema in a regular database
-    Create an elasticsearch index
-
-    """
-    indices = elasticsearch.client.IndicesClient(es)
-
-    print indices.create(name)
-    with open("./course.json", "r") as mapping:
-        print indices.put_mapping("course", loads(mapping.read()), name)
-
-def indexListing(course):
-    """
-    Index a specific course in the database (using the courses index)
-    example,
-    {
-        'books': [],
-        'dept': 'COLLAB',
-        'code': '2C03',
-        'sections': [
-            {
-                'prof': 'Lisa Pender',
-                'sem': '2015/09/08 - 2015/12/08',
-                'day': 'Mo'
-            },
-            {
-                'prof': 'Staff',
-                'sem': '2015/09/08 - 2015/12/08',
-                'day': 'Th'
-            }
-        ],
-        'title': 'COLLAB 2C03 - Sociology I'
-    }
-
-    """
-    courseID = hashsec(course)
-    print es.index(index="oersearch",
-                   doc_type="course",
-                   id=courseID,
-                   body=course)
-
-    # For every course we index, we also create a resource for it
-    # This should be an idempotent operation because we're putting it in couchdb
-    # And we're using the id obtained from the hash function, so it should just update the document
-    # no need to delete anything
-    #try:
-        #courseDept = course[0]["title"].strip().split(" ")[0].strip()
-        #courseCode = course[0]["title"].strip().split(" ")[1].strip()
-        #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
-        #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
-    #except:
-        #print "Couldn't create the resource associated with %s" % course
-
-def termSearch(field):
-    """
-    Make a term search (exact match)
-    """
-    def t(term):
-        q = Q("term",
-              **{
-                  "sections."+field : term
-              })
-        return q
-    return t
-
-def search(field):
-    """
-    Make a match search
-    """
-    def s(term):
-        q = Q("match",
-              **{
-                  field : term
-              })
-        return q
-    return s
-
-def join(x, y):
-    """
-    Join two queries
-    """
-    return x & y
-
-def filterSections(secs):
-    """
-    Get rid of tutorial sections
-    because they almost always have "Staff" as the instructor
-    This is just a heuristic of course
-    """
-    filtered = [s for s in secs.sections if "Staff" not in s.prof]
-    if len(filtered) > 0:
-        return filtered
-    return False
-
-def searchTerms(terms):
-    """
-    Run a search for courses
-    """
-
-    # A list of all the queries we want to run
-    qs = [searchers[field](term) for
-          field, term in
-          terms.iteritems() if
-          term and searchers.has_key(field)]
-
-    if not qs:
-        # No queries = no results
-        return dumps([])
-
-    # Reduce joins all of the queries into one query
-    # It will search for the conjunction of all of them
-    # So that means it cares about each query equally
-    q = reduce(join, qs)
-
-    s = (Search(using=es, index="oersearch")
-         .query(q))[0:100]  # only return up to 100 results for now
-
-    results = s.execute()
-
-    filtered = [
-        (secs, filterSections(secs)[0].to_dict())  # get rid of tutorials
-        for secs in results
-        if filterSections(secs)
-    ]
-    results = []
-    for obj, secs in filtered:
-        # Add the truncated course id
-        # This is used to point to the resource page for that course
-        secs["id"] = truncate(obj.meta.id)
-        secs["title"] = obj.title
-        if obj["dept"] not in secs["title"]:
-            secs["dept"] = obj.dept
-        if obj.books:
-            secs["books"] = [
-                {
-                    "booktitle" : summarize(book[0].encode("ASCII")),
-                    "bookauthor" : book[1].encode("ASCII"),
-                    "bookprice" : book[2].encode("ASCII")
-                }
-                for book in obj.books
-            ]
-        else:
-            secs["books"] = ""
-        results.append(secs)
-
-    return dumps(results)
-
-
-searchers = {
-    "title" : search("title"),
-    "loc" : search("loc"),
-    "time" : search("time"),
-    "prof" : search("prof"),
-    "day" : search("day"),
-}
-
-#print searchTerms({"title" : "PHILOS"})
-
-#for c in imap(classToJSON, allCourses()):
-    #try:
-        #print indexListing(c)
-    #except UnIndexable as e:
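searchTerms builds one elasticsearch_dsl query per search field and folds them into a
single conjunction with reduce(join, qs), so every term must match. A minimal sketch of
the same fold with hypothetical terms:

    from elasticsearch_dsl import Q

    qs = [Q("match", title="PHILOS"), Q("match", prof="Pender")]
    q = reduce(lambda x, y: x & y, qs)  # the conjunction searchTerms constructs
    print q.to_dict()                   # renders as a single bool query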
diff --git a/textbookExceptions.py b/textbookExceptions.py
deleted file mode 100644
index 999ff3e..0000000
--- a/textbookExceptions.py
+++ /dev/null
@@ -1,24 +0,0 @@
-#! /usr/bin/python2
-
-class UnIndexable(Exception):
-    def __init__(self, course):
-        self.course = course
-
-    @property
-    def reason(self):
-        course = self.course
-        if not course["code"] and not course["title"]:
-            message = "there was no course code and no title defined"
-        if not course["code"]:
-            message = "there was no course code defined"
-        if not course["title"]:
-            message = "there was no course title defined"
-        if not course["sections"]:
-            message = "there were no sections defined"
-        return """
-        There was a problem with indexing this course.
-        %s
-        There could be several reasons why, my best guess is that %s
-        We need at least the course code, title, and one or more sections to index
-
-        """ % (course, message)
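UnIndexable carries the offending course dict and formats a diagnosis lazily through its
reason property; hashsec in search.py raises it when a course has neither a code nor a
title. A minimal sketch of catching it:

    from textbookExceptions import UnIndexable

    course = {"code": "", "title": "", "sections": []}  # hypothetical unindexable course
    try:
        raise UnIndexable(course)
    except UnIndexable as e:
        print e.reason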
diff --git a/visualize.py b/visualize.py
deleted file mode 100755
index b46a67d..0000000
--- a/visualize.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#! /usr/bin/python2
-
-from json import loads, load
-from re import sub, split
-from itertools import groupby
-from numpy import mean
-from operator import attrgetter
-
-import pygal
-import csv
-
-class Textbook(object):
-    def __init__(self, dept, code, title, author, price):
-        self.dept = dept
-        self.code = code
-        self.title = title
-        self.author = author
-        self.price = float(price)
-
-    def __repr__(self):
-        return "Dept = %s, Code = %s, %s by %s, costs $%s" % (self.dept,
-                                                              self.code,
-                                                              self.title,
-                                                              self.author,
-                                                              self.price)
-
-
-def courses():
-    with open("./books.csv", "r") as books:
-        booksreader = csv.reader(books)
-        for row in booksreader:
-            yield row
-
-
-def groupDept(courselist):
-    sortedCourses = sorted(courselist, key=attrgetter("dept"))
-    for course in groupby(sortedCourses, attrgetter("dept")):
-        yield course[0], list(course[1])
-
-def meanPrice(books):
-    return mean([book.price for book in books])
-
-# Questions,
-# mean cost per department
-# mean cost per faculty
-# mean difference between book store copies and other copies per dept and faculty
-# number of overlapping books per faculty, do eng students benefit from that?
-
-# maybe a survey for students to see how often they buy books from other sources
-# correlate with how much they could be saving?
-
-facultyDesc = {
-    "hum" : "Humanities",
-    "bus" : "Business",
-    "hlth" : "Health Science",
-    "eng" : "Engineering",
-    "sci" : "Science",
-    "socsci" : "Social Sciences",
-    "artsci" : "Arts & Sciences",
-    "meld" : "MELD"
-}
-
-faculties = load(open("./faculties.json"))
-
-def categorize(dept):
-    # faculties
-    return facultyDesc.get(faculties.get(dept, False), False)
-
-def byFaculty():
-    for dept, books in groupDept(courses()):
-        yield (categorize(dept), dept, books)
-
-def meanFacultyCosts():
-    byfac = list(byFaculty())
-    graph = pygal.Bar()
-    graph.title = "Mean textbook cost by faculty"
-    sortedFacs = sorted(byfac, key=lambda x: x[0])
-    for fac in groupby(sortedFacs, lambda x: x[0]):
-        graph.add(fac[0], meanPrice(list(fac[1])[0][2]))
-    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
-    return graph.render(transpose=True)
-
-def meanCosts():
-    cs = groupDept(courses())
-    graph = pygal.Bar()
-    graph.title = "Mean textbook cost by department"
-    for c in cs:
-        dept, books = c
-        graph.add(dept, meanPrice(books))
-    #graph.render_to_file("./test_graph.svg")
-    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
-    return graph.render_table(style=True, transpose=True)
-
-for x in courses():
-    print x
-#print meanCosts()
-#print meanFacultyCosts()
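courses() yields raw CSV rows, but groupDept sorts with attrgetter("dept"), which only
works once rows are wrapped in Textbook objects. A minimal adapter sketch, assuming
books.csv columns run dept, code, title, author, price:

    def textbooks():
        # Hypothetical wrapper so groupDept's attribute-based grouping works.
        for dept, code, title, author, price in courses():
            yield Textbook(dept, code, title, author, price)

    for dept, books in groupDept(textbooks()):
        print dept, meanPrice(books)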
diff --git a/website.py b/website.py
deleted file mode 100755
index 1fc9374..0000000
--- a/website.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#! /usr/bin/python2
-from functools import partial
-from couchdb import ResourceConflict
-
-from flask import Flask, render_template, flash, request, send_from_directory
-from flask_bootstrap import Bootstrap
-from flask_appconfig import AppConfig
-from urllib import unquote
-from search import searchTerms
-
-from openlibrary import bookUrls
-
-from archive import searchIA
-from urllib import quote, unquote
-from json import dumps, loads
-
-from werkzeug.contrib.cache import MemcachedCache
-cache = MemcachedCache(['127.0.0.1:11211'])
-
-import os
-
-def predict(fieldtype, term):
-    print fieldtype
-    print term
-    if not term:
-        return "[]"
-    else:
-        try:
-            cs = completers[fieldtype](term.lower())
-        except KeyError:
-            return "[]"
-        if cs:
-            return cs
-        return "[]"
-
-def predictor(fieldtype):
-    def inner(request):
-        params = dict(request.args.items())
-        return predict(fieldtype, params["term"])
-    return inner
-
-def cacheit(key, thunk):
-    """
-    Tries to find a cached version of ``key''
-    If there is no cached version then it will
-    evaluate thunk (which must be a generator)
-    and cache that, then return the result
-    """
-    cached = cache.get(quote(key))
-    if cached is None:
-        result = list(thunk())
-        cache.set(quote(key), result)
-        return result
-    return cached
-
-def ClassSearch(configfile=None):
-    defaults = {"Day", "Building", "Exact Location", "Department"}
-    app = Flask(__name__)
-    AppConfig(app, configfile)  # Flask-Appconfig is not necessary, but
-                                # highly recommend =)
-                                # https://github.com/mbr/flask-appconfig
-    Bootstrap(app)
-
-    app.config["scripts"] = "/home/wes/MGOAL/scripts"
-    app.config["styles"] = "/home/wes/MGOAL/styles"
-
-    @app.route('/favicon.ico')
-    def favicon():
-        return send_from_directory("/srv/http/goal/favicon.ico",
-                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')
-
-
-    @app.route("/buildpred", methods=("GET", "POST"))
-    def buildpred():
-        return predictbuild(request)
-
-    @app.route("/locpred", methods=("GET", "POST"))
-    def locpred():
-        return predictloc(request)
-
-    @app.route("/daypred", methods=("GET", "POST"))
-    def daypred():
-        return predictday(request)
-
-    @app.route("/deptpred", methods=("GET", "POST"))
-    def deptpred():
-        return predictdept(request)
-
-    @app.route("/titlepred", methods=("GET", "POST"))
-    def titlepred():
-        return predicttitle(request)
-
-    @app.route("/", methods=("GET", "POST"))
-    def index():
-        return render_template("search.html")
-
-    @app.route("/fc", methods=("GET", "POST"))
-    def fc():
-        """ Filter Courses """
-        print "trying to get courses"
-        params = dict(request.args.items())
-        for key, val in params.iteritems():
-            if val in defaults:
-                del params[key]
-        results = searchTerms(params)
-        return results
-
-    @app.route("/resources", methods=("GET", "POST"))
-    def resources():
-        """ Get Resources """
-        notRequired = False
-        params = loads(dict(request.args.items())["data"])
-        print params
-        author = params["author"]
-        title = params["title"]
-
-        if ("No Textbooks" in title or
-            "No Adoption" in title):
-            return dumps("false")
-
-        # Cache the result of the open library search
-        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
-        print openlib
-
-        # cache the result of an internet archive search
-        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
-        print iarchive
-
-        if not (any(openlib) or any(iarchive)):
-            # We literally could not find ANYTHING
-            return dumps("false")
-
-        return dumps({
-            "iarchive" : iarchive,
-            "openlib" : openlib
-        })
-
-    @app.route("/scripts/<path:filename>")
-    def send_script(filename):
-        return send_from_directory(app.config["scripts"], filename)
-
-    @app.route("/styles/<path:filename>")
-    def send_style(filename):
-        return send_from_directory(app.config["styles"], filename)
-    return app
-
-if __name__ == "__main__":
-    ClassSearch().run(port=8001, debug=True)
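ClassSearch is an application factory, so besides the development server started above,
the /fc route can be exercised directly over HTTP. A minimal sketch, assuming the app is
running locally on port 8001 as configured:

    import requests

    # /fc forwards its query parameters to searchTerms and returns JSON.
    resp = requests.get("http://localhost:8001/fc", params={"title": "PHILOS"})
    print resp.json()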