From a225b948329fda1e6216442322fa2de078bb4c04 Mon Sep 17 00:00:00 2001
From: wes
Date: Mon, 20 Jun 2016 04:23:19 -0400
Subject: [PATCH] Revert "Explain the purpose of the Racket files"

This reverts commit dc1d0c87b28eacb1fdf9866e74c74e91d2c565de.
---
 archive.py                |  34 ++++
 course_mapping.rkt        |   6 -
 database.py               |  62 +++++++
 goasearch.py              |  14 ++
 mcmaster/__init__.py      |   0
 mcmaster/classes.py       | 349 ++++++++++++++++++++++++++++++++++++++
 mcmaster/site.py          |   9 +
 mcmaster/sylla.py         | 117 +++++++++++++
 openlibrary.py            |  24 +++
 predictions.py            | 153 +++++++++++++++++
 schemadsl.rkt             |   3 +-
 search.py                 | 237 ++++++++++++++++++++++++++
 src/archive.py            |  34 ++++
 src/database.py           |  62 +++++++
 src/goasearch.py          |  14 ++
 src/openlibrary.py        |  24 +++
 src/predictions.py        | 153 +++++++++++++++++
 src/search.py             | 237 ++++++++++++++++++++++++++
 src/textbookExceptions.py |  24 +++
 src/visualize.py          |  97 +++++++++++
 src/website.py            | 148 ++++++++++++++++
 textbookExceptions.py     |  24 +++
 visualize.py              |  97 +++++++++++
 website.py                | 148 ++++++++++++++++
 24 files changed, 2062 insertions(+), 8 deletions(-)
 create mode 100755 archive.py
 create mode 100755 database.py
 create mode 100755 goasearch.py
 create mode 100644 mcmaster/__init__.py
 create mode 100755 mcmaster/classes.py
 create mode 100644 mcmaster/site.py
 create mode 100755 mcmaster/sylla.py
 create mode 100755 openlibrary.py
 create mode 100755 predictions.py
 create mode 100755 search.py
 create mode 100755 src/archive.py
 create mode 100755 src/database.py
 create mode 100755 src/goasearch.py
 create mode 100755 src/openlibrary.py
 create mode 100755 src/predictions.py
 create mode 100755 src/search.py
 create mode 100644 src/textbookExceptions.py
 create mode 100755 src/visualize.py
 create mode 100755 src/website.py
 create mode 100644 textbookExceptions.py
 create mode 100755 visualize.py
 create mode 100755 website.py

diff --git a/archive.py b/archive.py
new file mode 100755
index 0000000..73fcde7
--- /dev/null
+++ b/archive.py
@@ -0,0 +1,34 @@
+#! /usr/bin/python2
+
+from urllib import quote
+from json import loads, dumps
+
+import requests as req
+
+searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw"
+
+def searchIA(title, author):
+    """
+    Do a search on The Internet Archive for a book
+    """
+    print "running a search"
+    requrl = searchUrl.format(quote(title + " " + author))
+    try:
+        results = loads(req.get(requrl).text[9:][0:-1]) # strip the "callback(...)" JSONP wrapper before parsing
+    except ValueError:
+        return []
+
+    rownum = results["response"]["numFound"] # the actual hit count; params["rows"] is only the requested page size
+    if rownum < 1:
+        print "Couldn't find results for %s %s" % (title, author)
+        return []
+    docs = results["response"]["docs"]
+    urls = []
+    for result in docs[0:3]:
+        urls.append("https://archive.org/details/%s" % result["identifier"])
+    return urls
+
+
+# Example, search for David Hume's Enquiry Concerning Human Understanding
+#for url in searchIA("Enquiry Concerning Human Understanding", "Hume"):
+    #print url
diff --git a/course_mapping.rkt b/course_mapping.rkt
index b08b666..65a1e2e 100755
--- a/course_mapping.rkt
+++ b/course_mapping.rkt
@@ -1,11 +1,5 @@
 #! /usr/bin/racket
 #lang racket
-; This file is used to generate the mapping for elasticsearch
-; It is written in Racket (a dialect of Scheme)
-; It will not be necessary to run unless you want to change the elasticsearch mapping
-; This may be necessary if you have fields you want to add, or need some other customization
-; You may also edit the JSON mapping directly, or use whatever tool you want to edit the mapping with
-
 (require "schemadsl.rkt")
 
 (displayln
diff --git a/database.py b/database.py
new file mode 100755
index 0000000..a19272c
--- /dev/null
+++ b/database.py
@@ -0,0 +1,62 @@
+#! /usr/bin/python2
+
+from couchdb import ResourceConflict
+# NOTE: localdb, the couchdb database handle used below, is assumed to be initialized elsewhere
+
+def truncate(docid):
+    """
+    Truncate a document id to 12 digits
+    The document ID should be based on a
+    hash of unique identifiers
+    """
+    return int(str(docid)[0:12])
+
+def createResource(textbookInfo, course, dept, coursecode, docid):
+    """
+    Create a document associated with a course
+    This document contains any/all resources associated
+    with that course
+
+    example,
+      {
+        'books': [],
+        'dept': 'COLLAB',
+        'code': '2C03',
+        'sections': [
+            {
+             'prof': 'Lisa Pender',
+             'sem': '2015/09/08 - 2015/12/08',
+             'day': 'Mo'
+            },
+            {
+             'prof': 'Staff',
+             'sem': '2015/09/08 - 2015/12/08',
+             'day': 'Th'
+            }
+        ],
+        'title': 'COLLAB 2C03 - Sociology I'
+      }
+    """
+    textbooks = textbookInfo(dept.strip(), coursecode.strip())
+
+    # We truncate the id so we can have nicer looking URLs
+    # Since the id will be used to point to the resource page for that course
+    _id = str(truncate(docid))
+
+    fields = {
+        "_id" : _id,
+        "textbooks" : textbooks,
+        "coursetitle" : "%s %s" % (dept.strip(), coursecode.strip()),
+        "courseinfo" : course
+        #"Syllabus" : "blah"
+    }
+    try:
+        revisions = list(localdb.revisions(_id))
+        if not revisions:
+            return localdb.save(fields)
+        else:
+            rev = dict(revisions[0])["_rev"]
+            fields["_rev"] = rev
+            return localdb.save(fields)
+    except ResourceConflict:
+        print "Resource for %s already exists, not creating a new one" % (docid)
diff --git a/goasearch.py b/goasearch.py
new file mode 100755
index 0000000..3dca7eb
--- /dev/null
+++ b/goasearch.py
@@ -0,0 +1,14 @@
+#! /usr/bin/python2
+
+# predictive data
+# switch to elasticsearch's prediction
+
+
+
+import database
+import predictions
+
+class GOASearch(object):
+    def __init__(self):
+        pass  # __init__ must return None; "return self" raises a TypeError at instantiation
+
diff --git a/mcmaster/__init__.py b/mcmaster/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mcmaster/classes.py b/mcmaster/classes.py
new file mode 100755
index 0000000..54687df
--- /dev/null
+++ b/mcmaster/classes.py
@@ -0,0 +1,349 @@
+#!
/usr/bin/python2 + +from sys import argv +from itertools import chain, islice, izip as zip +from re import search, sub +from functools import total_ordering + +from sylla import textbookInfo +from collections import MutableMapping + +import datetime as dt +import lxml.html as lxh +import requests +import sys +import copy + +fall = "2159" +spring_summer = "2165" +winter = "2161" + +# threading stuff +import Queue as q +import threading as thd + +baseurl = "https://applicants.mcmaster.ca/psp/prepprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL" + +searchurl = "https://csprd.mcmaster.ca/psc/prcsprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL" + +custom_headers = { + "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0", + "Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8", + } + +courseCodes1 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_SUBJ_SRCH%240&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=5tq9x%2Fjt42mf62Sh5z%2BrjxT0gT15kiIyQ2cecCSmRB4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}" + +courseCodes2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=SSR_CLSRCH_WRK2_SSR_ALPHANUM_{1}&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=vIUgl6ZXw045S07EPbQw4RDzv7NmKCDdJFdT4CTRQNM%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={2}" + +payload2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=%23ICSave&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}" + +payload = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&SSR_CLSRCH_WRK_SUBJECT$75$$0={1}&CLASS_SRCH_WRK2_STRM$45$={2}" + + +year = dt.date.today().year +month = dt.date.today().month + +days = { + "Mo" : 0, + "Tu" : 1, + "We" : 2, + "Th" : 3, + "Fr" : 4, + "Sa" : 5, + "Su" : 6 + } + +day_descs = { + "Mo" : "Monday Mon Mo", + "Tu" : "Tuesday Tues Tu Tue", + "We" : "Wednesday Wed We", + "Th" : "Thursday Th Thurs", + "Fr" : "Friday Fr Fri", + "Sa" : "Saturday Sat Sa", + "Su" : "Sunday Su Sun", + "T" : "TBA" + } + +def timeparse(time): + """ + Parse the time into numbers + """ + if len(time) == 7: + hour = int(time[0:2]) + minutes = int(time[3:5]) + half = time[5:7] + else: + hour = int(time[0]) + minutes = int(time[2:4]) + half = time[4:6] + if half == "PM": + if hour < 12: + hour = hour + 12 + + return (str(hour), str(minutes), half) + +class Class(object): + def __init__(self, dept, title, sections): + self.title = title.encode("UTF-8") + self.sections = sections + self.dept = dept + + def __repr__(self): + return repr((self.title, self.sections)) + + def __iter__(self): + return iter((self.title, sec) for sec in self.sections) + + def hasCode(self): + splitted = 
self.title.strip().split(" ") + return ((len(splitted) >= 2) and + (splitted[0].upper() == splitted[0]) and + (splitted[1].upper() == splitted[1])) + + @property + def code(self): + if self.hasCode(): + return self.title.strip().split(" ")[1].strip() + return False + + @property + def books(self): + if self.dept and self.code: + return textbookInfo(self.dept, self.code, withPrices=True) + return False + +@total_ordering +class Section(dict): + def __init__(self, time, loc, prof, sem): + self.time = time.encode("UTF-8") + self.loc = loc.encode("UTF-8") + self.prof = prof.encode("UTF-8") + self.sem = sem.encode("UTF-8") + self._date = False + self._day = False + + @property + def date(self): + if self.time != "TBA": + day, start, _, end = self.time.split() + + if self._day: + assert len(self._day) == 2 + day = self._day + else: + day = [day[n:n+2] for n in xrange(0, len(day)-1, 2)] + + self._date = (day, timeparse(start), timeparse(end)) + + return self._date + + return self.time + + @property + def day(self): + return self.date[0] + + @property + def start(self): + return self.date[1][0] + self.date[1][1] + + def __repr__(self): + return (""" + Time = %s, Location = %s, Instructor = %s, Semester Running = %s + """ % (self.date, self.loc, self.prof, self.sem)) + def __gt__(self, x): + if isinstance(self.day, list): + raise NotImplementedError + + if (self.date == "TBA" or + x.date == "TBA"): + return False + + return ((days[self.day] > days[x.day]) or + ((self.day == x.day) and + (self.start > x.start))) + + def __eq__(self, x): + return (x.date == self.date and + x.prof == self.prof and + x.loc == self.loc and + x.sem == self.sem) + + +def getStateNum(html): + """ + Get the state num from Mosaic + This is unique to each requester + """ + parsed = lxh.fromstring(html) + return parsed.xpath(".//input[@name=\"ICStateNum\"]")[0].value + +def parseSection(section): + cols = section.xpath(".//td") + assert len(cols) == 4 + time, loc, prof, sem = [col.text_content().encode("UTF-8").strip() for col in cols] + + classinfo = Section(time, loc, prof, sem) + return classinfo + +def getSectionInfo(table): + trs = table.xpath(".//tr") + for tr in trs: + if tr.xpath("@id") and search(r"SSR_CLSRCH", tr.xpath("@id")[0]): + yield parseSection(tr) + +def parseColumns(subject, html): + parsed = lxh.fromstring(html) + + classInfo = (list(getSectionInfo(table)) for table in + islice((table for table in parsed.xpath(".//table") + if table.xpath("@id") and + search(r"ICField[0-9]+\$scroll", table.xpath("@id")[0])), 1, sys.maxint)) + + classNames = ((subject, span.text_content().strip()) for span in parsed.xpath(".//span") + if span.xpath("@id") and + search(r"DERIVED_CLSRCH_DESCR", span.xpath("@id")[0])) + + return zip(classNames, classInfo) + +def getCodes(html): + parsed = lxh.fromstring(html) + + return (code.text_content().encode("UTF-8") for code in + parsed.xpath("//span") + if code.xpath("@id") and + search(r"SSR_CLSRCH_SUBJ_SUBJECT\$[0-9]+", code.xpath("@id")[0])) + +class MosReq(object): + def __init__(self, semester): + self.semester = semester + s = requests.Session() + resp = s.get(baseurl, allow_redirects=True, headers=custom_headers).content + + # Let the server set some cookies before doing the searching + cookies = {} + for key, val in s.cookies.iteritems(): + cookies[key] = val + self.cookies = cookies + self.statenum = False + self.codes_ = [] + + def getlist(self, subject): + sys.stderr.write("Getting " + subject + "\n") + first_req = requests.get(searchurl, cookies=self.cookies).content + # for 
some reason Mosaic wants us to request it twice, ?????????????????? + self.statenum = getStateNum(first_req) + first_req = requests.post(searchurl, + data=payload.format(self.statenum, subject, self.semester), + cookies=self.cookies, + allow_redirects=False, + headers=custom_headers).content + # we make a first request to get the ICStateNum in case it thinks there are too many results + try: + self.statenum = getStateNum(first_req) + except IndexError: + pass + if "Your search will return over" in first_req: + + return requests.post(searchurl, + data=payload2.format(self.statenum, self.semester), + cookies=self.cookies, + allow_redirects=False, + headers=custom_headers).content + else: + return first_req + + def classes(self, subject): + return list(parseColumns(subject, self.getlist(subject))) + + def getCodes(self, letter): + sys.stderr.write("Getting letter " + letter + "\n") + first_req = requests.get(searchurl, cookies=self.cookies).content + self.statenum = getStateNum(first_req) + + self.statenum = getStateNum(requests.post(searchurl, + data=courseCodes1.format(self.statenum, self.semester), + cookies=self.cookies, + headers=custom_headers).content) + + return getCodes(requests.post(searchurl, + data=courseCodes2.format(self.statenum, letter, self.semester), + cookies=self.cookies, + allow_redirects=False, + headers=custom_headers).content) + @property + def codes(self): + if not self.codes_: + self.codes_ = list(chain.from_iterable( + map((lambda l: + self.getCodes(chr(l))), + xrange(65, 91)))) + return self.codes_ + +def request(codes, lists, semester): + requester = MosReq(semester) + while not codes.empty(): + code = codes.get() + try: + lists.put(requester.classes(code)) + except: + codes.task_done() + return + codes.task_done() + + +class CourseInfo(object): + def __init__(self, threadcount, semester): + self._codes = False + self.threadcount = threadcount + self.semester = semester + + @property + def codes(self): + if not self._codes: + req = MosReq(self.semester) + self._codes = req.codes + return self._codes + + def classes(self): + qcodes = q.Queue() + for code in self.codes: + qcodes.put(code) + lists = q.Queue() + threads = [] + thread = None + for i in xrange(self.threadcount): + thread = thd.Thread(group=None, target=request, args=(qcodes, lists, self.semester)) + threads.append(thread) + thread.start() + qcodes.join() + for t in threads: + t.join() + + sections = [] + while not lists.empty(): + sections.append(lists.get()) + + for cl in chain.from_iterable(sections): + new_sections = [] + for sec in cl[1]: + if len(sec.day) > 1: + for day in sec.day: + new_sections.append(copy.deepcopy(sec)) + new_sections[-1]._day = day + else: + sec._day = sec.day[0] + new_sections.append(sec) + yield Class(cl[0][0], sub("\xa0+", "", cl[0][1]), sorted(new_sections)) + +def getCourses(semester, threadcount=10): + return CourseInfo(threadcount, semester).classes() + +def allCourses(): + return chain.from_iterable( + (getCourses(sem, threadcount=10) + for sem in (fall, winter, spring_summer))) + +#for course in allCourses(): + #sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, course.books)) + #print course.sections diff --git a/mcmaster/site.py b/mcmaster/site.py new file mode 100644 index 0000000..42c07aa --- /dev/null +++ b/mcmaster/site.py @@ -0,0 +1,9 @@ +from oersearch import Search +from classes import getCourses +from sylla import getTextbooks + +mcmasterSearch = Search("McMaster") + +mcmasterSearch.setup(getCourses) + +mcmasterSearch.run() diff 
--git a/mcmaster/sylla.py b/mcmaster/sylla.py
new file mode 100755
index 0000000..6347e70
--- /dev/null
+++ b/mcmaster/sylla.py
@@ -0,0 +1,117 @@
+#! /usr/bin/python2
+
+from sys import argv
+from itertools import chain, islice, izip_longest, izip as zip
+from re import search, sub
+from functools import total_ordering
+from re import sub
+
+import datetime as dt
+import lxml.html as lxh
+import requests
+
+# The purpose of this module is to download and parse syllabi from various departments
+# so that they can be correlated with individual courses
+
+class Price(object):
+    def __init__(self, amnt, status):
+        self.dollars = float(amnt[1:])
+        self.status = status
+
+    def __repr__(self):
+        return "$%s %s" % (repr(self.dollars), self.status)
+
+
+class Book(object):
+    def __init__(self, title, price):
+        self.title = title
+        self.price = price
+
+    def __repr__(self):
+        return '["%s", "%s"]' % (self.title, repr(self.price))
+
+
+def grouper(n, iterable, fillvalue=None):
+    "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
+    args = [iter(iterable)] * n
+    return izip_longest(fillvalue=fillvalue, *args)
+
+searchUrl = "https://campusstore.mcmaster.ca/cgi-mcm/ws/txsub.pl?wsDEPTG1=%s&wsDEPTDESC1=&wsCOURSEG1=%s&crit_cnt=1"
+
+def normalize(word):
+    if len(word) > 1:
+        return ("%s%s" %
+                (word[0].upper(),
+                 "".join(word[1:]).lower()))
+    return word
+
+def parseAuthor(author):
+    split = author.split(" ")
+    if len(split) <= 1:
+        return author
+    lastname = split[0]
+    firstname = split[1]
+    return "%s %s" % (firstname, lastname)
+
+def normwords(phrase):
+    words = phrase.split(" ")
+    return " ".join(map(normalize, words))
+
+def books(dept, code, withPrices):
+    """
+    Snatch me up a book title or three
+    """
+    req = searchUrl % (dept, code)
+
+    html = requests.get(req).text
+
+    parsed = lxh.fromstring(html)
+
+    pricelist = prices(parsed)
+
+    for div in parsed.xpath(".//div"):
+        if (div.attrib.has_key("id") and
+            "prodDesc" in div.attrib["id"]):
+
+            textbook = div.text_content()
+            author = sub(r',', '',
+                         "".join(
+                             (div.getparent()
+                              .xpath(".//span[@class='inline']")
+                              [0].text_content()
+                              .split(":")[1:])).strip())
+            price = pricelist.pop()
+            if withPrices:
+                yield (normwords(textbook), normwords(author), repr(price))
+            else:
+                yield (normwords(textbook), normwords(author))
+
+def prices(html):
+    """
+    Get the prices from a search result page
+    """
+    ps = [
+        p.getparent().text_content().split()[0]
+        for p in html.xpath("//p/input[@type='checkbox']")
+    ]
+
+    try:
+        amts, stats = zip(*list(reversed(list(grouper(2, ps)))))
+        return map(Price, amts, stats)
+    except ValueError:
+        return []
+
+def textbookInfo(dept, code, withPrices=False):
+    """
+    Return all the textbooks for a course
+    """
+    return list(books(dept, code, withPrices))
+
+def humanities():
+    """
+    Download humanities syllabi
+    """
+    return []
+
+# Example, getting the textbook info for Personality Theory (PSYCH = Department, 2B03 = Course code)
+# print list(textbookInfo("PSYCH", "2B03"))
diff --git a/openlibrary.py b/openlibrary.py
new file mode 100755
index 0000000..d558c21
--- /dev/null
+++ b/openlibrary.py
@@ -0,0 +1,24 @@
+#!
/usr/bin/python2 + +from urllib import quote +from json import loads, dumps + +import requests as req + +#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s" +searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s' + +def bookUrls(title, author): + print title, author + if ":" in title: + title = title.split(":")[0] + requrl = searchurl % (quote(author), quote(title)) + results = loads(req.get(requrl).text) + for result in results["docs"][0:2]: + if result.has_key("edition_key"): + yield "https://openlibrary.org/books/%s" % result["edition_key"][0] + +# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle' + +#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"): + #print book diff --git a/predictions.py b/predictions.py new file mode 100755 index 0000000..b770a0b --- /dev/null +++ b/predictions.py @@ -0,0 +1,153 @@ +##! /usr/bin/python2 +from itertools import groupby, chain +from sys import stdout +from functools import partial +from json import dumps + +def gensymer(): + n = [0] + def inner(): + result = str(n[0]) + n[0] += 1 + return result + return inner + +gensym = gensymer() + +def printTrie(graph, prev, trie, weight): + new_node = str(gensym()) + graph.node(new_node, "%s" % trie.letter) + graph.edge(prev, new_node, label="%.2f" % weight) + if not trie.children: + return + for child, weight in zip(trie.children, trie.ws): + printTrie(graph, new_node, child, weight) + + +class Trie(object): + def __init__(self, letter, children, ws): + self.letter = letter + self.children = children + self.ws = ws + +def probweight(suffixes): + weights = [float(s["value"]) for s in suffixes] + s = float(sum(weights)) + ws = [w/s for w in weights] + return ws + +def buildtrie(trie, suffixes): + """ + Build a trie, also known as a prefix tree, of all the possible completions + """ + trie.children = [] + for letter, suffs in suffixes: + ped = partition(suffs) + if any(map(lambda p: p[0], ped)): + # check if there are any children + trie.children.append(buildtrie(Trie(letter, [], probweight(suffs)), partition(suffs))) + else: + # we've reached the end of this word so just include the final letter + # [1] = there is a probability of 1 of reaching this single leaf node, + # since it is the only possible completion here + trie.children.append(Trie(letter, [], [1])) + return trie + + +def keyf(x): + if not x["key"]: + return "" + return x["key"][0] + +def tails(words): + for word in words: + yield { + "key" : word["key"][1:], + "value" : word["value"] + } + +def partition(words): + """ + Partition the words into different prefixes based on the first character + """ + groups = [ + (g[0], list(tails(g[1]))) + for g in groupby( + sorted(words, key=keyf), + key=keyf) + ] + return groups + + +def flatten_helper(letter, trie): + return ([letter + child.letter for + child in trie.children], trie.children) + +def flatten(trie): + if not trie.children: + return trie.letter + prefixes, suffixes = flatten_helper(trie.letter, trie) + return [flatten(Trie(p, s2.children, s2.ws)) for p, s2 in zip(prefixes, suffixes)] + +def flattenlist(xs): + locs = [] + for x in xs: + if not isinstance(x, list): + locs.append(x) + else: + locs.extend(flattenlist(x)) + return locs + +def matchc(trie, prefix): + c = None + if len(prefix) > 1: + c = prefix[0] + else: + c = prefix + return [ch for ch in trie.children if ch.letter == c] + +def match(trie, word): + if not word: + return [] + m = matchc(trie, word[0]) + if not m: + return [] + else: + return [m[0]] 
+ match(m[0], word[1:]) + +def complete(trie, word): + m = match(trie, word) + if len(word) != len(m): + return False + completions = [word+x[1:] for x in flattenlist(flatten(m[-1]))] + if len(completions) > 10: + return dumps(completions[0:10]) + return dumps(completions) + +def sortTrie(trie): + """ + Sort the children of each node in descending order + of the probability that each child would be the completion + of whatever that word is + """ + if not trie.children: + return + sortedChilds = sorted(zip(trie.children, trie.ws), key=lambda x: x[1], reverse=True) + trie.children = [x[0] for x in sortedChilds] + trie.ws = [x[1] for x in sortedChilds] + for child in trie.children: + sortTrie(child) + +def toTrie(words): + for word in words: + word["key"] = word["key"].lower() + trie = buildtrie(Trie("", [], [1]), partition(words)) + trie.ws = [1]*len(trie.children) + sortTrie(trie) + return trie + +def testkey(w): + return { + "key" : w, + "value" : "1" + } diff --git a/schemadsl.rkt b/schemadsl.rkt index 8308b40..667f371 100644 --- a/schemadsl.rkt +++ b/schemadsl.rkt @@ -1,6 +1,5 @@ #lang racket -; This file is used to generate the elasticsearch mapping -; It is written in Racket (a dialect of Scheme) + (require json) (define (root name type) diff --git a/search.py b/search.py new file mode 100755 index 0000000..777222f --- /dev/null +++ b/search.py @@ -0,0 +1,237 @@ +#! /usr/bin/python2 + +import elasticsearch + +from elasticsearch_dsl import FacetedSearch, Search, Q +from elasticsearch_dsl.aggs import Terms, DateHistogram +from sys import exit, stderr +from json import dumps, loads +from itertools import chain, imap + +from hashlib import sha1 + +from textbookExceptions import UnIndexable + +from mcmaster.classes import allCourses + +# Generic instance of elasticsearch right now +es = elasticsearch.Elasticsearch() + +def summarize(text): + splitted = text.split(" ") + if len(splitted) > 4: + return " ".join(splitted[0:4]) + ".." 
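+    # anything of four words or fewer falls through and is returned unchanged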
+ return text + +def sectionToJSON(section): + return { + "prof" : section.prof, + "sem" : section.sem, + "day" : section.day + } + +def classToJSON(clss): + return { + "title" : clss.title, + "sections" : map(sectionToJSON, clss.sections), + "dept" : clss.dept, + "code" : clss.code, + "books" : list(clss.books) if clss.books else [] + } + + +def truncate(docid): + """ + Truncate a document id to 12 digits + The document ID should be based on a + hash of unique identifiers + """ + return int(str(docid)[0:12]) + +def hashsec(course): + """ + Hash a course into a usable id + """ + if not course["code"]: + code = "" + else: + code = course["code"] + if not course["title"]: + title = "" + else: + title = course["title"] + + if not course["sections"] or len(course["sections"]) < 1: + course["sections"][0] = "" + + if not (code or title): + raise UnIndexable(course) + + h = sha1() + h.update(code + title + course["sections"][0]["sem"]) + return int(h.hexdigest(), 16) + +def createIndex(name): + """ + This creates a new index in elasticsearch + An index is like a schema in a regular database + Create an elasticsearch index + + """ + indices = elasticsearch.client.IndicesClient(es) + + print indices.create(name) + with open("./course.json", "r") as mapping: + print indices.put_mapping("course", loads(mapping.read()), name) + +def indexListing(course): + """ + Index a specific course in the database (using the courses index) + example, + { + 'books': [], + 'dept': 'COLLAB', + 'code': '2C03', + 'sections': [ + { + 'prof': 'Lisa Pender', + 'sem': '2015/09/08 - 2015/12/08', + 'day': 'Mo' + }, + { + 'prof': 'Staff', + 'sem': '2015/09/08 - 2015/12/08', + 'day': 'Th' + } + ], + 'title': 'COLLAB 2C03 - Sociology I' + } + + """ + courseID = hashsec(course) + print es.index(index="oersearch", + doc_type="course", + id=courseID, + body=course) + + # For every course we index, we also create a resource for it + # This should be an idempotent operation because we're putting it in couchdb + # And we're using the id obtained from the hash function, so it should just update the document + # no need to delete anything + #try: + #courseDept = course[0]["title"].strip().split(" ")[0].strip() + #courseCode = course[0]["title"].strip().split(" ")[1].strip() + #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode) + #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID) + #except: + #print "Couldn't create the resource associated with %s" % course + +def termSearch(field): + """ + Make a term search (exact match) + """ + def t(term): + q = Q("term", + **{ + "sections."+field : term + }) + return q + return t + +def search(field): + """ + Make a match search + """ + def s(term): + q = Q("match", + **{ + field : term + }) + return q + return s + +def join(x, y): + """ + Join two queries + """ + return x & y + +def filterSections(secs): + """ + Get rid of tutorial sections + because they almost always have "Staff" as the instructor + This is just a heuristic of course + """ + filtered = [s for s in secs.sections if "Staff" not in s.prof] + if len(filtered) > 0: + return filtered + return False + +def searchTerms(terms): + """ + Run a search for courses + """ + + # A list of all the queries we want to run + qs = [searchers[field](term) for + field, term in + terms.iteritems() if + term and searchers.has_key(field)] + + if not qs: + # No queries = no results + return dumps([]) + + # Reduce joins all of the queries into one query + # It will search for the conjunction of all of 
them
+    # So that means it cares about each query equally
+    q = reduce(join, qs)
+
+    s = (Search(using=es, index="oersearch")
+        .query(q))[0:100] # only return up to 100 results for now
+
+    results = s.execute()
+
+    filtered = [
+        (secs, filterSections(secs)[0].to_dict()) # get rid of tutorials
+        for secs in results
+        if filterSections(secs)
+    ]
+    results = []
+    for obj, secs in filtered:
+        # Add the truncated course id
+        # This is used to point to the resource page for that course
+        secs["id"] = truncate(obj.meta.id)
+        secs["title"] = obj.title
+        if obj["dept"] not in secs["title"]:
+            secs["dept"] = obj.dept
+        if obj.books:
+            secs["books"] = [
+                {
+                    "booktitle" : summarize(book[0].encode("ASCII")),
+                    "bookauthor" : book[1].encode("ASCII"),
+                    "bookprice" : book[2].encode("ASCII")
+                }
+                for book in obj.books
+            ]
+        else:
+            secs["books"] = ""
+        results.append(secs)
+
+    return dumps(results)
+
+
+searchers = {
+    "title" : search("title"),
+    "loc" : search("loc"),
+    "time" : search("time"),
+    "prof" : search("prof"),
+    "day" : search("day"),
+    }
+
+#print searchTerms({"title" : "PHILOS"})
+
+#for c in imap(classToJSON, allCourses()):
+    #try:
+        #print indexListing(c)
+    #except UnIndexable as e:
diff --git a/src/archive.py b/src/archive.py
new file mode 100755
index 0000000..73fcde7
--- /dev/null
+++ b/src/archive.py
@@ -0,0 +1,34 @@
+#! /usr/bin/python2
+
+from urllib import quote
+from json import loads, dumps
+
+import requests as req
+
+searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw"
+
+def searchIA(title, author):
+    """
+    Do a search on The Internet Archive for a book
+    """
+    print "running a search"
+    requrl = searchUrl.format(quote(title + " " + author))
+    try:
+        results = loads(req.get(requrl).text[9:][0:-1]) # strip the "callback(...)" JSONP wrapper before parsing
+    except ValueError:
+        return []
+
+    rownum = results["response"]["numFound"] # the actual hit count; params["rows"] is only the requested page size
+    if rownum < 1:
+        print "Couldn't find results for %s %s" % (title, author)
+        return []
+    docs = results["response"]["docs"]
+    urls = []
+    for result in docs[0:3]:
+        urls.append("https://archive.org/details/%s" % result["identifier"])
+    return urls
+
+
+# Example, search for David Hume's Enquiry Concerning Human Understanding
+#for url in searchIA("Enquiry Concerning Human Understanding", "Hume"):
+    #print url
diff --git a/src/database.py b/src/database.py
new file mode 100755
index 0000000..a19272c
--- /dev/null
+++ b/src/database.py
@@ -0,0 +1,62 @@
+#! /usr/bin/python2
+
+from couchdb import ResourceConflict
+# NOTE: localdb, the couchdb database handle used below, is assumed to be initialized elsewhere
+
+def truncate(docid):
+    """
+    Truncate a document id to 12 digits
+    The document ID should be based on a
+    hash of unique identifiers
+    """
+    return int(str(docid)[0:12])
+
+def createResource(textbookInfo, course, dept, coursecode, docid):
+    """
+    Create a document associated with a course
+    This document contains any/all resources associated
+    with that course
+
+    example,
+      {
+        'books': [],
+        'dept': 'COLLAB',
+        'code': '2C03',
+        'sections': [
+            {
+             'prof': 'Lisa Pender',
+             'sem': '2015/09/08 - 2015/12/08',
+             'day': 'Mo'
+            },
+            {
+             'prof': 'Staff',
+             'sem': '2015/09/08 - 2015/12/08',
+             'day': 'Th'
+            }
+        ],
+        'title': 'COLLAB 2C03 - Sociology I'
+      }
+    """
+    textbooks = textbookInfo(dept.strip(), coursecode.strip())
+
+    # We truncate the id so we can have nicer looking URLs
+    # Since the id will be used to point to the resource page for that course
+    _id = str(truncate(docid))
+
+    fields = {
+        "_id" : _id,
+        "textbooks" : textbooks,
+        "coursetitle" : "%s %s" % (dept.strip(), coursecode.strip()),
+        "courseinfo" : course
+        #"Syllabus" : "blah"
+    }
+    try:
+        revisions = list(localdb.revisions(_id))
+        if not revisions:
+            return localdb.save(fields)
+        else:
+            rev = dict(revisions[0])["_rev"]
+            fields["_rev"] = rev
+            return localdb.save(fields)
+    except ResourceConflict:
+        print "Resource for %s already exists, not creating a new one" % (docid)
diff --git a/src/goasearch.py b/src/goasearch.py
new file mode 100755
index 0000000..3dca7eb
--- /dev/null
+++ b/src/goasearch.py
@@ -0,0 +1,14 @@
+#! /usr/bin/python2
+
+# predictive data
+# switch to elasticsearch's prediction
+
+
+
+import database
+import predictions
+
+class GOASearch(object):
+    def __init__(self):
+        pass  # __init__ must return None; "return self" raises a TypeError at instantiation
+
diff --git a/src/openlibrary.py b/src/openlibrary.py
new file mode 100755
index 0000000..d558c21
--- /dev/null
+++ b/src/openlibrary.py
@@ -0,0 +1,24 @@
+#! /usr/bin/python2
+
+from urllib import quote
+from json import loads, dumps
+
+import requests as req
+
+#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s"
+searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s'
+
+def bookUrls(title, author):
+    print title, author
+    if ":" in title:
+        title = title.split(":")[0]
+    requrl = searchurl % (quote(author), quote(title))
+    results = loads(req.get(requrl).text)
+    for result in results["docs"][0:2]:
+        if result.has_key("edition_key"):
+            yield "https://openlibrary.org/books/%s" % result["edition_key"][0]
+
+# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle'
+
+#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"):
+    #print book
diff --git a/src/predictions.py b/src/predictions.py
new file mode 100755
index 0000000..b770a0b
--- /dev/null
+++ b/src/predictions.py
@@ -0,0 +1,153 @@
+#!
/usr/bin/python2 +from itertools import groupby, chain +from sys import stdout +from functools import partial +from json import dumps + +def gensymer(): + n = [0] + def inner(): + result = str(n[0]) + n[0] += 1 + return result + return inner + +gensym = gensymer() + +def printTrie(graph, prev, trie, weight): + new_node = str(gensym()) + graph.node(new_node, "%s" % trie.letter) + graph.edge(prev, new_node, label="%.2f" % weight) + if not trie.children: + return + for child, weight in zip(trie.children, trie.ws): + printTrie(graph, new_node, child, weight) + + +class Trie(object): + def __init__(self, letter, children, ws): + self.letter = letter + self.children = children + self.ws = ws + +def probweight(suffixes): + weights = [float(s["value"]) for s in suffixes] + s = float(sum(weights)) + ws = [w/s for w in weights] + return ws + +def buildtrie(trie, suffixes): + """ + Build a trie, also known as a prefix tree, of all the possible completions + """ + trie.children = [] + for letter, suffs in suffixes: + ped = partition(suffs) + if any(map(lambda p: p[0], ped)): + # check if there are any children + trie.children.append(buildtrie(Trie(letter, [], probweight(suffs)), partition(suffs))) + else: + # we've reached the end of this word so just include the final letter + # [1] = there is a probability of 1 of reaching this single leaf node, + # since it is the only possible completion here + trie.children.append(Trie(letter, [], [1])) + return trie + + +def keyf(x): + if not x["key"]: + return "" + return x["key"][0] + +def tails(words): + for word in words: + yield { + "key" : word["key"][1:], + "value" : word["value"] + } + +def partition(words): + """ + Partition the words into different prefixes based on the first character + """ + groups = [ + (g[0], list(tails(g[1]))) + for g in groupby( + sorted(words, key=keyf), + key=keyf) + ] + return groups + + +def flatten_helper(letter, trie): + return ([letter + child.letter for + child in trie.children], trie.children) + +def flatten(trie): + if not trie.children: + return trie.letter + prefixes, suffixes = flatten_helper(trie.letter, trie) + return [flatten(Trie(p, s2.children, s2.ws)) for p, s2 in zip(prefixes, suffixes)] + +def flattenlist(xs): + locs = [] + for x in xs: + if not isinstance(x, list): + locs.append(x) + else: + locs.extend(flattenlist(x)) + return locs + +def matchc(trie, prefix): + c = None + if len(prefix) > 1: + c = prefix[0] + else: + c = prefix + return [ch for ch in trie.children if ch.letter == c] + +def match(trie, word): + if not word: + return [] + m = matchc(trie, word[0]) + if not m: + return [] + else: + return [m[0]] + match(m[0], word[1:]) + +def complete(trie, word): + m = match(trie, word) + if len(word) != len(m): + return False + completions = [word+x[1:] for x in flattenlist(flatten(m[-1]))] + if len(completions) > 10: + return dumps(completions[0:10]) + return dumps(completions) + +def sortTrie(trie): + """ + Sort the children of each node in descending order + of the probability that each child would be the completion + of whatever that word is + """ + if not trie.children: + return + sortedChilds = sorted(zip(trie.children, trie.ws), key=lambda x: x[1], reverse=True) + trie.children = [x[0] for x in sortedChilds] + trie.ws = [x[1] for x in sortedChilds] + for child in trie.children: + sortTrie(child) + +def toTrie(words): + for word in words: + word["key"] = word["key"].lower() + trie = buildtrie(Trie("", [], [1]), partition(words)) + trie.ws = [1]*len(trie.children) + sortTrie(trie) + return trie + +def 
testkey(w): + return { + "key" : w, + "value" : "1" + } diff --git a/src/search.py b/src/search.py new file mode 100755 index 0000000..777222f --- /dev/null +++ b/src/search.py @@ -0,0 +1,237 @@ +#! /usr/bin/python2 + +import elasticsearch + +from elasticsearch_dsl import FacetedSearch, Search, Q +from elasticsearch_dsl.aggs import Terms, DateHistogram +from sys import exit, stderr +from json import dumps, loads +from itertools import chain, imap + +from hashlib import sha1 + +from textbookExceptions import UnIndexable + +from mcmaster.classes import allCourses + +# Generic instance of elasticsearch right now +es = elasticsearch.Elasticsearch() + +def summarize(text): + splitted = text.split(" ") + if len(splitted) > 4: + return " ".join(splitted[0:4]) + ".." + return text + +def sectionToJSON(section): + return { + "prof" : section.prof, + "sem" : section.sem, + "day" : section.day + } + +def classToJSON(clss): + return { + "title" : clss.title, + "sections" : map(sectionToJSON, clss.sections), + "dept" : clss.dept, + "code" : clss.code, + "books" : list(clss.books) if clss.books else [] + } + + +def truncate(docid): + """ + Truncate a document id to 12 digits + The document ID should be based on a + hash of unique identifiers + """ + return int(str(docid)[0:12]) + +def hashsec(course): + """ + Hash a course into a usable id + """ + if not course["code"]: + code = "" + else: + code = course["code"] + if not course["title"]: + title = "" + else: + title = course["title"] + + if not course["sections"] or len(course["sections"]) < 1: + course["sections"][0] = "" + + if not (code or title): + raise UnIndexable(course) + + h = sha1() + h.update(code + title + course["sections"][0]["sem"]) + return int(h.hexdigest(), 16) + +def createIndex(name): + """ + This creates a new index in elasticsearch + An index is like a schema in a regular database + Create an elasticsearch index + + """ + indices = elasticsearch.client.IndicesClient(es) + + print indices.create(name) + with open("./course.json", "r") as mapping: + print indices.put_mapping("course", loads(mapping.read()), name) + +def indexListing(course): + """ + Index a specific course in the database (using the courses index) + example, + { + 'books': [], + 'dept': 'COLLAB', + 'code': '2C03', + 'sections': [ + { + 'prof': 'Lisa Pender', + 'sem': '2015/09/08 - 2015/12/08', + 'day': 'Mo' + }, + { + 'prof': 'Staff', + 'sem': '2015/09/08 - 2015/12/08', + 'day': 'Th' + } + ], + 'title': 'COLLAB 2C03 - Sociology I' + } + + """ + courseID = hashsec(course) + print es.index(index="oersearch", + doc_type="course", + id=courseID, + body=course) + + # For every course we index, we also create a resource for it + # This should be an idempotent operation because we're putting it in couchdb + # And we're using the id obtained from the hash function, so it should just update the document + # no need to delete anything + #try: + #courseDept = course[0]["title"].strip().split(" ")[0].strip() + #courseCode = course[0]["title"].strip().split(" ")[1].strip() + #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode) + #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID) + #except: + #print "Couldn't create the resource associated with %s" % course + +def termSearch(field): + """ + Make a term search (exact match) + """ + def t(term): + q = Q("term", + **{ + "sections."+field : term + }) + return q + return t + +def search(field): + """ + Make a match search + """ + def s(term): + q = Q("match", + **{ + field : term + 
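+                  # a "match" query is analyzed by elasticsearch, so it is
+                  # case-insensitive and tolerant of partial phrases, unlike
+                  # the exact "term" queries built by termSearch above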
}) + return q + return s + +def join(x, y): + """ + Join two queries + """ + return x & y + +def filterSections(secs): + """ + Get rid of tutorial sections + because they almost always have "Staff" as the instructor + This is just a heuristic of course + """ + filtered = [s for s in secs.sections if "Staff" not in s.prof] + if len(filtered) > 0: + return filtered + return False + +def searchTerms(terms): + """ + Run a search for courses + """ + + # A list of all the queries we want to run + qs = [searchers[field](term) for + field, term in + terms.iteritems() if + term and searchers.has_key(field)] + + if not qs: + # No queries = no results + return dumps([]) + + # Reduce joins all of the queries into one query + # It will search for the conjunction of all of them + # So that means it cares about each query equally + q = reduce(join, qs) + + s = (Search(using=es, index="oersearch") + .query(q))[0:100] # only return up to 100 results for now + + results = s.execute() + + filtered = [ + (secs, filterSections(secs)[0].to_dict()) # get rid of tutorials + for secs in results + if filterSections(secs) + ] + results = [] + for obj, secs in filtered: + # Add the truncated course id + # This is used to point to the resource page for that course + secs["id"] = truncate(obj.meta.id) + secs["title"] = obj.title + if obj["dept"] not in secs["title"]: + secs["dept"] = obj.dept + if obj.books: + secs["books"] = [ + { + "booktitle" : summarize(book[0].encode("ASCII")), + "bookauthor" : book[1].encode("ASCII"), + "bookprice" : book[2].encode("ASCII") + } + for book in obj.books + ] + else: + secs["books"] = "" + results.append(secs) + + return dumps(results) + + +searchers = { + "title" : search("title"), + "loc" : search("loc"), + "time" : search("time"), + "prof" : search("prof"), + "day" : search("day"), + } + +#print searchTerms({"title" : "PHILOS"}) + +#for c in imap(classToJSON, allCourses()): + #try: + #print indexListing(c) + #except UnIndexable as e: diff --git a/src/textbookExceptions.py b/src/textbookExceptions.py new file mode 100644 index 0000000..999ff3e --- /dev/null +++ b/src/textbookExceptions.py @@ -0,0 +1,24 @@ +#! /usr/bin/python2 + +class UnIndexable(Exception): + def __init__(self, course): + self.course = course + + @property + def reason(self): + course = self.course + if not course["code"] and not course["title"]: + message = "there was no course code and no title defined" + if not course["code"]: + message = "there was no course code defined" + if not course["title"]: + message = "there was no course title defined" + if not course["sections"]: + message = "there were no sections defined" + return """ + There was a problem with indexing this course. + %s + There could be several reasons why, my best guess is that %s + We need at least the course code, title, and one or more sections to index + + """ % (course, message) diff --git a/src/visualize.py b/src/visualize.py new file mode 100755 index 0000000..b46a67d --- /dev/null +++ b/src/visualize.py @@ -0,0 +1,97 @@ +#! 
/usr/bin/python2 + +from json import loads, load +from re import sub, split +from itertools import groupby +from numpy import mean +from operator import attrgetter + +import pygal +import csv + +class Textbook(object): + def __init__(self, dept, code, title, author, price): + self.dept = dept + self.code = code + self.title = title + self.author = author + self.price = float(price) + + def __repr__(self): + return "Dept = %s, Code = %s, %s by %s, costs $%s" % (self.dept, + self.code, + self.title, + self.author, + self.price) + + +def courses(): + with open("./books.csv", "r") as books: + booksreader = csv.reader(books) + for row in booksreader: + yield row + + +def groupDept(courselist): + sortedCourses = sorted(courselist, key=attrgetter("dept")) + for course in groupby(sortedCourses, attrgetter("dept")): + yield course[0], list(course[1]) + +def meanPrice(books): + return mean([book.price for book in books]) + +# Questions, +# mean cost per department +# mean cost per faculty +# mean difference between book store copies and other copies per dept and faculty +# number of overlapping books per faculty, do eng students benefit from that? + +# maybe a survey for students to see how often they buy books from other sources +# correlate with how much they could be saving? + +facultyDesc = { + "hum" : "Humanities", + "bus" : "Business", + "hlth" : "Health Science", + "eng" : "Engineering", + "sci" : "Science", + "socsci" : "Social Sciences", + "artsci" : "Arts & Sciences", + "meld" : "MELD" +} + +faculties = load(open("./faculties.json")) + +def categorize(dept): + # faculties + return facultyDesc.get(faculties.get(dept, False), False) + +def byFaculty(): + for dept, books in groupDept(courses()): + yield (categorize(dept), dept, books) + +def meanFacultyCosts(): + byfac = list(byFaculty()) + graph = pygal.Bar() + graph.title = "Mean textbook cost by faculty" + sortedFacs = sorted(byfac, key=lambda x: x[0]) + for fac in groupby(sortedFacs, lambda x: x[0]): + graph.add(fac[0], meanPrice(list(fac[1])[0][2])) + graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None" + return graph.render(transpose=True) + +def meanCosts(): + cs = groupDept(courses()) + graph = pygal.Bar() + graph.title = "Mean textbook cost by department" + for c in cs: + dept, books = c + graph.add(dept, meanPrice(books)) + #graph.render_to_file("./test_graph.svg") + graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None" + return graph.render_table(style=True, transpose=True) + +for x in courses(): + print x +#print meanCosts() +#print meanFacultyCosts() diff --git a/src/website.py b/src/website.py new file mode 100755 index 0000000..1fc9374 --- /dev/null +++ b/src/website.py @@ -0,0 +1,148 @@ +#! 
/usr/bin/python2 +from functools import partial +from couchdb import ResourceConflict + +from flask import Flask, render_template, flash, request, send_from_directory +from flask_bootstrap import Bootstrap +from flask_appconfig import AppConfig +from urllib import unquote +from search import searchTerms + +from openlibrary import bookUrls + +from archive import searchIA +from urllib import quote, unquote +from json import dumps, loads + +from werkzeug.contrib.cache import MemcachedCache +cache = MemcachedCache(['127.0.0.1:11211']) + +import os + +def predict(fieldtype, term): + print fieldtype + print term + if not term: + return "[]" + else: + try: + cs = completers[fieldtype](term.lower()) + except KeyError: + return "[]" + if cs: + return cs + return "[]" + +def predictor(fieldtype): + def inner(request): + params = dict(request.args.items()) + return predict(fieldtype, params["term"]) + return inner + +def cacheit(key, thunk): + """ + Tries to find a cached version of ``key'' + If there is no cached version then it will + evaluate thunk (which must be a generator) + and cache that, then return the result + """ + cached = cache.get(quote(key)) + if cached is None: + result = list(thunk()) + cache.set(quote(key), result) + return result + return cached + +def ClassSearch(configfile=None): + defaults = {"Day", "Building", "Exact Location", "Department"} + app = Flask(__name__) + AppConfig(app, configfile) # Flask-Appconfig is not necessary, but + # highly recommend =) + # https://github.com/mbr/flask-appconfig + Bootstrap(app) + + app.config["scripts"] = "/home/wes/MGOAL/scripts" + app.config["styles"] = "/home/wes/MGOAL/styles" + + @app.route('/favicon.ico') + def favicon(): + return send_from_directory("/srv/http/goal/favicon.ico", + 'favicon.ico', mimetype='image/vnd.microsoft.icon') + + + @app.route("/buildpred", methods=("GET", "POST")) + def buildpred(): + return predictbuild(request) + + @app.route("/locpred", methods=("GET", "POST")) + def locpred(): + return predictloc(request) + + @app.route("/daypred", methods=("GET", "POST")) + def daypred(): + return predictday(request) + + @app.route("/deptpred", methods=("GET", "POST")) + def deptpred(): + return predictdept(request) + + @app.route("/titlepred", methods=("GET", "POST")) + def titlepred(): + return predicttitle(request) + + @app.route("/", methods=("GET", "POST")) + def index(): + return render_template("search.html") + + @app.route("/fc", methods=("GET", "POST")) + def fc(): + """ Filter Courses """ + print "trying to get courses" + params = dict(request.args.items()) + for key, val in params.iteritems(): + if val in defaults: + del params[key] + results = searchTerms(params) + return results + + @app.route("/resources", methods=("GET", "POST")) + def resources(): + """ Get Resources """ + notRequired = False + params = loads(dict(request.args.items())["data"]) + print params + author = params["author"] + title = params["title"] + + if ("No Textbooks" in title or + "No Adoption" in title): + return dumps("false") + + # Cache the result of the open library search + openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author)) + print openlib + + # cache the result of an internet archive search + iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author)) + print iarchive + + if not (any(openlib) or any(iarchive)): + # We literally could not find ANYTHING + return dumps("false") + + return dumps({ + "iarchive" : iarchive, + "openlib" : openlib + }) + + @app.route("/scripts/") + def 
send_script(filename): + return send_from_directory(app.config["scripts"], filename) + + @app.route("/styles/") + def send_style(filename): + return send_from_directory(app.config["styles"], filename) + return app + +if __name__ == "__main__": + ClassSearch().run(port=8001, debug=True) diff --git a/textbookExceptions.py b/textbookExceptions.py new file mode 100644 index 0000000..999ff3e --- /dev/null +++ b/textbookExceptions.py @@ -0,0 +1,24 @@ +#! /usr/bin/python2 + +class UnIndexable(Exception): + def __init__(self, course): + self.course = course + + @property + def reason(self): + course = self.course + if not course["code"] and not course["title"]: + message = "there was no course code and no title defined" + if not course["code"]: + message = "there was no course code defined" + if not course["title"]: + message = "there was no course title defined" + if not course["sections"]: + message = "there were no sections defined" + return """ + There was a problem with indexing this course. + %s + There could be several reasons why, my best guess is that %s + We need at least the course code, title, and one or more sections to index + + """ % (course, message) diff --git a/visualize.py b/visualize.py new file mode 100755 index 0000000..b46a67d --- /dev/null +++ b/visualize.py @@ -0,0 +1,97 @@ +#! /usr/bin/python2 + +from json import loads, load +from re import sub, split +from itertools import groupby +from numpy import mean +from operator import attrgetter + +import pygal +import csv + +class Textbook(object): + def __init__(self, dept, code, title, author, price): + self.dept = dept + self.code = code + self.title = title + self.author = author + self.price = float(price) + + def __repr__(self): + return "Dept = %s, Code = %s, %s by %s, costs $%s" % (self.dept, + self.code, + self.title, + self.author, + self.price) + + +def courses(): + with open("./books.csv", "r") as books: + booksreader = csv.reader(books) + for row in booksreader: + yield row + + +def groupDept(courselist): + sortedCourses = sorted(courselist, key=attrgetter("dept")) + for course in groupby(sortedCourses, attrgetter("dept")): + yield course[0], list(course[1]) + +def meanPrice(books): + return mean([book.price for book in books]) + +# Questions, +# mean cost per department +# mean cost per faculty +# mean difference between book store copies and other copies per dept and faculty +# number of overlapping books per faculty, do eng students benefit from that? + +# maybe a survey for students to see how often they buy books from other sources +# correlate with how much they could be saving? 
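# NB: courses() above yields raw CSV rows (lists of strings), while
# groupDept() and meanPrice() expect Textbook objects; an adapter along
# these lines is assumed to bridge the two (the column order is a guess
# from Textbook's constructor):
def rowsToTextbooks(rows):
    for dept, code, title, author, price in rows:
        yield Textbook(dept, code, title, author, price)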
+ +facultyDesc = { + "hum" : "Humanities", + "bus" : "Business", + "hlth" : "Health Science", + "eng" : "Engineering", + "sci" : "Science", + "socsci" : "Social Sciences", + "artsci" : "Arts & Sciences", + "meld" : "MELD" +} + +faculties = load(open("./faculties.json")) + +def categorize(dept): + # faculties + return facultyDesc.get(faculties.get(dept, False), False) + +def byFaculty(): + for dept, books in groupDept(courses()): + yield (categorize(dept), dept, books) + +def meanFacultyCosts(): + byfac = list(byFaculty()) + graph = pygal.Bar() + graph.title = "Mean textbook cost by faculty" + sortedFacs = sorted(byfac, key=lambda x: x[0]) + for fac in groupby(sortedFacs, lambda x: x[0]): + graph.add(fac[0], meanPrice(list(fac[1])[0][2])) + graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None" + return graph.render(transpose=True) + +def meanCosts(): + cs = groupDept(courses()) + graph = pygal.Bar() + graph.title = "Mean textbook cost by department" + for c in cs: + dept, books = c + graph.add(dept, meanPrice(books)) + #graph.render_to_file("./test_graph.svg") + graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None" + return graph.render_table(style=True, transpose=True) + +for x in courses(): + print x +#print meanCosts() +#print meanFacultyCosts() diff --git a/website.py b/website.py new file mode 100755 index 0000000..1fc9374 --- /dev/null +++ b/website.py @@ -0,0 +1,148 @@ +#! /usr/bin/python2 +from functools import partial +from couchdb import ResourceConflict + +from flask import Flask, render_template, flash, request, send_from_directory +from flask_bootstrap import Bootstrap +from flask_appconfig import AppConfig +from urllib import unquote +from search import searchTerms + +from openlibrary import bookUrls + +from archive import searchIA +from urllib import quote, unquote +from json import dumps, loads + +from werkzeug.contrib.cache import MemcachedCache +cache = MemcachedCache(['127.0.0.1:11211']) + +import os + +def predict(fieldtype, term): + print fieldtype + print term + if not term: + return "[]" + else: + try: + cs = completers[fieldtype](term.lower()) + except KeyError: + return "[]" + if cs: + return cs + return "[]" + +def predictor(fieldtype): + def inner(request): + params = dict(request.args.items()) + return predict(fieldtype, params["term"]) + return inner + +def cacheit(key, thunk): + """ + Tries to find a cached version of ``key'' + If there is no cached version then it will + evaluate thunk (which must be a generator) + and cache that, then return the result + """ + cached = cache.get(quote(key)) + if cached is None: + result = list(thunk()) + cache.set(quote(key), result) + return result + return cached + +def ClassSearch(configfile=None): + defaults = {"Day", "Building", "Exact Location", "Department"} + app = Flask(__name__) + AppConfig(app, configfile) # Flask-Appconfig is not necessary, but + # highly recommend =) + # https://github.com/mbr/flask-appconfig + Bootstrap(app) + + app.config["scripts"] = "/home/wes/MGOAL/scripts" + app.config["styles"] = "/home/wes/MGOAL/styles" + + @app.route('/favicon.ico') + def favicon(): + return send_from_directory("/srv/http/goal/favicon.ico", + 'favicon.ico', mimetype='image/vnd.microsoft.icon') + + + @app.route("/buildpred", methods=("GET", "POST")) + def buildpred(): + return predictbuild(request) + + @app.route("/locpred", methods=("GET", "POST")) + def locpred(): + return predictloc(request) + + @app.route("/daypred", methods=("GET", "POST")) + def daypred(): + return 
predictday(request)
+
+    @app.route("/deptpred", methods=("GET", "POST"))
+    def deptpred():
+        return predictdept(request)
+
+    @app.route("/titlepred", methods=("GET", "POST"))
+    def titlepred():
+        return predicttitle(request)
+
+    @app.route("/", methods=("GET", "POST"))
+    def index():
+        return render_template("search.html")
+
+    @app.route("/fc", methods=("GET", "POST"))
+    def fc():
+        """ Filter Courses """
+        print "trying to get courses"
+        params = dict(request.args.items())
+        for key, val in params.items(): # items() returns a copy, so deleting keys inside the loop is safe
+            if val in defaults:
+                del params[key]
+        results = searchTerms(params)
+        return results
+
+    @app.route("/resources", methods=("GET", "POST"))
+    def resources():
+        """ Get Resources """
+        notRequired = False
+        params = loads(dict(request.args.items())["data"])
+        print params
+        author = params["author"]
+        title = params["title"]
+
+        if ("No Textbooks" in title or
+            "No Adoption" in title):
+            return dumps("false")
+
+        # Cache the result of the open library search
+        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
+        print openlib
+
+        # cache the result of an internet archive search
+        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
+        print iarchive
+
+        if not (any(openlib) or any(iarchive)):
+            # We literally could not find ANYTHING
+            return dumps("false")
+
+        return dumps({
+            "iarchive" : iarchive,
+            "openlib" : openlib
+        })
+
+    @app.route("/scripts/<path:filename>")
+    def send_script(filename):
+        return send_from_directory(app.config["scripts"], filename)
+
+    @app.route("/styles/<path:filename>")
+    def send_style(filename):
+        return send_from_directory(app.config["styles"], filename)
+    return app
+
+if __name__ == "__main__":
+    ClassSearch().run(port=8001, debug=True)
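A minimal usage sketch for the completion trie, assuming predictions.py above is importable as predictions: toTrie builds the weighted prefix tree from {key, value} records (testkey wraps a bare word in that shape), and complete returns a JSON-encoded list of up to ten completions, or False when the prefix matches nothing.

#! /usr/bin/python2
from predictions import toTrie, complete, testkey

# index a few lowercase words, each with the default weight of "1"
trie = toTrie([testkey(w) for w in ["philosophy", "physics", "psychology"]])

print complete(trie, "ph")   # -> ["philosophy", "physics"]
print complete(trie, "psy")  # -> ["psychology"]
print complete(trie, "zz")   # -> False (no indexed word starts with "zz")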