24 changed files with 2062 additions and 8 deletions
@ -0,0 +1,34 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from urllib import quote |
||||
|
from json import loads, dumps |
||||
|
|
||||
|
import requests as req |
||||
|
|
||||
|
# Internet Archive advanced-search URL template; {0} receives the URL-quoted
# query text. The request asks for output=json with callback=callback, and
# searchIA strips a 9-character prefix and a 1-character suffix from the
# response body before parsing it -- which matches a callback(...) wrapper.
searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw"
||||
|
|
||||
|
def searchIA(title, author):
    """
    Do a search on The Internet Archive for a book

    title and author are joined with a space, URL-quoted and sent as the
    query. Returns a list of at most three
    https://archive.org/details/<identifier> URLs, or an empty list when the
    response body is not valid JSON or contains no documents.
    """
    print("running a search")
    requrl = searchUrl.format(quote(title + " " + author))
    try:
        # Drop the 9-character prefix and the final character that wrap the
        # JSON payload (the request asks for callback=callback)
        results = loads(req.get(requrl).text[9:-1])
    except ValueError:
        return []

    # responseHeader.params.rows only echoes the rows=50 we sent, so it can
    # never tell us that nothing matched; inspect the returned documents.
    docs = results.get("response", {}).get("docs", [])
    if not docs:
        print("Couldn't find results for %s %s" % (title, author))
        return []

    return ["https://archive.org/details/%s" % doc["identifier"]
            for doc in docs[0:3]]
||||
|
|
||||
|
|
||||
|
# Example, search for David Hume's Enquiry Concerning Human Understanding |
||||
|
#for url in searchIA("Hume", "Enquiry Concerning Human Understanding"): |
||||
|
#print url |
@ -0,0 +1,62 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from sys import argv |
||||
|
from hashlib import sha1 |
||||
|
|
||||
|
def truncate(docid):
    """
    Shorten a document id to its leading 12 characters

    The id is rendered with str(), cut to at most 12 characters and
    returned as an integer. The id is expected to be a number derived
    from a hash of unique identifiers.
    """
    leading = str(docid)[:12]
    return int(leading)
||||
|
|
||||
|
def createResource(textbookInfo, course, dept, coursecode, docid):
    """
    Save the resource document that goes with one course

    textbookInfo is called with the stripped department and course code and
    its result is stored under "textbooks". course is stored as given under
    "courseinfo", for example

    {
        'books': [],
        'dept': 'COLLAB',
        'code': '2C03',
        'sections': [
            {'prof': 'Lisa Pender', 'sem': '2015/09/08 - 2015/12/08', 'day': 'Mo'},
            {'prof': 'Staff', 'sem': '2015/09/08 - 2015/12/08', 'day': 'Th'}
        ],
        'title': 'COLLAB 2C03 - Sociology I'
    }

    Returns whatever localdb.save returns, or None (after printing a
    message) when ResourceConflict is raised.

    NOTE(review): localdb and ResourceConflict are neither defined nor
    imported in the visible part of this file -- confirm where they come from.
    """
    department = dept.strip()
    code = coursecode.strip()
    textbooks = textbookInfo(department, code)

    # The id is truncated so the URL of the course's resource page,
    # which is built from it, stays short
    _id = str(truncate(docid))

    fields = {
        "_id" : _id,
        "textbooks" : textbooks,
        "coursetitle" : "%s %s" % (department, code),
        "courseinfo" : course
    }
    try:
        revisions = list(localdb.revisions(_id))
        if revisions:
            # A document with this id is already stored: reuse the _rev of
            # the first listed revision for the save
            fields["_rev"] = dict(revisions[0])["_rev"]
        return localdb.save(fields)
    except ResourceConflict:
        print("Resource for %s already exists, not creating a new one" % (docid))
@ -0,0 +1,14 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
# predictive data |
||||
|
# switch to elasticsearch's prediction |
||||
|
|
||||
|
|
||||
|
|
||||
|
import database |
||||
|
import predictions |
||||
|
|
||||
|
class GOASearch(object):
    """
    Search front end; currently an empty shell with no state
    """
    def __init__(self):
        # __init__ has to return None. Returning self made every
        # GOASearch() call fail with TypeError.
        pass
||||
|
|
@ -0,0 +1,349 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from sys import argv |
||||
|
from itertools import chain, islice, izip as zip |
||||
|
from re import search, sub |
||||
|
from functools import total_ordering |
||||
|
|
||||
|
from sylla import textbookInfo |
||||
|
from collections import MutableMapping |
||||
|
|
||||
|
import datetime as dt |
||||
|
import lxml.html as lxh |
||||
|
import requests |
||||
|
import sys |
||||
|
import copy |
||||
|
|
||||
|
# Term codes; MosReq substitutes one of these into the request payloads as the
# CLASS_SRCH_WRK2_STRM value. Presumably Mosaic's identifiers for the
# 2015/2016 academic year -- verify before reusing them for another year.
fall = "2159"
spring_summer = "2165"
winter = "2161"
||||
|
|
||||
|
# threading stuff |
||||
|
import Queue as q |
||||
|
import threading as thd |
||||
|
|
||||
|
# Page MosReq.__init__ fetches first so that the server sets its cookies
baseurl = "https://applicants.mcmaster.ca/psp/prepprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL"

# Endpoint every class-search GET/POST is sent to
searchurl = "https://csprd.mcmaster.ca/psc/prcsprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL"

# Headers sent with the initial page load and with every POST: a browser
# User-Agent and the form-encoded content type the payload strings are in
custom_headers = {
    "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0",
    "Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8",
}
||||
|
|
||||
|
# Form-encoded POST bodies, filled in with str.format.
# NOTE(review): each template carries a hard-coded ICSID value; it looks like a
# captured session token -- confirm the server keeps accepting it.

# Used by MosReq.getCodes, first POST. {0} = ICStateNum, {1} = term code
courseCodes1 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_SUBJ_SRCH%240&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=5tq9x%2Fjt42mf62Sh5z%2BrjxT0gT15kiIyQ2cecCSmRB4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}"

# Used by MosReq.getCodes, second POST. {0} = ICStateNum, {1} = letter, {2} = term code
courseCodes2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=SSR_CLSRCH_WRK2_SSR_ALPHANUM_{1}&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=vIUgl6ZXw045S07EPbQw4RDzv7NmKCDdJFdT4CTRQNM%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={2}"

# Used by MosReq.getlist when the first answer contains
# "Your search will return over". {0} = ICStateNum, {1} = term code
payload2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=%23ICSave&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}"

# Used by MosReq.getlist for the class search itself.
# {0} = ICStateNum, {1} = subject, {2} = term code
payload = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&SSR_CLSRCH_WRK_SUBJECT$75$$0={1}&CLASS_SRCH_WRK2_STRM$45$={2}"
||||
|
|
||||
|
|
||||
|
# Calendar year and month at import time (not referenced by the code
# visible in this file)
year = dt.date.today().year
month = dt.date.today().month
||||
|
|
||||
|
# Two-letter weekday abbreviations mapped to their position in a week that
# starts on Monday; Section.__gt__ uses these numbers to order sections by day
days = {
    "Mo" : 0,
    "Tu" : 1,
    "We" : 2,
    "Th" : 3,
    "Fr" : 4,
    "Sa" : 5,
    "Su" : 6
}
||||
|
|
||||
|
# Alternative spellings for each day abbreviation, space separated.
# "T" is what Section.day evaluates to for a "TBA" time.
# NOTE(review): not referenced by the code visible in this file --
# presumably used when building search documents; confirm.
day_descs = {
    "Mo" : "Monday Mon Mo",
    "Tu" : "Tuesday Tues Tu Tue",
    "We" : "Wednesday Wed We",
    "Th" : "Thursday Th Thurs",
    "Fr" : "Friday Fr Fri",
    "Sa" : "Saturday Sat Sa",
    "Su" : "Sunday Su Sun",
    "T" : "TBA"
}
||||
|
|
||||
|
def timeparse(time):
    """
    Parse a 12-hour clock time such as "9:30AM" or "12:05PM" into numbers

    Returns a tuple of strings (hour, minutes, half): hour is on the
    24-hour clock, minutes has no leading zero, and half is the "AM"/"PM"
    marker exactly as it appeared in the input.
    """
    # Split on the colon rather than guessing the layout from the total
    # length; this handles one- and two-digit hours the same way
    hour_text, _, rest = time.partition(":")
    hour = int(hour_text)
    minutes = int(rest[0:2])
    half = rest[2:4]

    if half == "PM":
        if hour < 12:
            hour = hour + 12
    elif half == "AM" and hour == 12:
        # 12:xxAM is the first hour of the day, not noon
        hour = 0

    return (str(hour), str(minutes), half)
||||
|
|
||||
|
class Class(object):
    """
    A course listing: department, display title and list of sections

    Iterating a Class yields one (title, section) pair per section.
    """
    def __init__(self, dept, title, sections):
        self.title = title.encode("UTF-8")
        self.dept = dept
        self.sections = sections

    def __repr__(self):
        return repr((self.title, self.sections))

    def __iter__(self):
        for sec in self.sections:
            yield (self.title, sec)

    def hasCode(self):
        """
        True when the title starts with two upper-case words
        (department and course code)
        """
        words = self.title.strip().split(" ")
        if len(words) < 2:
            return False
        first, second = words[0], words[1]
        return first == first.upper() and second == second.upper()

    @property
    def code(self):
        """
        Second word of the title, or False when hasCode() fails
        """
        if not self.hasCode():
            return False
        return self.title.strip().split(" ")[1].strip()

    @property
    def books(self):
        """
        Textbooks (with prices) for this course, or False when the
        department or the code is missing. Calls textbookInfo on every access.
        """
        if not (self.dept and self.code):
            return False
        return textbookInfo(self.dept, self.code, withPrices=True)
||||
|
|
||||
|
@total_ordering
class Section(dict):
    """
    One scheduled meeting of a class: time, location, instructor, semester

    time is either "TBA" or four whitespace-separated fields
    "<days> <start> <separator> <end>", e.g. "MoWe 9:30AM - 10:20AM",
    where <days> is a run of two-letter day abbreviations.
    """
    def __init__(self, time, loc, prof, sem):
        self.time = time.encode("UTF-8")
        self.loc = loc.encode("UTF-8")
        self.prof = prof.encode("UTF-8")
        self.sem = sem.encode("UTF-8")
        # Cache of the last parsed date, and an optional single-day override
        # (set from outside once a multi-day section has been split up)
        self._date = False
        self._day = False

    @property
    def date(self):
        """
        (day, parsed start, parsed end) for a scheduled section, or the raw
        time string when it is "TBA". day is the _day override when one is
        set, otherwise a list of two-letter abbreviations.
        """
        if self.time != "TBA":
            day, start, _, end = self.time.split()

            if self._day:
                assert len(self._day) == 2
                day = self._day
            else:
                day = [day[n:n+2] for n in xrange(0, len(day)-1, 2)]

            self._date = (day, timeparse(start), timeparse(end))

            return self._date

        return self.time

    @property
    def day(self):
        return self.date[0]

    @property
    def start(self):
        # Hour and minute strings glued together, e.g. "930"
        return self.date[1][0] + self.date[1][1]

    def _start_key(self):
        # Numeric (hour, minute) of the start time. Comparing the glued
        # strings from start would put "930" after "1030".
        parsed = self.date[1]
        return (int(parsed[0]), int(parsed[1]))

    def __repr__(self):
        return ("""
            Time = %s, Location = %s, Instructor = %s, Semester Running = %s
            """ % (self.date, self.loc, self.prof, self.sem))

    def __gt__(self, x):
        """
        Order by weekday first, then by start time. Sections without a
        single assigned day cannot be ordered; TBA sections never compare
        greater.
        """
        if isinstance(self.day, list):
            raise NotImplementedError

        if (self.date == "TBA" or
            x.date == "TBA"):
            return False

        if self.day != x.day:
            return days[self.day] > days[x.day]
        return self._start_key() > x._start_key()

    def __eq__(self, x):
        return (x.date == self.date and
                x.prof == self.prof and
                x.loc == self.loc and
                x.sem == self.sem)
||||
|
|
||||
|
|
||||
|
def getStateNum(html):
    """
    Pull the ICStateNum value out of a Mosaic page

    Returns the value of the first <input name="ICStateNum"> element and
    raises IndexError when the page has none. According to the original
    note this number is unique to each requester.
    """
    tree = lxh.fromstring(html)
    state_inputs = tree.xpath(".//input[@name=\"ICStateNum\"]")
    return state_inputs[0].value
||||
|
|
||||
|
def parseSection(section):
    """
    Turn one row of the class-search results into a Section

    The row must contain exactly four cells, read in order as time,
    location, instructor and semester.
    """
    cols = section.xpath(".//td")
    assert len(cols) == 4
    # Pass the cell text on without encoding it here: Section.__init__
    # already encodes every field to UTF-8, and on Python 2 encoding an
    # already-encoded byte string raises as soon as it holds a non-ASCII
    # character (e.g. an accented instructor name).
    time, loc, prof, sem = [col.text_content().strip() for col in cols]

    return Section(time, loc, prof, sem)
||||
|
|
||||
|
def getSectionInfo(table):
    """
    Yield a parsed Section for every row of table whose id attribute
    contains SSR_CLSRCH
    """
    for row in table.xpath(".//tr"):
        row_ids = row.xpath("@id")
        if not row_ids:
            continue
        if search(r"SSR_CLSRCH", row_ids[0]):
            yield parseSection(row)
||||
|
|
||||
|
def parseColumns(subject, html):
    """
    Pair each class title on a results page with the sections listed for it

    Returns a lazy sequence of ((subject, title), [Section, ...]) pairs.
    Titles come from spans whose id contains DERIVED_CLSRCH_DESCR; sections
    come from tables whose id matches ICField<digits>$scroll, with the first
    such table skipped.
    """
    parsed = lxh.fromstring(html)
    tables = parsed.xpath(".//table")
    spans = parsed.xpath(".//span")

    def is_scroll_table(table):
        ids = table.xpath("@id")
        return ids and search(r"ICField[0-9]+\$scroll", ids[0])

    def is_class_title(span):
        ids = span.xpath("@id")
        return ids and search(r"DERIVED_CLSRCH_DESCR", ids[0])

    scroll_tables = (table for table in tables if is_scroll_table(table))
    classInfo = (list(getSectionInfo(table))
                 for table in islice(scroll_tables, 1, sys.maxint))

    classNames = ((subject, span.text_content().strip())
                  for span in spans if is_class_title(span))

    return zip(classNames, classInfo)
||||
|
|
||||
|
def getCodes(html):
    """
    Lazily produce the UTF-8 encoded text of every span whose id matches
    SSR_CLSRCH_SUBJ_SUBJECT$<digits> (the subject codes on a listing page)
    """
    parsed = lxh.fromstring(html)

    def is_subject(span):
        ids = span.xpath("@id")
        return ids and search(r"SSR_CLSRCH_SUBJ_SUBJECT\$[0-9]+", ids[0])

    return (span.text_content().encode("UTF-8")
            for span in parsed.xpath("//span")
            if is_subject(span))
||||
|
|
||||
|
class MosReq(object):
    """
    Client for the Mosaic class search, bound to one semester

    Creating an instance loads baseurl once so the server can set its
    cookies; every later request reuses those cookies. The order of the
    requests inside each method matters: each POST needs the ICStateNum
    read from the previous response.
    """
    def __init__(self, semester):
        self.semester = semester
        s = requests.Session()
        # The body of this response is not used; the request is made for
        # the cookies it leaves in the session
        resp = s.get(baseurl, allow_redirects=True, headers=custom_headers).content

        # Let the server set some cookies before doing the searching
        cookies = {}
        for key, val in s.cookies.iteritems():
            cookies[key] = val
        self.cookies = cookies
        # Last ICStateNum seen; False until the first page has been read
        self.statenum = False
        # Cache for the codes property
        self.codes_ = []

    def getlist(self, subject):
        """
        Return the raw results page for one subject code in this semester
        """
        sys.stderr.write("Getting " + subject + "\n")
        first_req = requests.get(searchurl, cookies=self.cookies).content
        # for some reason Mosaic wants us to request it twice, ??????????????????
        self.statenum = getStateNum(first_req)
        first_req = requests.post(searchurl,
                                  data=payload.format(self.statenum, subject, self.semester),
                                  cookies=self.cookies,
                                  allow_redirects=False,
                                  headers=custom_headers).content
        # we make a first request to get the ICStateNum in case it thinks there are too many results
        try:
            self.statenum = getStateNum(first_req)
        except IndexError:
            # No ICStateNum input in the answer: keep the previous one
            pass
        if "Your search will return over" in first_req:
            # The server is asking for confirmation of a large result set;
            # the follow-up POST with payload2 returns the actual results
            return requests.post(searchurl,
                                 data=payload2.format(self.statenum, self.semester),
                                 cookies=self.cookies,
                                 allow_redirects=False,
                                 headers=custom_headers).content
        else:
            return first_req

    def classes(self, subject):
        """
        Return the parsed ((subject, title), sections) pairs for a subject
        """
        return list(parseColumns(subject, self.getlist(subject)))

    def getCodes(self, letter):
        """
        Return the subject codes listed under one letter, as a lazy sequence
        """
        sys.stderr.write("Getting letter " + letter + "\n")
        first_req = requests.get(searchurl, cookies=self.cookies).content
        self.statenum = getStateNum(first_req)

        # First POST (courseCodes1); its answer carries the ICStateNum
        # needed for the per-letter request below
        self.statenum = getStateNum(requests.post(searchurl,
                                                  data=courseCodes1.format(self.statenum, self.semester),
                                                  cookies=self.cookies,
                                                  headers=custom_headers).content)

        return getCodes(requests.post(searchurl,
                                      data=courseCodes2.format(self.statenum, letter, self.semester),
                                      cookies=self.cookies,
                                      allow_redirects=False,
                                      headers=custom_headers).content)

    @property
    def codes(self):
        """
        All subject codes for the letters A to Z, fetched on first access
        and cached in codes_ afterwards
        """
        if not self.codes_:
            # 65..90 are the character codes of "A".."Z"
            self.codes_ = list(chain.from_iterable(
                map((lambda l:
                     self.getCodes(chr(l))),
                    xrange(65, 91))))
        return self.codes_
||||
|
|
||||
|
def request(codes, lists, semester):
    """
    Worker body: take subject codes off the codes queue and put the parsed
    class list for each one on the lists queue

    codes and lists are Queue objects shared between the worker threads.
    Every code taken from codes is marked with task_done(), whether or not
    fetching it succeeded, so that codes.join() in the caller can return.
    """
    requester = MosReq(semester)
    while True:
        try:
            # get_nowait avoids the gap between empty() and a blocking
            # get(): with several workers another thread can take the last
            # code in between, and get() would then wait forever
            code = codes.get_nowait()
        except q.Empty:
            return
        try:
            lists.put(requester.classes(code))
        except Exception as err:
            # Skip the failing subject and keep going. If each worker quit
            # on its first error, the codes still queued would never be
            # marked done and codes.join() would hang.
            sys.stderr.write("Skipping %s: %r\n" % (code, err))
        finally:
            codes.task_done()
||||
|
|
||||
|
|
||||
|
class CourseInfo(object):
    """
    Fetches every class of one semester with a pool of worker threads
    """
    def __init__(self, threadcount, semester):
        self.threadcount = threadcount
        self.semester = semester
        # Subject codes, looked up on first use
        self._codes = False

    @property
    def codes(self):
        if self._codes:
            return self._codes
        self._codes = MosReq(self.semester).codes
        return self._codes

    def classes(self):
        """
        Generator of Class objects, one per class found

        A section that meets on several days is replaced by one copy per
        day (each copy's _day set to that day); every other section gets
        its _day set to the first element of its day value. The sections of
        each class are passed through sorted().
        """
        pending = q.Queue()
        for code in self.codes:
            pending.put(code)
        fetched = q.Queue()

        workers = []
        for _ in range(self.threadcount):
            worker = thd.Thread(group=None, target=request,
                                args=(pending, fetched, self.semester))
            workers.append(worker)
            worker.start()

        # Wait until every code is marked done, then for the threads to exit
        pending.join()
        for worker in workers:
            worker.join()

        batches = []
        while not fetched.empty():
            batches.append(fetched.get())

        for (dept, name), secs in chain.from_iterable(batches):
            expanded = []
            for sec in secs:
                sec_days = sec.day
                if len(sec_days) > 1:
                    for one_day in sec_days:
                        clone = copy.deepcopy(sec)
                        clone._day = one_day
                        expanded.append(clone)
                else:
                    sec._day = sec_days[0]
                    expanded.append(sec)
            yield Class(dept, sub("\xa0+", "", name), sorted(expanded))
||||
|
|
||||
|
def getCourses(semester, threadcount=10):
    """
    Return the generator of Class objects for one semester, fetched with
    threadcount worker threads
    """
    info = CourseInfo(threadcount, semester)
    return info.classes()
||||
|
|
||||
|
def allCourses():
    """
    Chain the courses of the fall, winter and spring/summer terms, in that
    order, into one lazy sequence (10 worker threads per term)
    """
    terms = (fall, winter, spring_summer)
    per_term = (getCourses(term, threadcount=10) for term in terms)
    return chain.from_iterable(per_term)
||||
|
|
||||
|
#for course in allCourses(): |
||||
|
#sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, course.books)) |
||||
|
#print course.sections |
@ -0,0 +1,9 @@ |
|||||
|
from oersearch import Search |
||||
|
from classes import getCourses |
||||
|
from sylla import getTextbooks |
||||
|
|
||||
|
# Driver script: runs at import time.
# NOTE(review): Search comes from oersearch, which is not visible here; the
# comments below describe only what these calls pass in.

# Search object labelled "McMaster"
mcmasterSearch = Search("McMaster")

# Hand the search the course-listing function from classes
mcmasterSearch.setup(getCourses)

mcmasterSearch.run()
@ -0,0 +1,117 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from sys import argv |
||||
|
from itertools import chain, islice, izip_longest, izip as zip |
||||
|
from re import search, sub |
||||
|
from functools import total_ordering |
||||
|
from re import sub |
||||
|
|
||||
|
import datetime as dt |
||||
|
import lxml.html as lxh |
||||
|
import requests |
||||
|
|
||||
|
# The purpose of this module is to download and parse syllabi from various departments
# so that they can be correlated with individual courses
||||
|
|
||||
|
class Price(object):
    """
    A price taken from a search result: an amount plus a status label
    """
    def __init__(self, amnt, status):
        # The first character of amnt is dropped before conversion
        # (presumably a currency sign -- confirm against the store pages)
        amount_text = amnt[1:]
        self.dollars = float(amount_text)
        self.status = status

    def __repr__(self):
        return "$%r %s" % (self.dollars, self.status)
||||
|
|
||||
|
|
||||
|
class Book(object):
    """
    A textbook: a title together with its price
    """
    def __init__(self, title, price):
        self.price = price
        self.title = title

    def __repr__(self):
        # Rendered as a two-element, JSON-like list of strings
        return '["%s", "%r"]' % (self.title, self.price)
||||
|
|
||||
|
|
||||
|
def grouper(n, iterable, fillvalue=None):
    """
    Collect iterable into tuples of n items, padding the last tuple with
    fillvalue: grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    """
    # n references to one shared iterator: each output tuple takes the
    # next n items
    shared = iter(iterable)
    return izip_longest(*([shared] * n), fillvalue=fillvalue)
||||
|
|
||||
|
# Campus store textbook search; the two %s slots receive the department
# and the course code, in that order (see books())
searchUrl = "https://campusstore.mcmaster.ca/cgi-mcm/ws/txsub.pl?wsDEPTG1=%s&wsDEPTDESC1=&wsCOURSEG1=%s&crit_cnt=1"
||||
|
|
||||
|
def normalize(word):
    """
    Capitalise a word: first character upper-case, the rest lower-case

    A word of one character or fewer is returned unchanged.
    """
    if len(word) <= 1:
        return word
    head = word[0].upper()
    rest = "".join(word[1:]).lower()
    return "%s%s" % (head, rest)
||||
|
|
||||
|
def parseAuthor(author):
    """
    Swap the first two space-separated words of author
    ("Last First" becomes "First Last"); anything after the second word is
    dropped. A single word is returned as it is.
    """
    parts = author.split(" ")
    if len(parts) < 2:
        return author
    lastname, firstname = parts[0], parts[1]
    return "%s %s" % (firstname, lastname)
||||
|
|
||||
|
def normwords(phrase):
    """
    Apply normalize() to every space-separated word of phrase and join
    the words back together with single spaces
    """
    return " ".join(normalize(word) for word in phrase.split(" "))
||||
|
|
||||
|
def books(dept, code, withPrices):
    """
    Yield the textbooks the campus store lists for a course

    dept and code are substituted into searchUrl. Each result is a tuple
    (title, author) with both texts passed through normwords, plus
    repr(price) as a third element when withPrices is true. The price is
    None for a book the page shows no price for.
    """
    html = requests.get(searchUrl % (dept, code)).text
    parsed = lxh.fromstring(html)

    # prices() returns its list reversed, so pop() hands the prices out in
    # page order
    pricelist = prices(parsed)

    for div in parsed.xpath(".//div"):
        if "prodDesc" not in div.attrib.get("id", ""):
            continue

        textbook = div.text_content()
        # The author is whatever follows the first ":" in the first
        # span.inline of the enclosing element, with commas removed
        author_span = div.getparent().xpath(".//span[@class='inline']")[0]
        author = sub(r',', '',
                     "".join(author_span.text_content().split(":")[1:]).strip())

        # The page can yield fewer prices than books (prices() returns []
        # when it cannot pair them up); popping from the empty list would
        # abort the whole listing with IndexError
        price = pricelist.pop() if pricelist else None

        if withPrices:
            yield (normwords(textbook), normwords(author), repr(price))
        else:
            yield (normwords(textbook), normwords(author))
||||
|
|
||||
|
def prices(html):
    """
    Collect the prices shown on a search result page

    Takes the first whitespace-separated word of the parent of every
    checkbox inside a <p>, pairs consecutive words up as (amount, status),
    reverses the pairs and turns each into a Price. Returns [] when that
    raises ValueError (for instance when there are no checkboxes).
    """
    words = []
    for box in html.xpath("//p/input[@type='checkbox']"):
        words.append(box.getparent().text_content().split()[0])

    try:
        pairs = list(grouper(2, words))
        pairs.reverse()
        amts, stats = zip(*pairs)
        return map(Price, amts, stats)
    except ValueError:
        return []
||||
|
|
||||
|
def textbookInfo(dept, code, withPrices=False):
    """
    Return the list of all textbooks books() yields for a course
    """
    return [book for book in books(dept, code, withPrices)]
||||
|
|
||||
|
def humanities():
    """
    Download humanities syllabi

    Placeholder: always returns an empty list.
    """
    return list()
||||
|
|
||||
|
# Example: getting the textbook info for Personality Theory (PSYCH = department, 2B03 = course code)
# print textbookInfo("PSYCH", "2B03")
@ -0,0 +1,24 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from urllib import quote |
||||
|
from json import loads, dumps |
||||
|
|
||||
|
import requests as req |
||||
|
|
||||
|
#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s" |
||||
|
# Open Library search endpoint; the %s slots receive the URL-quoted author
# and title, in that order (see bookUrls)
searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s'
||||
|
|
||||
|
def bookUrls(title, author):
    """
    Yield Open Library book URLs for a title/author pair

    Anything after the first ":" in the title is ignored. Only the first
    two search hits are considered, and a hit contributes a URL (built from
    its first edition key) only when it has an "edition_key" entry.
    """
    print("%s %s" % (title, author))
    short_title = title.partition(":")[0]
    requrl = searchurl % (quote(author), quote(short_title))
    results = loads(req.get(requrl).text)
    for hit in results["docs"][0:2]:
        if "edition_key" in hit:
            yield "https://openlibrary.org/books/%s" % hit["edition_key"][0]
||||
|
|
||||
|
# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle' |
||||
|
|
||||
|
#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"): |
||||
|
#print book |
@ -0,0 +1,153 @@ |
|||||
|
##! /usr/bin/python2 |
||||
|
from itertools import groupby, chain |
||||
|
from sys import stdout |
||||
|
from functools import partial |
||||
|
from json import dumps |
||||
|
|
||||
|
def gensymer():
    """
    Create a symbol generator

    Each call of the returned function gives the next number as a string,
    starting with "0"; every generator counts on its own.
    """
    state = {"next": 0}

    def inner():
        symbol = str(state["next"])
        state["next"] += 1
        return symbol

    return inner
||||
|
|
||||
|
# Module-wide symbol generator; printTrie uses it to name graph nodes
gensym = gensymer()
||||
|
|
||||
|
def printTrie(graph, prev, trie, weight):
    """
    Draw trie and everything below it into graph

    A new node labelled with trie.letter is added and connected to the node
    named prev by an edge labelled with weight (two decimals); the children
    follow recursively, each with its weight from trie.ws. graph needs
    node(name, label) and edge(tail, head, label=...) methods -- presumably
    a graphviz graph; confirm.
    """
    node_id = str(gensym())
    graph.node(node_id, "%s" % trie.letter)
    graph.edge(prev, node_id, label="%.2f" % weight)
    if trie.children:
        for child, child_weight in zip(trie.children, trie.ws):
            printTrie(graph, node_id, child, child_weight)
||||
|
|
||||
|
|
||||
|
class Trie(object):
    """
    A node of the prefix tree: its letter, its child nodes and the list of
    weights (ws) that accompanies the children
    """
    def __init__(self, letter, children, ws):
        self.ws = ws
        self.children = children
        self.letter = letter
||||
|
|
||||
|
def probweight(suffixes):
    """
    Turn the "value" of every entry in suffixes into its share of the
    total, returned as a list of floats in the same order
    """
    values = [float(entry["value"]) for entry in suffixes]
    total = float(sum(values))
    return [value / total for value in values]
||||
|
|
||||
|
def buildtrie(trie, suffixes):
    """
    Build a trie, also known as a prefix tree, of all the possible completions

    suffixes is the output of partition(): a list of (letter, entries)
    pairs, where entries are the key/value dicts remaining after that
    letter. trie.children is replaced with one node per pair and trie itself
    is returned. Each node built here gets one weight per child in ws, in
    the same order as its children.
    """
    trie.children = []
    for letter, suffs in suffixes:
        groups = partition(suffs)
        if any(next_letter for next_letter, _ in groups):
            # There is at least one more letter below this one.
            # Weigh every child by the share of this letter's total value
            # that falls under it. One weight per *word* (what
            # probweight(suffs) gives) lines up with the children only when
            # every next letter occurs exactly once.
            totals = [sum(float(entry["value"]) for entry in tail)
                      for _, tail in groups]
            whole = float(sum(totals))
            if whole:
                weights = [total / whole for total in totals]
            else:
                # All values are zero: treat the children as equally likely
                weights = [1.0 / len(groups)] * len(groups)
            trie.children.append(buildtrie(Trie(letter, [], weights), groups))
        else:
            # we've reached the end of this word so just include the final letter
            # [1] = there is a probability of 1 of reaching this single leaf node,
            # since it is the only possible completion here
            trie.children.append(Trie(letter, [], [1]))
    return trie
||||
|
|
||||
|
|
||||
|
def keyf(x):
    """
    Grouping key of an entry: the first character of its "key", or ""
    when the key is empty
    """
    key = x["key"]
    return key[0] if key else ""
||||
|
|
||||
|
def tails(words):
    """
    Yield a copy of every entry with the first character of its "key"
    removed; "value" is carried over unchanged
    """
    for entry in words:
        remainder = entry["key"][1:]
        yield {"key": remainder, "value": entry["value"]}
||||
|
|
||||
|
def partition(words):
    """
    Group the entries by the first character of their key

    Returns a list of (character, tails) pairs ordered by character, where
    tails are the entries of that group with the first character removed.
    """
    ordered = sorted(words, key=keyf)
    groups = []
    for first, members in groupby(ordered, key=keyf):
        groups.append((first, list(tails(members))))
    return groups
||||
|
|
||||
|
|
||||
|
def flatten_helper(letter, trie):
    """
    Return (prefixes, children): letter prepended to the letter of every
    child of trie, together with the children themselves
    """
    kids = trie.children
    prefixes = []
    for child in kids:
        prefixes.append(letter + child.letter)
    return (prefixes, kids)
||||
|
|
||||
|
def flatten(trie):
    """
    Spell out every path below trie

    A node without children gives its letter. Any other node gives a
    (possibly nested) list with the result for each child, each one
    prefixed with the letters accumulated so far.
    """
    if not trie.children:
        return trie.letter
    prefixes, suffixes = flatten_helper(trie.letter, trie)
    spelled = []
    for prefix, node in zip(prefixes, suffixes):
        spelled.append(flatten(Trie(prefix, node.children, node.ws)))
    return spelled
||||
|
|
||||
|
def flattenlist(xs):
    """
    Flatten arbitrarily nested lists into one flat list, keeping the order
    """
    flat = []
    for item in xs:
        if isinstance(item, list):
            flat.extend(flattenlist(item))
        else:
            flat.append(item)
    return flat
||||
|
|
||||
|
def matchc(trie, prefix):
    """
    Return the children of trie whose letter equals the first character of
    prefix (or prefix itself when it is at most one character long)
    """
    wanted = prefix[0] if len(prefix) > 1 else prefix
    found = []
    for child in trie.children:
        if child.letter == wanted:
            found.append(child)
    return found
||||
|
|
||||
|
def match(trie, word):
    """
    Walk down trie along the characters of word

    Returns the list of nodes visited, one per matched character; the walk
    stops at the first character without a matching child.
    """
    path = []
    node = trie
    for ch in word:
        found = matchc(node, ch)
        if not found:
            break
        node = found[0]
        path.append(node)
    return path
||||
|
|
||||
|
def complete(trie, word):
    """
    Return the completions of word found in trie

    The result is a JSON-encoded list of at most 10 completions, or False
    when word is empty or is not a prefix of anything in the trie.
    """
    m = match(trie, word)
    # An empty word matches no node, so there is nothing to expand
    # (m[-1] would raise IndexError); a shorter match means word is not
    # in the trie
    if not m or len(word) != len(m):
        return False
    # Every flattened entry starts with the letter of the last matched
    # node, which word already ends with
    completions = [word+x[1:] for x in flattenlist(flatten(m[-1]))]
    return dumps(completions[0:10])
||||
|
|
||||
|
def sortTrie(trie):
    """
    Reorder the children of every node by descending weight

    children and ws are sorted together, in place, so that the child with
    the highest weight comes first; the same is then done for every child.
    """
    if not trie.children:
        return
    ranked = sorted(zip(trie.children, trie.ws),
                    key=lambda pair: pair[1],
                    reverse=True)
    trie.children = [child for child, _ in ranked]
    trie.ws = [weight for _, weight in ranked]
    for child in trie.children:
        sortTrie(child)
||||
|
|
||||
|
def toTrie(words):
    """
    Build the sorted completion trie for a list of key/value entries

    The "key" of every entry is lower-cased in place first. The root has an
    empty letter and a weight of 1 for each of its children.
    """
    for entry in words:
        entry["key"] = entry["key"].lower()
    root = Trie("", [], [1])
    buildtrie(root, partition(words))
    root.ws = [1] * len(root.children)
    sortTrie(root)
    return root
||||
|
|
||||
|
def testkey(w): |
||||
|
return { |
||||
|
"key" : w, |
||||
|
"value" : "1" |
||||
|
} |
@ -0,0 +1,237 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
import elasticsearch |
||||
|
|
||||
|
from elasticsearch_dsl import FacetedSearch, Search, Q |
||||
|
from elasticsearch_dsl.aggs import Terms, DateHistogram |
||||
|
from sys import exit, stderr |
||||
|
from json import dumps, loads |
||||
|
from itertools import chain, imap |
||||
|
|
||||
|
from hashlib import sha1 |
||||
|
|
||||
|
from textbookExceptions import UnIndexable |
||||
|
|
||||
|
from mcmaster.classes import allCourses |
||||
|
|
||||
|
# Generic instance of elasticsearch right now: a client created with no
# arguments, shared by every function in this module
es = elasticsearch.Elasticsearch()
||||
|
|
||||
|
def summarize(text):
    """
    Shorten text to its first four space-separated words followed by "..";
    text of four words or fewer is returned unchanged
    """
    words = text.split(" ")
    if len(words) <= 4:
        return text
    return " ".join(words[:4]) + ".."
||||
|
|
||||
|
def sectionToJSON(section):
    """
    Reduce a section to a plain dict with its prof, sem and day
    """
    return dict(prof=section.prof, sem=section.sem, day=section.day)
||||
|
|
||||
|
def classToJSON(clss):
    """
    Reduce a class to a plain dict ready for indexing

    The result has the keys title, sections (one dict per section), dept,
    code and books (a list, empty when the class has no books).
    """
    doc = {
        "title" : clss.title,
        "sections" : [sectionToJSON(section) for section in clss.sections],
        "dept" : clss.dept,
        "code" : clss.code
    }
    # Read books exactly once: on mcmaster.classes.Class it is a property
    # that performs a fresh textbook lookup on every access
    books = clss.books
    doc["books"] = list(books) if books else []
    return doc
||||
|
|
||||
|
|
||||
|
def truncate(docid):
    """
    Cut a document id down to 12 digits

    Takes the first 12 characters of str(docid) and returns them as an
    integer. The id is expected to come from a hash of unique identifiers.
    """
    digits = str(docid)
    return int(digits[:12])
||||
|
|
||||
|
def hashsec(course):
    """
    Hash a course into a usable id

    The id is the SHA-1 digest, as an integer, of the course code, the
    title and the semester of the first section joined together. A missing
    code or title counts as "", and so does the semester of a course
    without sections. Raises UnIndexable when the course has neither a code
    nor a title.
    """
    code = course["code"] or ""
    title = course["title"] or ""
    if not (code or title):
        raise UnIndexable(course)

    # A course without sections has no semester to mix in. (Assigning a
    # placeholder into the empty list, as was done before, raises.)
    sections = course["sections"]
    sem = sections[0]["sem"] if sections else ""

    payload = code + title + sem
    if not isinstance(payload, bytes):
        # sha1 works on bytes; text is encoded explicitly
        payload = payload.encode("UTF-8")

    h = sha1()
    h.update(payload)
    return int(h.hexdigest(), 16)
||||
|
|
||||
|
def createIndex(name):
    """
    Create an elasticsearch index called name

    An index is like a schema in a regular database. After the index has
    been created, the mapping for the "course" document type is read from
    ./course.json and installed on it. Both responses are printed.
    """
    indices = elasticsearch.client.IndicesClient(es)

    print(indices.create(name))
    with open("./course.json", "r") as mapping:
        schema = loads(mapping.read())
        print(indices.put_mapping("course", schema, name))
||||
|
|
||||
|
def indexListing(course):
    """
    Index one course in the "oersearch" index as a "course" document

    The document id is hashsec(course) and the body is course itself, a
    dict such as

    {
        'books': [],
        'dept': 'COLLAB',
        'code': '2C03',
        'sections': [
            {'prof': 'Lisa Pender', 'sem': '2015/09/08 - 2015/12/08', 'day': 'Mo'},
            {'prof': 'Staff', 'sem': '2015/09/08 - 2015/12/08', 'day': 'Th'}
        ],
        'title': 'COLLAB 2C03 - Sociology I'
    }

    The response of the index call is printed.
    """
    courseID = hashsec(course)
    response = es.index(index="oersearch",
                        doc_type="course",
                        id=courseID,
                        body=course)
    print(response)
||||
|
|
||||
|
# For every course we index, we also create a resource for it |
||||
|
# This should be an idempotent operation because we're putting it in couchdb |
||||
|
# And we're using the id obtained from the hash function, so it should just update the document |
||||
|
# no need to delete anything |
||||
|
#try: |
||||
|
#courseDept = course[0]["title"].strip().split(" ")[0].strip() |
||||
|
#courseCode = course[0]["title"].strip().split(" ")[1].strip() |
||||
|
#print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode) |
||||
|
#print createResource(textbookInfo, course[0], courseDept, courseCode, courseID) |
||||
|
#except: |
||||
|
#print "Couldn't create the resource associated with %s" % course |
||||
|
|
||||
|
def termSearch(field):
    """
    Make a term search (exact match)

    Returns a function that turns a term into a "term" query on the
    sections.<field> field.
    """
    def t(term):
        criteria = {"sections."+field : term}
        return Q("term", **criteria)
    return t
||||
|
|
||||
|
def search(field):
    """
    Make a match search

    Returns a function that turns a term into a "match" query on field.
    """
    def s(term):
        criteria = {field : term}
        return Q("match", **criteria)
    return s
||||
|
|
||||
|
def join(x, y):
    """
    Combine two queries with the & operator, so that both have to hold
    """
    both = x & y
    return both
||||
|
|
||||
|
def filterSections(secs):
    """
    Drop the sections whose instructor contains "Staff"

    Those are almost always tutorials -- a heuristic, of course. Returns
    the remaining sections of secs.sections, or False when none are left.
    """
    kept = []
    for section in secs.sections:
        if "Staff" in section.prof:
            continue
        kept.append(section)
    if kept:
        return kept
    return False
||||
|
|
||||
|
def searchTerms(terms):
    """
    Run a search for courses

    terms maps a field name to the text to look for in that field.
    Fields with an empty value, and fields which have no entry in
    searchers, are ignored.

    Returns a JSON encoded list with one entry per matching course:
    the fields of its first section not taught by "Staff", plus
    "id", "title", "books" and, when the title doesn't already
    contain it, "dept". Courses with only "Staff" sections are left out.
    """

    # A list of all the queries we want to run
    qs = [searchers[field](term)
          for field, term in terms.items()
          if term and field in searchers]

    if not qs:
        # No queries = no results
        return dumps([])

    # Reduce joins all of the queries into one query
    # It will search for the conjunction of all of them
    # So that means it cares about each query equally
    q = reduce(join, qs)

    s = (Search(using=es, index="oersearch")
         .query(q))[0:100]  # only return up to 100 results for now

    results = []
    for obj in s.execute():
        # get rid of tutorials (filter once per course, not twice)
        sections = filterSections(obj)
        if not sections:
            continue
        secs = sections[0].to_dict()

        # Add the truncated course id
        # This is used to point to the resource page for that course
        secs["id"] = truncate(obj.meta.id)
        secs["title"] = obj.title
        if obj["dept"] not in secs["title"]:
            secs["dept"] = obj.dept
        if obj.books:
            # The text is passed on as is: dumps escapes non-ASCII
            # characters itself, whereas encoding to ASCII here made the
            # whole search fail on a single accented title or author
            secs["books"] = [
                {
                    "booktitle": summarize(book[0]),
                    "bookauthor": book[1],
                    "bookprice": book[2]
                }
                for book in obj.books
            ]
        else:
            secs["books"] = ""
        results.append(secs)

    return dumps(results)
||||
|
|
||||
|
|
||||
|
# Maps each searchable field name to the function building its match query
# NOTE(review): search() queries the bare field name while termSearch()
# prefixes "sections." -- confirm against the index mapping that
# bare names such as "prof" and "day" are what should be matched
searchers = dict(
    (field, search(field))
    for field in ("title", "loc", "time", "prof", "day")
)
||||
|
|
||||
|
#print searchTerms({"title" : "PHILOS"}) |
||||
|
|
||||
|
#for c in imap(classToJSON, allCourses()): |
||||
|
#try: |
||||
|
#print indexListing(c) |
||||
|
#except UnIndexable as e: |
@ -0,0 +1,34 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from urllib import quote |
||||
|
from json import loads, dumps |
||||
|
|
||||
|
import requests as req |
||||
|
|
||||
|
searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw" |
||||
|
|
||||
|
def searchIA(title, author):
    """
    Do a search on The Internet Archive for a book

    Returns the details page URLs of at most the first three results.
    Returns an empty list when nothing was found or when the response
    could not be parsed.
    """
    print("running a search")
    requrl = searchUrl.format(quote(title + " " + author))
    try:
        # The response is JSONP, so strip the leading "callback(" (9
        # characters) and the trailing ")" before parsing it
        results = loads(req.get(requrl).text[9:-1])
    except ValueError:
        return []

    # Look at the documents themselves: responseHeader.params.rows only
    # echoes the page size asked for in the URL, not the number of hits
    docs = results["response"]["docs"]
    if not docs:
        print("Couldn't find results for %s %s" % (title, author))
        return []
    return ["https://archive.org/details/%s" % doc["identifier"]
            for doc in docs[0:3]]
||||
|
|
||||
|
|
||||
|
# Example, search for David Hume's Enquiry Concerning Human Understanding |
||||
|
#for url in searchIA("Hume", "Enquiry Concerning Human Understanding"): |
||||
|
#print url |
@ -0,0 +1,62 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from sys import argv |
||||
|
from hashlib import sha1 |
||||
|
|
||||
|
def truncate(docid):
    """
    Shorten a document id to its first 12 digits

    The document ID should be based on a
    hash of unique identifiers
    """
    digits = str(docid)
    return int(digits[:12])
||||
|
|
||||
|
def createResource(textbookInfo, course, dept, coursecode, docid):
    """
    Create a document associated with a course
    This document contains any/all resources associated
    with that course

    textbookInfo is called with the stripped department and course code
    and its result is stored under "textbooks". course is stored as is
    under "courseinfo", for example,
    {
     'books': [],
     'dept': 'COLLAB',
     'code': '2C03',
     'sections': [
       {
        'prof': 'Lisa Pender',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Mo'
       },
       {
        'prof': 'Staff',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Th'
       }
      ],
     'title': 'COLLAB 2C03 - Sociology I'
    }

    Returns whatever localdb.save returns, or None (after printing a
    message) when the save raises ResourceConflict
    """
    deptName = dept.strip()
    code = coursecode.strip()
    textbooks = textbookInfo(deptName, code)

    # We truncate the id so we can have nicer looking URLs
    # Since the id will be used to point to the resource page for that course
    _id = str(truncate(docid))

    fields = {
        "_id": _id,
        "textbooks": textbooks,
        "coursetitle": "%s %s" % (deptName, code),
        "courseinfo": course
        #"Syllabus" : "blah"
    }
    try:
        revisions = list(localdb.revisions(_id))
        if revisions:
            # The document already exists: attach the "_rev" of the first
            # listed revision so that the save updates it
            fields["_rev"] = dict(revisions[0])["_rev"]
        return localdb.save(fields)
    except ResourceConflict:
        print("Resource for %s already exists, not creating a new one" % (docid))
@ -0,0 +1,14 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
# predictive data |
||||
|
# switch to elasticsearch's prediction |
||||
|
|
||||
|
|
||||
|
|
||||
|
import database |
||||
|
import predictions |
||||
|
|
||||
|
class GOASearch(object):
    """
    Search object; it currently has no attributes and no behaviour
    """
    def __init__(self):
        # __init__ has to return None: returning self made every
        # GOASearch() call raise a TypeError
        pass
||||
|
|
@ -0,0 +1,24 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from urllib import quote |
||||
|
from json import loads, dumps |
||||
|
|
||||
|
import requests as req |
||||
|
|
||||
|
#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s" |
||||
|
searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s' |
||||
|
|
||||
|
def bookUrls(title, author):
    """
    Search Open Library for a book and yield the URLs of its editions

    Only the part of the title before the first ":" is searched for.
    At most the first two results are looked at, and a result yields
    the URL of its first edition when it lists any.
    """
    print("%s %s" % (title, author))
    # Drop everything from the first colon on (a no-op without a colon)
    mainTitle = title.split(":")[0]
    requrl = searchurl % (quote(author), quote(mainTitle))
    results = loads(req.get(requrl).text)
    for result in results["docs"][0:2]:
        # Skip results without the key as well as those with an empty list
        editions = result.get("edition_key")
        if editions:
            yield "https://openlibrary.org/books/%s" % editions[0]
||||
|
|
||||
|
# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle' |
||||
|
|
||||
|
#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"): |
||||
|
#print book |
@ -0,0 +1,153 @@ |
|||||
|
##! /usr/bin/python2 |
||||
|
from itertools import groupby, chain |
||||
|
from sys import stdout |
||||
|
from functools import partial |
||||
|
from json import dumps |
||||
|
|
||||
|
def gensymer():
    """
    Make a generator of unique names

    Returns a function which returns "0", "1", "2", ... on
    successive calls; every gensymer() call starts a fresh count
    """
    # Kept in a mutable container so that the closure can update it
    state = {"next": 0}
    def inner():
        current = state["next"]
        state["next"] = current + 1
        return str(current)
    return inner

# The module wide name generator
gensym = gensymer()
||||
|
|
||||
|
def printTrie(graph, prev, trie, weight):
    """
    Draw a trie on a graph, recursively

    Adds a node labelled with the letter of trie, and an edge from
    prev to it labelled with weight (two decimals), then does the same
    for every child paired with its weight in trie.ws
    """
    nodeName = str(gensym())
    graph.node(nodeName, "%s" % trie.letter)
    graph.edge(prev, nodeName, label="%.2f" % weight)
    if trie.children:
        for child, childWeight in zip(trie.children, trie.ws):
            printTrie(graph, nodeName, child, childWeight)
||||
|
|
||||
|
|
||||
|
class Trie(object):
    """
    A node of a trie (prefix tree)
    """
    def __init__(self, letter, children, ws):
        # ws holds the weights which get paired up with children
        self.ws = ws
        # children is the list of nodes below this one
        self.children = children
        # letter is the text carried by this node
        self.letter = letter
||||
|
|
||||
|
def probweight(suffixes):
    """
    Normalize the "value" of every suffix so that they sum to one

    Returns the list of each value divided by the total of all values
    """
    values = []
    for suffix in suffixes:
        values.append(float(suffix["value"]))
    total = float(sum(values))
    return [value / total for value in values]
||||
|
|
||||
|
def buildtrie(trie, suffixes):
    """
    Build a trie, also known as a prefix tree, of all the possible completions

    suffixes is a list of (letter, words) pairs as made by partition.
    The children of trie are replaced by one node per pair, and trie
    itself is returned.
    """
    trie.children = []
    for letter, suffs in suffixes:
        # Partition once and use it both for the check and the recursion
        ped = partition(suffs)
        if any(first for first, _ in ped):
            # check if there are any children
            # NOTE(review): probweight gives one weight per word in suffs
            # while the node gets one child per group of ped, so the
            # weights may not line up with the children -- confirm
            node = Trie(letter, [], probweight(suffs))
            trie.children.append(buildtrie(node, ped))
        else:
            # we've reached the end of this word so just include the final letter
            # [1] = there is a probability of 1 of reaching this single leaf node,
            # since it is the only possible completion here
            trie.children.append(Trie(letter, [], [1]))
    return trie
||||
|
|
||||
|
|
||||
|
def keyf(x):
    """
    Get the first character of the "key" of x, or "" when it is empty
    """
    key = x["key"]
    return key[0] if key else ""
||||
|
|
||||
|
def tails(words):
    """
    Yield a copy of every word with the first character of its key dropped
    """
    for entry in words:
        rest = entry["key"][1:]
        yield dict(key=rest, value=entry["value"])
||||
|
|
||||
|
def partition(words):
    """
    Group the words by the first character of their key

    Returns a list of (character, tails of the words starting with it)
    pairs, ordered by character
    """
    # groupby only merges neighbours, so sort by the same key first
    ordered = sorted(words, key=keyf)
    groups = []
    for first, members in groupby(ordered, key=keyf):
        groups.append((first, list(tails(members))))
    return groups
||||
|
|
||||
|
|
||||
|
def flatten_helper(letter, trie):
    """
    Prefix the letter of every child of trie with letter

    Returns a pair of the list of prefixed letters and the children
    """
    kids = trie.children
    words = []
    for kid in kids:
        words.append(letter + kid.letter)
    return (words, kids)
||||
|
|
||||
|
def flatten(trie):
    """
    Spell out the words stored below a node

    Returns the letter of trie if it has no children, otherwise a list
    with one entry per child: the text accumulated down to a leaf, or
    a nested list of the same shape for a child with children
    """
    def walk(prefix, node):
        # prefix is the text accumulated on the way down to node
        if not node.children:
            return prefix
        return [walk(prefix + kid.letter, kid) for kid in node.children]
    return walk(trie.letter, trie)
||||
|
|
||||
|
def flattenlist(xs):
    """
    Flatten arbitrarily nested lists into a single list, keeping the order
    """
    flat = []
    for item in xs:
        if isinstance(item, list):
            flat.extend(flattenlist(item))
        else:
            flat.append(item)
    return flat
||||
|
|
||||
|
def matchc(trie, prefix):
    """
    Find the children of trie whose letter is the start of prefix

    A prefix longer than one character is cut down to its first
    character, anything else is compared as is
    """
    wanted = prefix[0] if len(prefix) > 1 else prefix
    found = []
    for child in trie.children:
        if child.letter == wanted:
            found.append(child)
    return found
||||
|
|
||||
|
def match(trie, word):
    """
    Follow word through the trie, one character at a time

    Returns the list of nodes visited, stopping at the first character
    without a matching child; it is as long as word only when the
    whole of word was found
    """
    path = []
    node = trie
    for character in word:
        hits = matchc(node, character)
        if not hits:
            break
        # Several children can match, only the first one is followed
        node = hits[0]
        path.append(node)
    return path
||||
|
|
||||
|
def complete(trie, word):
    """
    Find the completions of word in the trie

    Returns a JSON encoded list of at most 10 completions, or False
    when word is empty or is not a prefix of anything in the trie
    """
    if not word:
        # Nothing to complete: without this an empty word got past the
        # length check below and failed on m[-1] of an empty list
        return False
    m = match(trie, word)
    if len(word) != len(m):
        return False
    # Every flattened entry starts with the letter of the last matched
    # node, which word already ends with, so that letter is dropped
    completions = [word + x[1:] for x in flattenlist(flatten(m[-1]))]
    return dumps(completions[0:10])
||||
|
|
||||
|
def sortTrie(trie):
    """
    Sort the children of each node in descending order
    of their weight, that is of the probability that each child
    would be the completion of whatever that word is

    The trie is changed in place and nothing is returned
    """
    if trie.children:
        ranked = sorted(zip(trie.children, trie.ws),
                        key=lambda pair: pair[1],
                        reverse=True)
        trie.children = [node for node, _ in ranked]
        trie.ws = [w for _, w in ranked]
        for node in trie.children:
            sortTrie(node)
||||
|
|
||||
|
def toTrie(words):
    """
    Build a sorted trie out of a list of words

    Every word is a dictionary with a "key" and a "value"; the keys
    are lowercased in place before the trie is built
    """
    for entry in words:
        entry["key"] = entry["key"].lower()
    root = Trie("", [], [1])
    trie = buildtrie(root, partition(words))
    # Every child of the root gets the same weight
    trie.ws = [1] * len(trie.children)
    sortTrie(trie)
    return trie
||||
|
|
||||
|
def testkey(w): |
||||
|
return { |
||||
|
"key" : w, |
||||
|
"value" : "1" |
||||
|
} |
@ -0,0 +1,237 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
import elasticsearch |
||||
|
|
||||
|
from elasticsearch_dsl import FacetedSearch, Search, Q |
||||
|
from elasticsearch_dsl.aggs import Terms, DateHistogram |
||||
|
from sys import exit, stderr |
||||
|
from json import dumps, loads |
||||
|
from itertools import chain, imap |
||||
|
|
||||
|
from hashlib import sha1 |
||||
|
|
||||
|
from textbookExceptions import UnIndexable |
||||
|
|
||||
|
from mcmaster.classes import allCourses |
||||
|
|
||||
|
# Generic instance of elasticsearch right now |
||||
|
es = elasticsearch.Elasticsearch() |
||||
|
|
||||
|
def summarize(text):
    """
    Cut a text down to its first four space separated words

    A shortened text ends in "..", a text of at most four words
    is returned unchanged
    """
    words = text.split(" ")
    if len(words) <= 4:
        return text
    return " ".join(words[:4]) + ".."
||||
|
|
||||
|
def sectionToJSON(section):
    """
    Turn a section into a dictionary of its prof, sem and day
    """
    return dict(prof=section.prof, sem=section.sem, day=section.day)
||||
|
|
||||
|
def classToJSON(clss):
    """
    Turn a class into a dictionary of its title, sections, dept,
    code and books

    The sections go through sectionToJSON, and books is always a list
    """
    converted = {}
    converted["title"] = clss.title
    converted["sections"] = [sectionToJSON(section) for section in clss.sections]
    converted["dept"] = clss.dept
    converted["code"] = clss.code
    if clss.books:
        converted["books"] = list(clss.books)
    else:
        converted["books"] = []
    return converted
||||
|
|
||||
|
|
||||
|
def truncate(docid):
    """
    Shorten a document id to its first 12 digits

    The document ID should be based on a
    hash of unique identifiers
    """
    digits = str(docid)
    return int(digits[:12])
||||
|
|
||||
|
def hashsec(course):
    """
    Hash a course into a usable id

    The id is the SHA-1 of the course code, its title and the "sem" of
    its first section, as an integer. A missing code or title counts
    as an empty string.

    Raises UnIndexable when the course has neither a code nor a title,
    or when it has no sections.
    """
    code = course["code"] or ""
    title = course["title"] or ""

    if not (code or title):
        raise UnIndexable(course)

    sections = course["sections"]
    if not sections:
        # There is no semester to hash without a section. This used to
        # assign to sections[0], which itself fails on an empty list
        raise UnIndexable(course)

    data = code + title + sections[0]["sem"]
    if not isinstance(data, bytes):
        # Hash the UTF-8 bytes of unicode text. This gives the same id
        # as before for ASCII text and no longer fails on anything else
        data = data.encode("utf-8")

    h = sha1()
    h.update(data)
    return int(h.hexdigest(), 16)
||||
|
|
||||
|
def createIndex(name):
    """
    Create an elasticsearch index
    An index is like a schema in a regular database

    The index called name is created, then the "course" mapping read
    from ./course.json is put on it. Both responses are printed.
    """
    indices = elasticsearch.client.IndicesClient(es)

    created = indices.create(name)
    print(created)
    with open("./course.json", "r") as mapping:
        schema = loads(mapping.read())
        print(indices.put_mapping("course", schema, name))
||||
|
|
||||
|
def indexListing(course):
    """
    Index a specific course in the "oersearch" index, as a "course"
    document whose id is the hash of the course, and print the response

    example of a course,
    {
     'books': [],
     'dept': 'COLLAB',
     'code': '2C03',
     'sections': [
       {
        'prof': 'Lisa Pender',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Mo'
       },
       {
        'prof': 'Staff',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Th'
       }
      ],
     'title': 'COLLAB 2C03 - Sociology I'
    }

    """
    response = es.index(index="oersearch",
                        doc_type="course",
                        id=hashsec(course),
                        body=course)
    print(response)
||||
|
|
||||
|
# For every course we index, we also create a resource for it |
||||
|
# This should be an idempotent operation because we're putting it in couchdb |
||||
|
# And we're using the id obtained from the hash function, so it should just update the document |
||||
|
# no need to delete anything |
||||
|
#try: |
||||
|
#courseDept = course[0]["title"].strip().split(" ")[0].strip() |
||||
|
#courseCode = course[0]["title"].strip().split(" ")[1].strip() |
||||
|
#print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode) |
||||
|
#print createResource(textbookInfo, course[0], courseDept, courseCode, courseID) |
||||
|
#except: |
||||
|
#print "Couldn't create the resource associated with %s" % course |
||||
|
|
||||
|
def termSearch(field):
    """
    Build a term (exact match) query maker for a field of the sections

    Returns a function which takes the term to look for and
    returns a "term" query on "sections.<field>"
    """
    def makeQuery(term):
        # The field name is only known at run time, so it is passed
        # as an unpacked keyword dictionary
        criteria = {"sections." + field: term}
        return Q("term", **criteria)
    return makeQuery
||||
|
|
||||
|
def search(field):
    """
    Build a match query maker for a field

    Returns a function which takes the term to look for and
    returns a "match" query on that field
    """
    def makeQuery(term):
        # The field name is only known at run time, so it is passed
        # as an unpacked keyword dictionary
        criteria = {field: term}
        return Q("match", **criteria)
    return makeQuery
||||
|
|
||||
|
def join(x, y):
    """
    Combine two queries with the & operator
    """
    combined = x & y
    return combined
||||
|
|
||||
|
def filterSections(secs):
    """
    Drop the sections taught by "Staff"

    Those are almost always tutorials, although that is only a heuristic
    Returns the list of remaining sections, or False if none remain
    """
    kept = []
    for section in secs.sections:
        if "Staff" in section.prof:
            continue
        kept.append(section)
    return kept if kept else False
||||
|
|
||||
|
def searchTerms(terms):
    """
    Run a search for courses

    terms maps a field name to the text to look for in that field.
    Fields with an empty value, and fields which have no entry in
    searchers, are ignored.

    Returns a JSON encoded list with one entry per matching course:
    the fields of its first section not taught by "Staff", plus
    "id", "title", "books" and, when the title doesn't already
    contain it, "dept". Courses with only "Staff" sections are left out.
    """

    # A list of all the queries we want to run
    qs = [searchers[field](term)
          for field, term in terms.items()
          if term and field in searchers]

    if not qs:
        # No queries = no results
        return dumps([])

    # Reduce joins all of the queries into one query
    # It will search for the conjunction of all of them
    # So that means it cares about each query equally
    q = reduce(join, qs)

    s = (Search(using=es, index="oersearch")
         .query(q))[0:100]  # only return up to 100 results for now

    results = []
    for obj in s.execute():
        # get rid of tutorials (filter once per course, not twice)
        sections = filterSections(obj)
        if not sections:
            continue
        secs = sections[0].to_dict()

        # Add the truncated course id
        # This is used to point to the resource page for that course
        secs["id"] = truncate(obj.meta.id)
        secs["title"] = obj.title
        if obj["dept"] not in secs["title"]:
            secs["dept"] = obj.dept
        if obj.books:
            # The text is passed on as is: dumps escapes non-ASCII
            # characters itself, whereas encoding to ASCII here made the
            # whole search fail on a single accented title or author
            secs["books"] = [
                {
                    "booktitle": summarize(book[0]),
                    "bookauthor": book[1],
                    "bookprice": book[2]
                }
                for book in obj.books
            ]
        else:
            secs["books"] = ""
        results.append(secs)

    return dumps(results)
||||
|
|
||||
|
|
||||
|
# Maps each searchable field name to the function building its match query
# NOTE(review): search() queries the bare field name while termSearch()
# prefixes "sections." -- confirm against the index mapping that
# bare names such as "prof" and "day" are what should be matched
searchers = dict(
    (field, search(field))
    for field in ("title", "loc", "time", "prof", "day")
)
||||
|
|
||||
|
#print searchTerms({"title" : "PHILOS"}) |
||||
|
|
||||
|
#for c in imap(classToJSON, allCourses()): |
||||
|
#try: |
||||
|
#print indexListing(c) |
||||
|
#except UnIndexable as e: |
@ -0,0 +1,24 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
class UnIndexable(Exception):
    """
    Raised for a course which can't be indexed

    The offending course is kept in the course attribute
    """
    def __init__(self, course):
        self.course = course

    @property
    def reason(self):
        """
        A description of the problem, with a guess at what is missing
        """
        course = self.course
        # Exactly one branch is taken, most specific first. These used
        # to be separate ifs, so later ones overwrote the combined message
        # and a course with nothing missing left message undefined
        if not course["code"] and not course["title"]:
            message = "there was no course code and no title defined"
        elif not course["code"]:
            message = "there was no course code defined"
        elif not course["title"]:
            message = "there was no course title defined"
        elif not course["sections"]:
            message = "there were no sections defined"
        else:
            message = "the cause could not be determined"
        return """
There was a problem with indexing this course.
%s
There could be several reasons why, my best guess is that %s
We need at least the course code, title, and one or more sections to index

""" % (course, message)
@ -0,0 +1,97 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from json import loads, load |
||||
|
from re import sub, split |
||||
|
from itertools import groupby |
||||
|
from numpy import mean |
||||
|
from operator import attrgetter |
||||
|
|
||||
|
import pygal |
||||
|
import csv |
||||
|
|
||||
|
class Textbook(object):
    """
    A textbook used by a course; the price is stored as a float
    """
    def __init__(self, dept, code, title, author, price):
        self.price = float(price)
        self.author = author
        self.title = title
        self.code = code
        self.dept = dept

    def __repr__(self):
        details = (self.dept, self.code, self.title, self.author, self.price)
        return "Dept = %s, Code = %s, %s by %s, costs $%s" % details
||||
|
|
||||
|
|
||||
|
def courses():
    """
    Yield every row of ./books.csv as a list of strings

    NOTE(review): groupDept reads a dept attribute off what it is
    given, which these rows don't have -- confirm whether Textbook
    objects were meant to be yielded here
    """
    with open("./books.csv", "r") as books:
        for row in csv.reader(books):
            yield row
||||
|
|
||||
|
|
||||
|
def groupDept(courselist):
    """
    Group courses by their dept attribute

    Yields (dept, list of the courses of that dept) pairs, ordered by dept
    """
    byDept = attrgetter("dept")
    # groupby only merges neighbours, so sort by department first
    ordered = sorted(courselist, key=byDept)
    for dept, members in groupby(ordered, byDept):
        yield dept, list(members)
||||
|
|
||||
|
def meanPrice(books):
    """
    Get the mean of the price of the books
    """
    prices = []
    for book in books:
        prices.append(book.price)
    return mean(prices)
||||
|
|
||||
|
# Questions, |
||||
|
# mean cost per department |
||||
|
# mean cost per faculty |
||||
|
# mean difference between book store copies and other copies per dept and faculty |
||||
|
# number of overlapping books per faculty, do eng students benefit from that? |
||||
|
|
||||
|
# maybe a survey for students to see how often they buy books from other sources |
||||
|
# correlate with how much they could be saving? |
||||
|
|
||||
|
# The full name of each faculty, by its short name
facultyDesc = {
    "hum" : "Humanities",
    "bus" : "Business",
    "hlth" : "Health Science",
    "eng" : "Engineering",
    "sci" : "Science",
    "socsci" : "Social Sciences",
    "artsci" : "Arts & Sciences",
    "meld" : "MELD"
}

# Looked up by department in categorize, the result being used as a key
# of facultyDesc. Read inside a with block so that the file gets closed
with open("./faculties.json") as facultiesFile:
    faculties = load(facultiesFile)
||||
|
|
||||
|
def categorize(dept):
    """
    Get the full name of the faculty of a department

    Returns False when the department or its faculty is not known
    """
    facultyKey = faculties.get(dept, False)
    return facultyDesc.get(facultyKey, False)
||||
|
|
||||
|
def byFaculty():
    """
    Yield a (faculty, department, books) triple for every department
    """
    for group in groupDept(courses()):
        dept, books = group
        yield (categorize(dept), dept, books)
||||
|
|
||||
|
def meanFacultyCosts():
    """
    Render a bar graph of the mean textbook cost of every faculty

    The mean of a faculty is taken over the books of all of its
    departments. Returns the rendered graph.
    """
    byfac = list(byFaculty())
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by faculty"
    # groupby only merges neighbours, so sort by faculty first
    sortedFacs = sorted(byfac, key=lambda x: x[0])
    for faculty, entries in groupby(sortedFacs, lambda x: x[0]):
        # Pool the books of every department of the faculty; only the
        # first department used to be counted
        books = [book
                 for _, _, deptBooks in entries
                 for book in deptBooks]
        graph.add(faculty, meanPrice(books))
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render(transpose=True)
||||
|
|
||||
|
def meanCosts():
    """
    Render a table of the mean textbook cost of every department

    Returns the rendered table
    """
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by department"
    for dept, books in groupDept(courses()):
        graph.add(dept, meanPrice(books))
    #graph.render_to_file("./test_graph.svg")
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render_table(style=True, transpose=True)
||||
|
|
||||
|
# Print every row of the book list whenever this module is loaded
for x in courses():
    print(x)
||||
|
#print meanCosts() |
||||
|
#print meanFacultyCosts() |
@ -0,0 +1,148 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
from functools import partial |
||||
|
from couchdb import ResourceConflict |
||||
|
|
||||
|
from flask import Flask, render_template, flash, request, send_from_directory |
||||
|
from flask_bootstrap import Bootstrap |
||||
|
from flask_appconfig import AppConfig |
||||
|
from urllib import unquote |
||||
|
from search import searchTerms |
||||
|
|
||||
|
from openlibrary import bookUrls |
||||
|
|
||||
|
from archive import searchIA |
||||
|
from urllib import quote, unquote |
||||
|
from json import dumps, loads |
||||
|
|
||||
|
from werkzeug.contrib.cache import MemcachedCache |
||||
|
cache = MemcachedCache(['127.0.0.1:11211']) |
||||
|
|
||||
|
import os |
||||
|
|
||||
|
def predict(fieldtype, term):
    """
    Get the completions of term for a type of field

    Returns what the completer of that field type gives for the
    lowercased term, or "[]" when the term is empty, when a KeyError
    is raised (such as for an unknown field type), or when the
    completer gives nothing
    """
    print(fieldtype)
    print(term)
    nothing = "[]"
    if not term:
        return nothing
    try:
        cs = completers[fieldtype](term.lower())
    except KeyError:
        return nothing
    return cs if cs else nothing
||||
|
|
||||
|
def predictor(fieldtype):
    """
    Build a request handler which predicts completions for a field type

    The handler completes the "term" argument of the request
    """
    def inner(request):
        arguments = dict(request.args.items())
        term = arguments["term"]
        return predict(fieldtype, term)
    return inner
||||
|
|
||||
|
def cacheit(key, thunk):
    """
    Look up the cached value of ``key''
    When nothing is cached for it, thunk is called, its
    result (which must be iterable, such as a generator)
    is turned into a list and cached, and that list is returned
    """
    # Keys are quoted before they are handed over to the cache
    cacheKey = quote(key)
    cached = cache.get(cacheKey)
    if cached is not None:
        return cached
    result = list(thunk())
    cache.set(cacheKey, result)
    return result
||||
|
|
||||
|
def ClassSearch(configfile=None):
    """
    Build the Flask application of the course search site

    configfile is handed over to AppConfig. Returns the application
    with all of its routes registered.
    """
    # Parameters of /fc holding one of these values are ignored
    # (presumably the placeholder texts of the search form -- confirm)
    defaults = {"Day", "Building", "Exact Location", "Department"}
    app = Flask(__name__)
    AppConfig(app, configfile)  # Flask-Appconfig is not necessary, but
                                # highly recommend =)
                                # https://github.com/mbr/flask-appconfig
    Bootstrap(app)

    app.config["scripts"] = "/home/wes/MGOAL/scripts"
    app.config["styles"] = "/home/wes/MGOAL/styles"

    # NOTE(review): the first argument of send_from_directory is a
    # directory, but this path looks like the file itself -- confirm
    @app.route('/favicon.ico')
    def favicon():
        return send_from_directory("/srv/http/goal/favicon.ico",
                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')

    # NOTE(review): predictbuild, predictloc, predictday, predictdept and
    # predicttitle are not defined in the visible part of this file --
    # confirm where they come from (predictor() looks like their source)
    @app.route("/buildpred", methods=("GET", "POST"))
    def buildpred():
        return predictbuild(request)

    @app.route("/locpred", methods=("GET", "POST"))
    def locpred():
        return predictloc(request)

    @app.route("/daypred", methods=("GET", "POST"))
    def daypred():
        return predictday(request)

    @app.route("/deptpred", methods=("GET", "POST"))
    def deptpred():
        return predictdept(request)

    @app.route("/titlepred", methods=("GET", "POST"))
    def titlepred():
        return predicttitle(request)

    @app.route("/", methods=("GET", "POST"))
    def index():
        return render_template("search.html")

    @app.route("/fc", methods=("GET", "POST"))
    def fc():
        """ Filter Courses """
        print("trying to get courses")
        # Build the filtered dictionary in one go: deleting keys while
        # iterating over the dictionary raises a RuntimeError
        params = dict(
            (key, val)
            for key, val in request.args.items()
            if val not in defaults
        )
        results = searchTerms(params)
        return results

    @app.route("/resources", methods=("GET", "POST"))
    def resources():
        """ Get Resources """
        params = loads(dict(request.args.items())["data"])
        print(params)
        author = params["author"]
        title = params["title"]

        if ("No Textbooks" in title or
            "No Adoption" in title):
            return dumps("false")

        # Cache the result of the open library search
        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
        print(openlib)

        # cache the result of an internet archive search
        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
        print(iarchive)

        if not (any(openlib) or any(iarchive)):
            # We literally could not find ANYTHING
            return dumps("false")

        return dumps({
            "iarchive" : iarchive,
            "openlib" : openlib
        })

    @app.route("/scripts/<filename>")
    def send_script(filename):
        return send_from_directory(app.config["scripts"], filename)

    @app.route("/styles/<filename>")
    def send_style(filename):
        return send_from_directory(app.config["styles"], filename)
    return app
||||
|
|
||||
|
if __name__ == "__main__":
    # Serve the site on port 8001, with the debugger turned on
    ClassSearch().run(debug=True, port=8001)
@ -0,0 +1,24 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
class UnIndexable(Exception):
    """
    Raised for a course which can't be indexed

    The offending course is kept in the course attribute
    """
    def __init__(self, course):
        self.course = course

    @property
    def reason(self):
        """
        A description of the problem, with a guess at what is missing
        """
        course = self.course
        # Exactly one branch is taken, most specific first. These used
        # to be separate ifs, so later ones overwrote the combined message
        # and a course with nothing missing left message undefined
        if not course["code"] and not course["title"]:
            message = "there was no course code and no title defined"
        elif not course["code"]:
            message = "there was no course code defined"
        elif not course["title"]:
            message = "there was no course title defined"
        elif not course["sections"]:
            message = "there were no sections defined"
        else:
            message = "the cause could not be determined"
        return """
There was a problem with indexing this course.
%s
There could be several reasons why, my best guess is that %s
We need at least the course code, title, and one or more sections to index

""" % (course, message)
@ -0,0 +1,97 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
|
||||
|
from json import loads, load |
||||
|
from re import sub, split |
||||
|
from itertools import groupby |
||||
|
from numpy import mean |
||||
|
from operator import attrgetter |
||||
|
|
||||
|
import pygal |
||||
|
import csv |
||||
|
|
||||
|
class Textbook(object):
    """
    A textbook used by a course; the price is stored as a float
    """
    def __init__(self, dept, code, title, author, price):
        self.price = float(price)
        self.author = author
        self.title = title
        self.code = code
        self.dept = dept

    def __repr__(self):
        details = (self.dept, self.code, self.title, self.author, self.price)
        return "Dept = %s, Code = %s, %s by %s, costs $%s" % details
||||
|
|
||||
|
|
||||
|
def courses():
    """
    Yield every row of ./books.csv as a list of strings

    NOTE(review): groupDept reads a dept attribute off what it is
    given, which these rows don't have -- confirm whether Textbook
    objects were meant to be yielded here
    """
    with open("./books.csv", "r") as books:
        for row in csv.reader(books):
            yield row
||||
|
|
||||
|
|
||||
|
def groupDept(courselist):
    """
    Group courses by their dept attribute

    Yields (dept, list of the courses of that dept) pairs, ordered by dept
    """
    byDept = attrgetter("dept")
    # groupby only merges neighbours, so sort by department first
    ordered = sorted(courselist, key=byDept)
    for dept, members in groupby(ordered, byDept):
        yield dept, list(members)
||||
|
|
||||
|
def meanPrice(books):
    """
    Get the mean of the price of the books
    """
    prices = []
    for book in books:
        prices.append(book.price)
    return mean(prices)
||||
|
|
||||
|
# Questions, |
||||
|
# mean cost per department |
||||
|
# mean cost per faculty |
||||
|
# mean difference between book store copies and other copies per dept and faculty |
||||
|
# number of overlapping books per faculty, do eng students benefit from that? |
||||
|
|
||||
|
# maybe a survey for students to see how often they buy books from other sources |
||||
|
# correlate with how much they could be saving? |
||||
|
|
||||
|
# The full name of each faculty, by its short name
facultyDesc = {
    "hum" : "Humanities",
    "bus" : "Business",
    "hlth" : "Health Science",
    "eng" : "Engineering",
    "sci" : "Science",
    "socsci" : "Social Sciences",
    "artsci" : "Arts & Sciences",
    "meld" : "MELD"
}

# Looked up by department in categorize, the result being used as a key
# of facultyDesc. Read inside a with block so that the file gets closed
with open("./faculties.json") as facultiesFile:
    faculties = load(facultiesFile)
||||
|
|
||||
|
def categorize(dept):
    """
    Get the full name of the faculty of a department

    Returns False when the department or its faculty is not known
    """
    facultyKey = faculties.get(dept, False)
    return facultyDesc.get(facultyKey, False)
||||
|
|
||||
|
def byFaculty():
    """
    Yield a (faculty, department, books) triple for every department
    """
    for group in groupDept(courses()):
        dept, books = group
        yield (categorize(dept), dept, books)
||||
|
|
||||
|
def meanFacultyCosts():
    """
    Render a bar graph of the mean textbook cost of every faculty

    The mean of a faculty is taken over the books of all of its
    departments. Returns the rendered graph.
    """
    byfac = list(byFaculty())
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by faculty"
    # groupby only merges neighbours, so sort by faculty first
    sortedFacs = sorted(byfac, key=lambda x: x[0])
    for faculty, entries in groupby(sortedFacs, lambda x: x[0]):
        # Pool the books of every department of the faculty; only the
        # first department used to be counted
        books = [book
                 for _, _, deptBooks in entries
                 for book in deptBooks]
        graph.add(faculty, meanPrice(books))
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render(transpose=True)
||||
|
|
||||
|
def meanCosts():
    """
    Render a table of the mean textbook cost of every department

    Returns the rendered table
    """
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by department"
    for dept, books in groupDept(courses()):
        graph.add(dept, meanPrice(books))
    #graph.render_to_file("./test_graph.svg")
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render_table(style=True, transpose=True)
||||
|
|
||||
|
# Print every row of the book list whenever this module is loaded
for x in courses():
    print(x)
||||
|
#print meanCosts() |
||||
|
#print meanFacultyCosts() |
@ -0,0 +1,148 @@ |
|||||
|
#! /usr/bin/python2 |
||||
|
from functools import partial |
||||
|
from couchdb import ResourceConflict |
||||
|
|
||||
|
from flask import Flask, render_template, flash, request, send_from_directory |
||||
|
from flask_bootstrap import Bootstrap |
||||
|
from flask_appconfig import AppConfig |
||||
|
from urllib import unquote |
||||
|
from search import searchTerms |
||||
|
|
||||
|
from openlibrary import bookUrls |
||||
|
|
||||
|
from archive import searchIA |
||||
|
from urllib import quote, unquote |
||||
|
from json import dumps, loads |
||||
|
|
||||
|
from werkzeug.contrib.cache import MemcachedCache |
||||
|
# Module-wide cache client used by cacheit(); it is pointed at a
# memcached server on 127.0.0.1:11211
cache = MemcachedCache(['127.0.0.1:11211'])
||||
|
|
||||
|
import os |
||||
|
|
||||
|
def predict(fieldtype, term):
    """
    Return completions for a partially typed search term

    fieldtype -- key selecting which completer in ``completers'' to use
    term      -- the text typed so far; it is lower-cased before lookup

    Returns whatever the completer returns when that is truthy, and the
    string "[]" when the term is empty, a KeyError is raised during the
    lookup or the completer call, or the completer returns nothing.

    NOTE(review): ``completers'' is not defined in the visible part of
    this file -- confirm where it is supposed to come from.
    """
    print(fieldtype)
    print(term)

    if not term:
        return "[]"

    try:
        completions = completers[fieldtype](term.lower())
    except KeyError:
        return "[]"

    return completions if completions else "[]"
||||
|
|
||||
|
def predictor(fieldtype):
    """
    Build a request handler that predicts completions for one field type

    The returned function takes a request object, reads its "term"
    query argument and passes it to predict() with ``fieldtype''.
    A request without a "term" argument raises KeyError.
    """
    def inner(request):
        query = dict(request.args.items())
        term = query["term"]
        return predict(fieldtype, term)

    return inner
||||
|
|
||||
|
def cacheit(key, thunk):
    """
    Return the cached value for ``key'', computing and storing it on a miss

    key   -- cache key; text keys are UTF-8 encoded and the result is
             URL-quoted before being handed to the cache
    thunk -- zero-argument callable returning an iterable; it is only
             called when nothing is cached under the key

    Returns the cached value on a hit, otherwise list(thunk()) after
    storing that list in the cache.
    """
    # Python 2's urllib.quote raises KeyError for unicode strings that
    # contain non-ASCII characters (e.g. accented titles or authors
    # decoded from JSON), so always hand it a byte string
    if not isinstance(key, bytes):
        key = key.encode("utf-8")
    cacheKey = quote(key)

    cached = cache.get(cacheKey)
    if cached is not None:
        return cached

    result = list(thunk())
    cache.set(cacheKey, result)
    return result
||||
|
|
||||
|
def ClassSearch(configfile=None):
    """
    Build and return the Flask application for the class search site

    configfile -- optional configuration file handed to Flask-AppConfig

    Routes registered on the returned app:
      /favicon.ico        the site icon
      /buildpred, /locpred, /daypred, /deptpred, /titlepred
                          autocomplete predictions for one search field
      /                   the search page (search.html)
      /fc                 filter courses by the query parameters
      /resources          look up free copies of a textbook
      /scripts/<filename> static scripts from app.config["scripts"]
      /styles/<filename>  static styles from app.config["styles"]
    """
    # Query parameters whose value is one of these are dropped before
    # searching; presumably they are the placeholder texts of untouched
    # form fields -- confirm against search.html
    defaults = {"Day", "Building", "Exact Location", "Department"}
    app = Flask(__name__)
    AppConfig(app, configfile)  # Flask-Appconfig is not necessary, but
                                # highly recommend =)
                                # https://github.com/mbr/flask-appconfig
    Bootstrap(app)

    app.config["scripts"] = "/home/wes/MGOAL/scripts"
    app.config["styles"] = "/home/wes/MGOAL/styles"

    @app.route('/favicon.ico')
    def favicon():
        # send_from_directory() takes the containing directory and the
        # file name separately; passing the full file path as the
        # directory makes it look for .../favicon.ico/favicon.ico
        return send_from_directory("/srv/http/goal",
                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')

    # NOTE(review): predictbuild, predictloc, predictday, predictdept and
    # predicttitle are not defined in the visible part of this file;
    # presumably they are meant to be predictor(...) instances -- confirm
    @app.route("/buildpred", methods=("GET", "POST"))
    def buildpred():
        return predictbuild(request)

    @app.route("/locpred", methods=("GET", "POST"))
    def locpred():
        return predictloc(request)

    @app.route("/daypred", methods=("GET", "POST"))
    def daypred():
        return predictday(request)

    @app.route("/deptpred", methods=("GET", "POST"))
    def deptpred():
        return predictdept(request)

    @app.route("/titlepred", methods=("GET", "POST"))
    def titlepred():
        return predicttitle(request)

    @app.route("/", methods=("GET", "POST"))
    def index():
        return render_template("search.html")

    @app.route("/fc", methods=("GET", "POST"))
    def fc():
        """ Filter Courses """
        print("trying to get courses")
        # Build the filtered dict in one pass: deleting keys from a dict
        # while iterating over it raises RuntimeError
        params = {key: val
                  for key, val in request.args.items()
                  if val not in defaults}
        return searchTerms(params)

    @app.route("/resources", methods=("GET", "POST"))
    def resources():
        """ Get Resources """
        # The "data" query argument carries a JSON object holding at
        # least the "author" and "title" of the textbook
        params = loads(dict(request.args.items())["data"])
        print(params)
        author = params["author"]
        title = params["title"]

        # Placeholder titles mean there is nothing to look up
        if ("No Textbooks" in title or
                "No Adoption" in title):
            return dumps("false")

        # Cache the result of the open library search
        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
        print(openlib)

        # cache the result of an internet archive search
        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
        print(iarchive)

        if not (any(openlib) or any(iarchive)):
            # We literally could not find ANYTHING
            return dumps("false")

        return dumps({
            "iarchive" : iarchive,
            "openlib" : openlib
        })

    @app.route("/scripts/<filename>")
    def send_script(filename):
        return send_from_directory(app.config["scripts"], filename)

    @app.route("/styles/<filename>")
    def send_style(filename):
        return send_from_directory(app.config["styles"], filename)

    return app
||||
|
|
||||
|
# Start the development server on port 8001 when run as a script
if __name__ == "__main__":
    application = ClassSearch()
    application.run(port=8001, debug=True)
Loading…
Reference in new issue