24 changed files with 2062 additions and 8 deletions
@ -0,0 +1,34 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from urllib import quote |
|||
from json import loads, dumps |
|||
|
|||
import requests as req |
|||
|
|||
searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw" |
|||
|
|||
def searchIA(title, author):
    """
    Do a search on The Internet Archive for a book

    Queries the advanced-search endpoint with "<title> <author>" and
    returns a list of at most three https://archive.org/details/ URLs.
    Returns an empty list when the response is not parseable or holds
    no documents.
    """
    print("running a search")
    requrl = searchUrl.format(quote(title + " " + author))
    try:
        # The URL asks for JSONP (callback=callback), so drop the
        # 9-character "callback(" prefix and the closing parenthesis.
        results = loads(req.get(requrl).text[9:][0:-1])
    except ValueError:
        return []

    # responseHeader.params.rows only echoes the page size requested in
    # the URL (rows=50), so it cannot tell us whether anything matched.
    # Look at the documents that actually came back instead.
    docs = results.get("response", {}).get("docs", [])
    if not docs:
        print("Couldn't find results for %s %s" % (title, author))
        return []
    return ["https://archive.org/details/%s" % doc["identifier"]
            for doc in docs[0:3]]
|||
|
|||
|
|||
# Example, search for David Hume's Enquiry Concerning Human Understanding |
|||
#for url in searchIA("Enquiry Concerning Human Understanding", "Hume"): |
|||
#print url |
@ -0,0 +1,62 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from sys import argv |
|||
from hashlib import sha1 |
|||
|
|||
def truncate(docid):
    """
    Keep only the leading 12 characters of a document id

    The id is rendered as text, cut to its first 12 characters and
    turned back into an integer. The id is expected to be based on a
    hash of unique identifiers.
    """
    leading = str(docid)[:12]
    return int(leading)
|||
|
|||
def createResource(textbookInfo, course, dept, coursecode, docid):
    """
    Create a document associated with a course
    This document contains any/all resources associated
    with that course

    textbookInfo is called with the stripped department and course code
    to obtain the textbooks. The document is saved under the truncated
    docid; when revisions already exist, the first listed "_rev" is
    attached before saving. Returns whatever localdb.save returns, or
    None after a ResourceConflict.

    NOTE(review): localdb and ResourceConflict are not defined in the
    visible part of this file - confirm where they come from.

    example course,
    {
     'books': [],
     'dept': 'COLLAB',
     'code': '2C03',
     'sections': [
       {
        'prof': 'Lisa Pender',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Mo'
       },
       {
        'prof': 'Staff',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Th'
       }
      ],
     'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    department = dept.strip()
    code = coursecode.strip()
    textbooks = textbookInfo(department, code)

    # We truncate the id so we can have nicer looking URLs
    # Since the id will be used to point to the resource page for that course
    resource_id = str(truncate(docid))

    document = {
        "_id": resource_id,
        "textbooks": textbooks,
        "coursetitle": "%s %s" % (department, code),
        "courseinfo": course,
    }
    try:
        existing = list(localdb.revisions(resource_id))
        if existing:
            document["_rev"] = dict(existing[0])["_rev"]
        return localdb.save(document)
    except ResourceConflict:
        print("Resource for %s already exists, not creating a new one" % (docid))
@ -0,0 +1,14 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
# predictive data |
|||
# switch to elasticsearch's prediction |
|||
|
|||
|
|||
|
|||
import database |
|||
import predictions |
|||
|
|||
class GOASearch(object):
    """
    Search object; it currently holds no state.
    """

    def __init__(self):
        # __init__ has to return None: the previous "return self" made
        # every GOASearch() call raise TypeError.
        pass
|||
|
@ -0,0 +1,349 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from sys import argv |
|||
from itertools import chain, islice, izip as zip |
|||
from re import search, sub |
|||
from functools import total_ordering |
|||
|
|||
from sylla import textbookInfo |
|||
from collections import MutableMapping |
|||
|
|||
import datetime as dt |
|||
import lxml.html as lxh |
|||
import requests |
|||
import sys |
|||
import copy |
|||
|
|||
fall = "2159" |
|||
spring_summer = "2165" |
|||
winter = "2161" |
|||
|
|||
# threading stuff |
|||
import Queue as q |
|||
import threading as thd |
|||
|
|||
baseurl = "https://applicants.mcmaster.ca/psp/prepprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL" |
|||
|
|||
searchurl = "https://csprd.mcmaster.ca/psc/prcsprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL" |
|||
|
|||
custom_headers = { |
|||
"User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0", |
|||
"Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8", |
|||
} |
|||
|
|||
courseCodes1 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_SUBJ_SRCH%240&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=5tq9x%2Fjt42mf62Sh5z%2BrjxT0gT15kiIyQ2cecCSmRB4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}" |
|||
|
|||
courseCodes2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=SSR_CLSRCH_WRK2_SSR_ALPHANUM_{1}&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=vIUgl6ZXw045S07EPbQw4RDzv7NmKCDdJFdT4CTRQNM%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={2}" |
|||
|
|||
payload2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=%23ICSave&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}" |
|||
|
|||
payload = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&SSR_CLSRCH_WRK_SUBJECT$75$$0={1}&CLASS_SRCH_WRK2_STRM$45$={2}" |
|||
|
|||
|
|||
year = dt.date.today().year |
|||
month = dt.date.today().month |
|||
|
|||
days = { |
|||
"Mo" : 0, |
|||
"Tu" : 1, |
|||
"We" : 2, |
|||
"Th" : 3, |
|||
"Fr" : 4, |
|||
"Sa" : 5, |
|||
"Su" : 6 |
|||
} |
|||
|
|||
day_descs = { |
|||
"Mo" : "Monday Mon Mo", |
|||
"Tu" : "Tuesday Tues Tu Tue", |
|||
"We" : "Wednesday Wed We", |
|||
"Th" : "Thursday Th Thurs", |
|||
"Fr" : "Friday Fr Fri", |
|||
"Sa" : "Saturday Sat Sa", |
|||
"Su" : "Sunday Su Sun", |
|||
"T" : "TBA" |
|||
} |
|||
|
|||
def timeparse(time):
    """
    Parse a 12-hour clock string such as "9:30AM" or "10:05PM"

    Returns a tuple of strings (hour, minutes, half): hour is on the
    24-hour clock, minutes carries no leading zero, and half is the
    "AM"/"PM" marker taken from the input.
    """
    # The hour is one or two digits wide; splitting on the colon
    # handles both widths without checking the string length.
    hour_text, rest = time.split(":", 1)
    hour = int(hour_text)
    minutes = int(rest[0:2])
    half = rest[2:4]

    if half == "PM":
        if hour < 12:
            hour = hour + 12
    elif half == "AM" and hour == 12:
        # 12:xxAM is just after midnight, i.e. hour 0
        hour = 0

    return (str(hour), str(minutes), half)
|||
|
|||
class Class(object):
    """
    A course listing: its department, its title and its sections
    """

    def __init__(self, dept, title, sections):
        self.dept = dept
        self.sections = sections
        self.title = title.encode("UTF-8")

    def __repr__(self):
        pair = (self.title, self.sections)
        return repr(pair)

    def __iter__(self):
        # One (title, section) pair per section
        pairs = ((self.title, sec) for sec in self.sections)
        return iter(pairs)

    def hasCode(self):
        """
        True when the title has at least two space-separated words and
        the first two contain no lowercase letters
        """
        words = self.title.strip().split(" ")
        if len(words) < 2:
            return False
        first, second = words[0], words[1]
        return (first == first.upper()) and (second == second.upper())

    @property
    def code(self):
        """
        The second word of the title, or False when hasCode() fails
        """
        if not self.hasCode():
            return False
        return self.title.strip().split(" ")[1].strip()

    @property
    def books(self):
        """
        Textbooks (with prices) from textbookInfo, or False when the
        department or the code is missing
        """
        if not (self.dept and self.code):
            return False
        return textbookInfo(self.dept, self.code, withPrices=True)
|||
|
|||
@total_ordering
class Section(dict):
    """
    One scheduled meeting of a course

    Stores the raw time string, the location, the instructor and the
    semester text (all UTF-8 encoded) and derives the meeting day(s)
    and the start time from the time string.
    """

    def __init__(self, time, loc, prof, sem):
        self.time = time.encode("UTF-8")
        self.loc = loc.encode("UTF-8")
        self.prof = prof.encode("UTF-8")
        self.sem = sem.encode("UTF-8")
        self._date = False
        # Set from outside once the section is pinned to a single day
        self._day = False

    @property
    def date(self):
        """
        (day, start, end) parsed from the time string, or the raw time
        string itself when it is "TBA"

        day is self._day when that has been set, otherwise the list of
        two-letter day codes found in the first word of the time
        string. start and end are timeparse() results.
        """
        if self.time == "TBA":
            return self.time

        day, start, _, end = self.time.split()
        if self._day:
            assert len(self._day) == 2
            day = self._day
        else:
            # "MoWeFr" -> ["Mo", "We", "Fr"]
            day = [day[n:n+2] for n in range(0, len(day)-1, 2)]

        self._date = (day, timeparse(start), timeparse(end))
        return self._date

    @property
    def day(self):
        return self.date[0]

    @property
    def start(self):
        return self.date[1][0] + self.date[1][1]

    def _startkey(self):
        """
        Start time as an (hour, minute) pair of integers, for ordering
        """
        begin = self.date[1]
        return (int(begin[0]), int(begin[1]))

    def __repr__(self):
        return ("""
            Time = %s, Location = %s, Instructor = %s, Semester Running = %s
            """ % (self.date, self.loc, self.prof, self.sem))

    def __gt__(self, x):
        if isinstance(self.day, list):
            raise NotImplementedError

        if (self.date == "TBA" or
            x.date == "TBA"):
            return False

        # Compare start times numerically. The start property glues
        # the unpadded hour and minute strings together, so comparing
        # it as text ranks "95" (9:05) after "1030" (10:30).
        return ((days[self.day] > days[x.day]) or
                ((self.day == x.day) and
                 (self._startkey() > x._startkey())))

    def __eq__(self, x):
        return (x.date == self.date and
                x.prof == self.prof and
                x.loc == self.loc and
                x.sem == self.sem)

    def __ne__(self, x):
        # Python 2 does not derive != from ==; without this the dict
        # base class would compare the (always empty) dict contents.
        return not self.__eq__(x)
|||
|
|||
|
|||
def getStateNum(html):
    """
    Pull the ICStateNum value out of a Mosaic page

    Returns the value of the first <input name="ICStateNum"> element;
    raises IndexError when the page has none.
    This is unique to each requester
    """
    document = lxh.fromstring(html)
    state_inputs = document.xpath(".//input[@name=\"ICStateNum\"]")
    return state_inputs[0].value
|||
|
|||
def parseSection(section):
    """
    Build a Section from one table row

    The row must contain exactly four <td> cells, read in order as
    time, location, instructor and semester.
    """
    cols = section.xpath(".//td")
    assert len(cols) == 4

    # Hand the text to Section unencoded: Section encodes every field
    # to UTF-8 itself, and encoding an already-encoded Python 2 byte
    # string makes it implicitly ASCII-decode first, which fails on any
    # non-ASCII character (e.g. an accented instructor name).
    time, loc, prof, sem = [col.text_content().strip() for col in cols]

    return Section(time, loc, prof, sem)
|||
|
|||
def getSectionInfo(table):
    """
    Yield a Section for every row of the table whose id contains
    "SSR_CLSRCH"
    """
    for row in table.xpath(".//tr"):
        row_ids = row.xpath("@id")
        if not row_ids:
            continue
        if search(r"SSR_CLSRCH", row_ids[0]):
            yield parseSection(row)
|||
|
|||
def parseColumns(subject, html):
    """
    Pair every class name on a results page with its list of sections

    Returns a lazy iterator of ((subject, name), [Section, ...]) pairs.
    """
    parsed = lxh.fromstring(html)

    def idMatches(node, pattern):
        # True when the node has an id attribute matching the pattern
        ids = node.xpath("@id")
        return bool(ids) and bool(search(pattern, ids[0]))

    scrollTables = (table for table in parsed.xpath(".//table")
                    if idMatches(table, r"ICField[0-9]+\$scroll"))

    # The first matching table is skipped; every later one is read
    classInfo = (list(getSectionInfo(table))
                 for table in islice(scrollTables, 1, None))

    classNames = ((subject, span.text_content().strip())
                  for span in parsed.xpath(".//span")
                  if idMatches(span, r"DERIVED_CLSRCH_DESCR"))

    # zip is the module's lazy izip alias, so nothing is parsed until
    # the caller iterates
    return zip(classNames, classInfo)
|||
|
|||
def getCodes(html):
    """
    Lazily yield the UTF-8 encoded text of every span whose id matches
    SSR_CLSRCH_SUBJ_SUBJECT$<number>
    """
    document = lxh.fromstring(html)
    subject_id = r"SSR_CLSRCH_SUBJ_SUBJECT\$[0-9]+"

    def isSubject(span):
        ids = span.xpath("@id")
        return ids and search(subject_id, ids[0])

    return (span.text_content().encode("UTF-8")
            for span in document.xpath("//span")
            if isSubject(span))
|||
|
|||
class MosReq(object):
    """
    A requester for the Mosaic class search, bound to one semester

    Creating one fetches baseurl once so the server can set cookies;
    those cookies are sent along with every later request.
    """

    def __init__(self, semester):
        self.semester = semester
        session = requests.Session()
        session.get(baseurl, allow_redirects=True, headers=custom_headers).content

        # Let the server set some cookies before doing the searching
        self.cookies = dict(session.cookies.items())
        self.statenum = False
        self.codes_ = []

    def getlist(self, subject):
        """
        Return the raw results page listing the classes of a subject
        """
        sys.stderr.write("Getting " + subject + "\n")
        landing = requests.get(searchurl, cookies=self.cookies).content
        # for some reason Mosaic wants us to request it twice, ??????????????????
        self.statenum = getStateNum(landing)
        search_page = requests.post(searchurl,
                                    data=payload.format(self.statenum, subject, self.semester),
                                    cookies=self.cookies,
                                    allow_redirects=False,
                                    headers=custom_headers).content
        # we make a first request to get the ICStateNum in case it thinks there are too many results
        try:
            self.statenum = getStateNum(search_page)
        except IndexError:
            pass

        if "Your search will return over" not in search_page:
            return search_page

        # Mosaic asked for confirmation of a large result set: send the
        # save action with the current state number
        return requests.post(searchurl,
                             data=payload2.format(self.statenum, self.semester),
                             cookies=self.cookies,
                             allow_redirects=False,
                             headers=custom_headers).content

    def classes(self, subject):
        """
        Return the parsed (name, sections) pairs of a subject as a list
        """
        page = self.getlist(subject)
        return list(parseColumns(subject, page))

    def getCodes(self, letter):
        """
        Return the subject codes listed under one letter
        """
        sys.stderr.write("Getting letter " + letter + "\n")
        landing = requests.get(searchurl, cookies=self.cookies).content
        self.statenum = getStateNum(landing)

        subject_page = requests.post(searchurl,
                                     data=courseCodes1.format(self.statenum, self.semester),
                                     cookies=self.cookies,
                                     headers=custom_headers).content
        self.statenum = getStateNum(subject_page)

        letter_page = requests.post(searchurl,
                                    data=courseCodes2.format(self.statenum, letter, self.semester),
                                    cookies=self.cookies,
                                    allow_redirects=False,
                                    headers=custom_headers).content
        # the module-level getCodes parser, not this method
        return getCodes(letter_page)

    @property
    def codes(self):
        """
        Every subject code for the letters A to Z, fetched on first use
        and kept for later accesses
        """
        if not self.codes_:
            collected = []
            for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
                collected.extend(self.getCodes(letter))
            self.codes_ = collected
        return self.codes_
|||
|
|||
def request(codes, lists, semester):
    """
    Worker: drain the codes queue and put each subject's class list on
    the lists queue

    Every code taken from the queue is marked done, whether or not it
    could be fetched, so that codes.join() in the caller can return.
    """
    requester = MosReq(semester)
    while True:
        try:
            # get_nowait cannot block: with empty() followed by get()
            # another worker could take the last code in between and
            # leave this thread waiting forever.
            code = codes.get_nowait()
        except q.Empty:
            return
        try:
            lists.put(requester.classes(code))
        except Exception as err:
            # Keep the worker alive. If every worker quit on its first
            # failure, the remaining codes would never be marked done
            # and the caller's codes.join() would never return.
            sys.stderr.write("Failed to get %s: %s\n" % (code, err))
        finally:
            codes.task_done()
|||
|
|||
|
|||
class CourseInfo(object):
    """
    Collects every class of one semester using a pool of worker threads
    """

    def __init__(self, threadcount, semester):
        self.semester = semester
        self.threadcount = threadcount
        self._codes = False

    @property
    def codes(self):
        """
        The subject codes of the semester, fetched on first access
        """
        if self._codes:
            return self._codes
        self._codes = MosReq(self.semester).codes
        return self._codes

    def classes(self):
        """
        Yield one Class per listing, with its sections split per day
        and sorted
        """
        pending = q.Queue()
        for code in self.codes:
            pending.put(code)

        fetched = q.Queue()
        workers = []
        for _ in range(self.threadcount):
            worker = thd.Thread(group=None, target=request,
                                args=(pending, fetched, self.semester))
            workers.append(worker)
            worker.start()

        pending.join()
        for worker in workers:
            worker.join()

        batches = []
        while not fetched.empty():
            batches.append(fetched.get())

        for (dept, title), secs in chain.from_iterable(batches):
            expanded = []
            for sec in secs:
                if len(sec.day) > 1:
                    # One copy of the section per meeting day
                    for day in sec.day:
                        clone = copy.deepcopy(sec)
                        clone._day = day
                        expanded.append(clone)
                else:
                    sec._day = sec.day[0]
                    expanded.append(sec)
            yield Class(dept, sub("\xa0+", "", title), sorted(expanded))
|||
|
|||
def getCourses(semester, threadcount=10):
    """
    Return the generator of Class listings for one semester, fetched
    with threadcount worker threads
    """
    info = CourseInfo(threadcount, semester)
    return info.classes()
|||
|
|||
def allCourses():
    """
    Chain the courses of the fall, winter and spring/summer semesters,
    in that order, into one lazy iterator
    """
    semesters = (fall, winter, spring_summer)
    per_semester = (getCourses(sem, threadcount=10) for sem in semesters)
    return chain.from_iterable(per_semester)
|||
|
|||
#for course in allCourses(): |
|||
#sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, course.books)) |
|||
#print course.sections |
@ -0,0 +1,9 @@ |
|||
from oersearch import Search |
|||
from classes import getCourses |
|||
from sylla import getTextbooks |
|||
|
|||
# Create the search for McMaster, register the course source with it,
# then run it.
mcmasterSearch = Search("McMaster")
mcmasterSearch.setup(getCourses)
mcmasterSearch.run()
@ -0,0 +1,117 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from sys import argv |
|||
from itertools import chain, islice, izip_longest, izip as zip |
|||
from re import search, sub |
|||
from functools import total_ordering |
|||
from re import sub |
|||
|
|||
import datetime as dt |
|||
import lxml.html as lxh |
|||
import requests |
|||
|
|||
# Purpose of this module is to download and parse syllabi from various departments |
|||
# In order to be correlated with individual courses |
|||
|
|||
class Price(object):
    """
    A price in dollars together with its status text
    """

    def __init__(self, amnt, status):
        # amnt starts with a one-character currency symbol, e.g.
        # "$1,234.50": drop it, and drop any thousands separators that
        # float() would reject.
        self.dollars = float(amnt[1:].replace(",", ""))
        self.status = status

    def __repr__(self):
        return "$%s %s" % (repr(self.dollars), self.status)
|||
|
|||
|
|||
class Book(object):
    """
    A book title paired with its price
    """

    def __init__(self, title, price):
        self.price = price
        self.title = title

    def __repr__(self):
        shown = (self.title, repr(self.price))
        return '["%s", "%s"]' % shown
|||
|
|||
|
|||
def grouper(n, iterable, fillvalue=None):
    "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    # n references to the same iterator: each output tuple pulls n
    # consecutive items from it
    shared = iter(iterable)
    slots = (shared,) * n
    return izip_longest(*slots, fillvalue=fillvalue)
|||
|
|||
searchUrl = "https://campusstore.mcmaster.ca/cgi-mcm/ws/txsub.pl?wsDEPTG1=%s&wsDEPTDESC1=&wsCOURSEG1=%s&crit_cnt=1" |
|||
|
|||
def normalize(word):
    """
    Uppercase the first character of a word and lowercase the rest;
    words of length 0 or 1 are returned unchanged
    """
    if len(word) <= 1:
        return word
    head, tail = word[0], word[1:]
    return "%s%s" % (head.upper(), "".join(tail).lower())
|||
|
|||
def parseAuthor(author):
    """
    Swap the first two space-separated words of an author name
    ("Last First" becomes "First Last"); a single word is returned
    unchanged and any further words are dropped
    """
    parts = author.split(" ")
    if len(parts) > 1:
        return "%s %s" % (parts[1], parts[0])
    return author
|||
|
|||
def normwords(phrase):
    """
    Apply normalize() to every space-separated word of a phrase and
    join the results with single spaces
    """
    return " ".join(normalize(word) for word in phrase.split(" "))
|||
|
|||
def books(dept, code, withPrices):
    """
    Snatch me up a book title or three

    Downloads the campus store search page for the department and
    course code, and yields one tuple per div whose id contains
    "prodDesc": (title, author), plus the price text as a third item
    when withPrices is true. The price text is "" when the page held
    fewer prices than books.
    """
    req = searchUrl % (dept, code)
    html = requests.get(req).text
    parsed = lxh.fromstring(html)

    pricelist = list(prices(parsed))

    for div in parsed.xpath(".//div"):
        if "prodDesc" not in div.attrib.get("id", ""):
            continue

        textbook = div.text_content()
        # The author is whatever follows the first ":" in the first
        # inline span of the parent, with commas removed
        label = (div.getparent()
                 .xpath(".//span[@class='inline']")
                 [0].text_content())
        author = sub(r',', '', "".join(label.split(":")[1:]).strip())

        # prices() returns [] when it cannot pair amounts with
        # statuses; popping from it unguarded raised IndexError and
        # aborted the whole listing.
        price = repr(pricelist.pop()) if pricelist else ""

        if withPrices:
            yield (normwords(textbook), normwords(author), price)
        else:
            yield (normwords(textbook), normwords(author))
|||
|
|||
def prices(html):
    """
    Get the prices from a search result page

    Takes the first word of the text around every checkbox inside a
    <p>, pairs consecutive words as (amount, status) and returns the
    Price objects in reverse page order. Returns [] when nothing was
    found or a Price cannot be built.
    """
    checkboxes = html.xpath("//p/input[@type='checkbox']")
    ps = [box.getparent().text_content().split()[0] for box in checkboxes]

    pairs = list(grouper(2, ps))
    pairs.reverse()
    try:
        # Unpacking fails with ValueError when there are no pairs
        amts, stats = zip(*pairs)
        return map(Price, amts, stats)
    except ValueError:
        return []
|||
|
|||
def textbookInfo(dept, code, withPrices=False):
    """
    Return all the textbooks for a course as a list of the tuples
    produced by books()
    """
    return [book for book in books(dept, code, withPrices)]
|||
|
|||
def humanities():
    """
    Download humanities syllabi

    Nothing is downloaded yet: the result is always an empty list.
    """
    syllabi = []
    return syllabi
|||
|
|||
# Example, getting the course info for Personality Theory (PSYCH = Department, 2B03 = Course code) |
|||
# print list(textbookInfo("PSYCH", "2B03")) |
@ -0,0 +1,24 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from urllib import quote |
|||
from json import loads, dumps |
|||
|
|||
import requests as req |
|||
|
|||
#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s" |
|||
searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s' |
|||
|
|||
def bookUrls(title, author):
    """
    Yield openlibrary.org book URLs for a title and author

    Only the part of the title before the first ":" is searched. At
    most the first two search results are considered, and one URL is
    yielded for each of them that has an "edition_key".
    """
    print("%s %s" % (title, author))
    main_title = title.split(":")[0]
    requrl = searchurl % (quote(author), quote(main_title))
    results = loads(req.get(requrl).text)
    for result in results["docs"][0:2]:
        if "edition_key" in result:
            yield "https://openlibrary.org/books/%s" % result["edition_key"][0]
|||
|
|||
# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle' |
|||
|
|||
#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"): |
|||
#print book |
@ -0,0 +1,153 @@ |
|||
##! /usr/bin/python2 |
|||
from itertools import groupby, chain |
|||
from sys import stdout |
|||
from functools import partial |
|||
from json import dumps |
|||
|
|||
def gensymer():
    """
    Return a function that hands out "0", "1", "2", ... as strings,
    one per call
    """
    # Kept in a mutable container so the closure can update it
    state = {"next": 0}

    def inner():
        current = state["next"]
        state["next"] = current + 1
        return str(current)

    return inner
|||
|
|||
gensym = gensymer() |
|||
|
|||
def printTrie(graph, prev, trie, weight):
    """
    Add a trie node and all of its descendants to a graph

    Every node gets a fresh name from gensym, is labelled with its
    letter, and is linked from prev by an edge labelled with the
    weight to two decimals.
    """
    node_id = str(gensym())
    graph.node(node_id, "%s" % trie.letter)
    graph.edge(prev, node_id, label="%.2f" % weight)
    if trie.children:
        for child, child_weight in zip(trie.children, trie.ws):
            printTrie(graph, node_id, child, child_weight)
|||
|
|||
|
|||
class Trie(object):
    """
    A trie node: a letter, its child nodes and a list of weights
    """

    def __init__(self, letter, children, ws):
        self.ws = ws
        self.children = children
        self.letter = letter
|||
|
|||
def probweight(suffixes):
    """
    Turn the "value" of every entry into its share of the total

    Returns one float per entry; the floats sum to 1.
    """
    weights = []
    for entry in suffixes:
        weights.append(float(entry["value"]))
    total = float(sum(weights))
    return [weight / total for weight in weights]
|||
|
|||
def buildtrie(trie, suffixes):
    """
    Build a trie, also known as a prefix tree, of all the possible completions

    suffixes is the output of partition(): (letter, tails) pairs. One
    child is added to the trie per pair, and the trie is returned.
    """
    trie.children = []
    for letter, suffs in suffixes:
        # Partition once and reuse the result for both the check and
        # the recursion
        ped = partition(suffs)
        if any(group_letter for group_letter, _ in ped):
            # check if there are any children
            # The node needs one weight per child, i.e. per group in
            # ped. Weighing the individual words instead gives a list
            # that does not line up with the children, so the weights
            # paired with them later belong to the wrong nodes.
            totals = [
                {"value": sum(float(s["value"]) for s in group)}
                for _, group in ped
            ]
            trie.children.append(
                buildtrie(Trie(letter, [], probweight(totals)), ped))
        else:
            # we've reached the end of this word so just include the final letter
            # [1] = there is a probability of 1 of reaching this single leaf node,
            # since it is the only possible completion here
            trie.children.append(Trie(letter, [], [1]))
    return trie
|||
|
|||
|
|||
def keyf(x):
    """
    First character of an entry's "key", or "" when the key is empty
    """
    key = x["key"]
    return key[0] if key else ""
|||
|
|||
def tails(words):
    """
    Yield a copy of every entry with the first character of its "key"
    removed and its "value" unchanged
    """
    for entry in words:
        remainder = entry["key"][1:]
        yield {"key": remainder, "value": entry["value"]}
|||
|
|||
def partition(words):
    """
    Partition the words into different prefixes based on the first character

    Returns a list of (first character, tails) pairs ordered by that
    character.
    """
    # groupby only merges adjacent entries, so sort by the same key
    ordered = sorted(words, key=keyf)
    groups = []
    for letter, members in groupby(ordered, key=keyf):
        groups.append((letter, list(tails(members))))
    return groups
|||
|
|||
|
|||
def flatten_helper(letter, trie):
    """
    Return (prefixes, children): each prefix is the letter followed by
    the letter of the corresponding child
    """
    children = trie.children
    prefixes = [letter + child.letter for child in children]
    return (prefixes, children)
|||
|
|||
def flatten(trie):
    """
    Expand a trie into nested lists of the strings it spells

    A node without children gives its own letter; any other node gives
    a list with one entry per child.
    """
    if not trie.children:
        return trie.letter
    prefixes, suffixes = flatten_helper(trie.letter, trie)
    expanded = []
    for prefix, child in zip(prefixes, suffixes):
        # Carry the accumulated prefix down as the child's "letter"
        expanded.append(flatten(Trie(prefix, child.children, child.ws)))
    return expanded
|||
|
|||
def flattenlist(xs):
    """
    Flatten arbitrarily nested lists into one flat list, keeping order
    """
    flat = []
    for item in xs:
        if isinstance(item, list):
            flat.extend(flattenlist(item))
        else:
            flat.append(item)
    return flat
|||
|
|||
def matchc(trie, prefix):
    """
    Return the children of a trie whose letter equals the first
    character of the prefix (or the whole prefix when it has at most
    one character)
    """
    wanted = prefix[0] if len(prefix) > 1 else prefix
    found = []
    for child in trie.children:
        if child.letter == wanted:
            found.append(child)
    return found
|||
|
|||
def match(trie, word):
    """
    Walk the trie along the characters of a word

    Returns the list of nodes visited, one per matched character; the
    walk stops at the first character that has no matching child.
    """
    path = []
    node = trie
    for character in word:
        found = matchc(node, character)
        if not found:
            break
        node = found[0]
        path.append(node)
    return path
|||
|
|||
def complete(trie, word):
    """
    Return up to ten completions of a word as a JSON list

    Returns False when the word (or an empty word) is not a path in
    the trie.
    """
    m = match(trie, word)
    # An empty word matches no nodes at all; without the explicit
    # check, m[-1] below raised IndexError.
    if not m or len(word) != len(m):
        return False
    # Every flattened string starts with the letter of the last matched
    # node, which the word already ends with, so skip that character
    completions = [word + x[1:] for x in flattenlist(flatten(m[-1]))]
    return dumps(completions[0:10])
|||
|
|||
def sortTrie(trie):
    """
    Sort the children of each node in descending order
    of the probability that each child would be the completion
    of whatever that word is

    The node's weights are reordered together with its children, and
    the whole subtree is sorted in place.
    """
    if not trie.children:
        return
    ranked = list(zip(trie.children, trie.ws))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    trie.children = [child for child, _ in ranked]
    trie.ws = [weight for _, weight in ranked]
    for child in trie.children:
        sortTrie(child)
|||
|
|||
def toTrie(words):
    """
    Build a sorted trie from a list of {"key", "value"} entries

    The keys of the entries are lowercased in place. The children of
    the root all get weight 1.
    """
    for entry in words:
        entry["key"] = entry["key"].lower()
    root = Trie("", [], [1])
    root = buildtrie(root, partition(words))
    root.ws = [1] * len(root.children)
    sortTrie(root)
    return root
|||
|
|||
def testkey(w): |
|||
return { |
|||
"key" : w, |
|||
"value" : "1" |
|||
} |
@ -0,0 +1,237 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
import elasticsearch |
|||
|
|||
from elasticsearch_dsl import FacetedSearch, Search, Q |
|||
from elasticsearch_dsl.aggs import Terms, DateHistogram |
|||
from sys import exit, stderr |
|||
from json import dumps, loads |
|||
from itertools import chain, imap |
|||
|
|||
from hashlib import sha1 |
|||
|
|||
from textbookExceptions import UnIndexable |
|||
|
|||
from mcmaster.classes import allCourses |
|||
|
|||
# Generic instance of elasticsearch right now |
|||
es = elasticsearch.Elasticsearch() |
|||
|
|||
def summarize(text):
    """
    Shorten a text to its first four space-separated words followed by
    ".."; a text of four words or fewer is returned unchanged
    """
    words = text.split(" ")
    if len(words) <= 4:
        return text
    return " ".join(words[:4]) + ".."
|||
|
|||
def sectionToJSON(section):
    """
    Return the prof, sem and day attributes of a section as a dict
    """
    return dict(prof=section.prof, sem=section.sem, day=section.day)
|||
|
|||
def classToJSON(clss):
    """
    Return a class as a dict with the keys title, sections, dept, code
    and books; books is [] when the class has none
    """
    # Read books once. NOTE(review): for mcmaster.classes.Class this is
    # a property that calls textbookInfo on every access, so reading it
    # twice fetched the textbook list twice - confirm that is the type
    # passed in here.
    books = clss.books
    return {
        "title": clss.title,
        "sections": [sectionToJSON(section) for section in clss.sections],
        "dept": clss.dept,
        "code": clss.code,
        "books": list(books) if books else []
    }
|||
|
|||
|
|||
def truncate(docid):
    """
    Keep only the leading 12 characters of a document id

    The id is rendered as text, cut to its first 12 characters and
    turned back into an integer. The id is expected to be based on a
    hash of unique identifiers.
    """
    leading = str(docid)[:12]
    return int(leading)
|||
|
|||
def hashsec(course):
    """
    Hash a course into a usable id

    The id is the SHA-1 of the course code, the title and the semester
    of the first section, as an integer. Raises UnIndexable when the
    course has neither a code nor a title.
    """
    code = course["code"] or ""
    title = course["title"] or ""
    if not (code or title):
        raise UnIndexable(course)

    # A course without sections has no semester to mix in. The old
    # code assigned course["sections"][0] = "" here, which raises
    # IndexError on an empty list, and the ["sem"] lookup on that
    # string would have failed as well.
    sections = course["sections"]
    sem = sections[0]["sem"] if sections else ""

    payload = code + title + sem
    if not isinstance(payload, bytes):
        # sha1 wants bytes; only text needs encoding
        payload = payload.encode("UTF-8")

    h = sha1()
    h.update(payload)
    return int(h.hexdigest(), 16)
|||
|
|||
def createIndex(name):
    """
    Create an elasticsearch index
    An index is like a schema in a regular database

    Creates the index under the given name, then applies the "course"
    mapping read from ./course.json. Both responses are printed.
    """
    indices = elasticsearch.client.IndicesClient(es)

    print(indices.create(name))
    with open("./course.json", "r") as mapping:
        schema = loads(mapping.read())
        print(indices.put_mapping("course", schema, name))
|||
|
|||
def indexListing(course):
    """
    Index a specific course in the database (using the oersearch index)

    The document id is hashsec(course); the response from
    elasticsearch is printed.

    example,
    {
     'books': [],
     'dept': 'COLLAB',
     'code': '2C03',
     'sections': [
       {
        'prof': 'Lisa Pender',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Mo'
       },
       {
        'prof': 'Staff',
        'sem': '2015/09/08 - 2015/12/08',
        'day': 'Th'
       }
      ],
     'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    courseID = hashsec(course)
    response = es.index(index="oersearch",
                        doc_type="course",
                        id=courseID,
                        body=course)
    print(response)
|||
|
|||
# For every course we index, we also create a resource for it |
|||
# This should be an idempotent operation because we're putting it in couchdb |
|||
# And we're using the id obtained from the hash function, so it should just update the document |
|||
# no need to delete anything |
|||
#try: |
|||
#courseDept = course[0]["title"].strip().split(" ")[0].strip() |
|||
#courseCode = course[0]["title"].strip().split(" ")[1].strip() |
|||
#print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode) |
|||
#print createResource(textbookInfo, course[0], courseDept, courseCode, courseID) |
|||
#except: |
|||
#print "Couldn't create the resource associated with %s" % course |
|||
|
|||
def termSearch(field):
    """
    Make a term search (exact match)

    Returns a function that turns a term into a "term" query on
    "sections.<field>".
    """
    nested = "sections." + field

    def t(term):
        return Q("term", **{nested: term})

    return t
|||
|
|||
def search(field):
    """
    Make a match search

    Returns a function that turns a term into a "match" query on the
    given field.
    """
    def s(term):
        criteria = {field: term}
        return Q("match", **criteria)

    return s
|||
|
|||
def join(x, y):
    """
    Join two queries by combining them with the & operator
    """
    both = x & y
    return both
|||
|
|||
def filterSections(secs):
    """
    Get rid of tutorial sections
    because they almost always have "Staff" as the instructor
    This is just a heuristic of course

    Returns the sections whose prof does not contain "Staff", or False
    when none are left.
    """
    kept = []
    for section in secs.sections:
        if "Staff" in section.prof:
            continue
        kept.append(section)
    return kept if kept else False
|||
|
|||
def searchTerms(terms):
    """
    Run a search for courses

    terms maps field names to search terms; fields without a searcher
    and empty terms are ignored. Returns a JSON list with one entry
    per matching course that has a section not taught by "Staff".
    """
    # A list of all the queries we want to run
    qs = [searchers[field](term)
          for field, term in terms.items()
          if term and field in searchers]

    if not qs:
        # No queries = no results
        return dumps([])

    # Join all of the queries into one query
    # It will search for the conjunction of all of them
    # So that means it cares about each query equally
    q = qs[0]
    for extra in qs[1:]:
        q = join(q, extra)

    s = (Search(using=es, index="oersearch")
         .query(q))[0:100]  # only return up to 100 results for now

    found = []
    for obj in s.execute():
        # get rid of tutorials; filter once per hit instead of twice
        lectures = filterSections(obj)
        if not lectures:
            continue
        secs = lectures[0].to_dict()

        # Add the truncated course id
        # This is used to point to the resource page for that course
        secs["id"] = truncate(obj.meta.id)
        secs["title"] = obj.title
        if obj["dept"] not in secs["title"]:
            secs["dept"] = obj.dept
        if obj.books:
            # Drop non-ASCII characters rather than raise: a single
            # accented title or author used to abort the whole search
            secs["books"] = [
                {
                    "booktitle": summarize(book[0].encode("ASCII", "ignore")),
                    "bookauthor": book[1].encode("ASCII", "ignore"),
                    "bookprice": book[2].encode("ASCII", "ignore")
                }
                for book in obj.books
            ]
        else:
            secs["books"] = ""
        found.append(secs)

    return dumps(found)
|||
|
|||
|
|||
searchers = { |
|||
"title" : search("title"), |
|||
"loc" : search("loc"), |
|||
"time" : search("time"), |
|||
"prof" : search("prof"), |
|||
"day" : search("day"), |
|||
} |
|||
|
|||
#print searchTerms({"title" : "PHILOS"}) |
|||
|
|||
#for c in imap(classToJSON, allCourses()): |
|||
#try: |
|||
#print indexListing(c) |
|||
#except UnIndexable as e: |
@ -0,0 +1,34 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from urllib import quote |
|||
from json import loads, dumps |
|||
|
|||
import requests as req |
|||
|
|||
searchUrl = "https://archive.org/advancedsearch.php?q={0}&fl%5B%5D=avg_rating&fl%5B%5D=description&fl%5B%5D=identifier&fl%5B%5D=type&sort%5B%5D=&sort%5B%5D=&sort%5B%5D=&rows=50&page=1&output=json&callback=callback&save=yes#raw" |
|||
|
|||
def searchIA(title, author):
    """
    Do a search on The Internet Archive for a book

    Queries the advanced-search endpoint with "<title> <author>" and
    returns a list of at most three https://archive.org/details/ URLs.
    Returns an empty list when the response is not parseable or holds
    no documents.
    """
    print("running a search")
    requrl = searchUrl.format(quote(title + " " + author))
    try:
        # The URL asks for JSONP (callback=callback), so drop the
        # 9-character "callback(" prefix and the closing parenthesis.
        results = loads(req.get(requrl).text[9:][0:-1])
    except ValueError:
        return []

    # responseHeader.params.rows only echoes the page size requested in
    # the URL (rows=50), so it cannot tell us whether anything matched.
    # Look at the documents that actually came back instead.
    docs = results.get("response", {}).get("docs", [])
    if not docs:
        print("Couldn't find results for %s %s" % (title, author))
        return []
    return ["https://archive.org/details/%s" % doc["identifier"]
            for doc in docs[0:3]]
|||
|
|||
|
|||
# Example, search for David Hume's Enquiry Concerning Human Understanding |
|||
#for url in searchIA("Enquiry Concerning Human Understanding", "Hume"): |
|||
#print url |
@ -0,0 +1,62 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from sys import argv |
|||
from hashlib import sha1 |
|||
|
|||
def truncate(docid):
    """
    Keep only the leading 12 characters of a document id

    The id is rendered as text, cut to its first 12 characters and
    turned back into an integer. The id is expected to be based on a
    hash of unique identifiers.
    """
    leading = str(docid)[:12]
    return int(leading)
|||
|
|||
def createResource(textbookInfo, course, dept, coursecode, docid):
    """
    Create (or update) the document holding the resources of one course.

    textbookInfo -- callable invoked as textbookInfo(dept, coursecode) with
                    both arguments stripped; its result is stored under
                    "textbooks"
    course       -- course description stored as-is under "courseinfo",
                    for example
                    {
                        'books': [],
                        'dept': 'COLLAB',
                        'code': '2C03',
                        'sections': [
                            {'prof': 'Lisa Pender',
                             'sem': '2015/09/08 - 2015/12/08',
                             'day': 'Mo'},
                            {'prof': 'Staff',
                             'sem': '2015/09/08 - 2015/12/08',
                             'day': 'Th'}
                        ],
                        'title': 'COLLAB 2C03 - Sociology I'
                    }
    dept, coursecode -- strings; surrounding whitespace is ignored
    docid        -- id of the course; truncated to build the document _id

    Returns whatever localdb.save() returns, or None when saving raised
    ResourceConflict.

    NOTE(review): localdb and ResourceConflict are neither defined nor
    imported in this file as shown -- confirm where they come from.
    """
    department = dept.strip()
    code = coursecode.strip()
    textbooks = textbookInfo(department, code)

    # We truncate the id so we can have nicer looking URLs
    # Since the id will be used to point to the resource page for that course
    _id = str(truncate(docid))

    fields = {
        "_id" : _id,
        "textbooks" : textbooks,
        "coursetitle" : "%s %s" % (department, code),
        "courseinfo" : course
        #"Syllabus" : "blah"
    }
    try:
        revisions = list(localdb.revisions(_id))
        if revisions:
            # Reuse the first listed revision so the save targets the
            # existing document.
            fields["_rev"] = dict(revisions[0])["_rev"]
        return localdb.save(fields)
    except ResourceConflict:
        print("Resource for %s already exists, not creating a new one" % (docid))
@ -0,0 +1,14 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
# predictive data |
|||
# switch to elasticsearch's prediction |
|||
|
|||
|
|||
|
|||
import database |
|||
import predictions |
|||
|
|||
class GOASearch(object):
    """
    Placeholder search class; it currently carries no state or behaviour.
    """
    def __init__(self):
        # __init__ must return None: the previous "return self" made every
        # GOASearch() call raise TypeError.
        pass
|||
|
@ -0,0 +1,24 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from urllib import quote |
|||
from json import loads, dumps |
|||
|
|||
import requests as req |
|||
|
|||
#query = "https://openlibrary.org/query.json?type=/type/edition&title=%s&author=%s"
# Open Library search URL template; the two %s slots receive the URL-quoted
# author and title, in that order.
searchurl = 'http://openlibrary.org/search.json?author=%s&title=%s'
|||
|
|||
def bookUrls(title, author):
    """
    Search Open Library for a book and yield links to matching editions.

    title  -- book title; anything after the first ":" (a subtitle) is
              dropped before searching
    author -- author name

    Yields "https://openlibrary.org/books/<edition key>" for each of the
    first two search results that lists at least one edition key.
    """
    print("%s %s" % (title, author))
    title = title.split(":")[0]
    requrl = searchurl % (quote(author), quote(title))
    results = loads(req.get(requrl).text)
    # Tolerate a response without "docs" and results whose "edition_key"
    # is missing or empty instead of raising KeyError/IndexError.
    for result in results.get("docs", [])[0:2]:
        editions = result.get("edition_key")
        if editions:
            yield "https://openlibrary.org/books/%s" % editions[0]
|||
|
|||
# 'http://openlibrary.org/query.json?type=/type/edition&title=The+Personality+Puzzle' |
|||
|
|||
#for book in bookUrls("Philosophy Of Physics", "Tim Maudlin"): |
|||
#print book |
@ -0,0 +1,153 @@ |
|||
##! /usr/bin/python2 |
|||
from itertools import groupby, chain |
|||
from sys import stdout |
|||
from functools import partial |
|||
from json import dumps |
|||
|
|||
def gensymer():
    """
    Build a generator of unique symbols.

    Returns a zero-argument function; each call returns the next counter
    value ("0", "1", "2", ...) as a string. Every gensymer() call starts
    its own independent counter.
    """
    # A mutable container holds the counter so the closure can update it.
    state = {"next": 0}

    def inner():
        current = state["next"]
        state["next"] = current + 1
        return str(current)
    return inner

# Module-wide symbol source used to name graph nodes.
gensym = gensymer()
|||
|
|||
def printTrie(graph, prev, trie, weight):
    """
    Add ``trie`` and all of its descendants to ``graph``.

    graph  -- object exposing node(name, label) and edge(tail, head, label=...)
    prev   -- name of the already-drawn parent node
    trie   -- node to draw; labelled with its letter
    weight -- number shown (two decimals) on the edge from ``prev``
    """
    node_id = str(gensym())
    graph.node(node_id, "%s" % trie.letter)
    graph.edge(prev, node_id, label="%.2f" % weight)
    if trie.children:
        for child, child_weight in zip(trie.children, trie.ws):
            printTrie(graph, node_id, child, child_weight)
|||
|
|||
|
|||
class Trie(object):
    """
    One node of the completion trie.
    """
    def __init__(self, letter, children, ws):
        # letter:   the character(s) this node stands for
        # children: list of child Trie nodes
        # ws:       weights, paired positionally with ``children``
        self.ws = ws
        self.children = children
        self.letter = letter
|||
|
|||
def probweight(suffixes):
    """
    Normalise the "value" of every suffix so the weights sum to one.

    suffixes -- iterable of dicts whose "value" converts to float
    Returns a list of floats in the same order as ``suffixes``.
    """
    values = []
    for suffix in suffixes:
        values.append(float(suffix["value"]))
    total = float(sum(values))
    return [value / total for value in values]
|||
|
|||
def buildtrie(trie, suffixes):
    """
    Build a trie, also known as a prefix tree, of all the possible completions

    trie     -- node whose children are (re)built
    suffixes -- list of (letter, words) pairs as produced by partition()

    Returns ``trie``.
    """
    trie.children = []
    for letter, suffs in suffixes:
        groups = partition(suffs)
        if any(first for first, _ in groups):
            # check if there are any children
            # The new node gets one child per group, so it needs one weight
            # per group, in the same order. Weighting each individual word
            # (as before) produced a list that did not line up with the
            # children once several words shared a next letter.
            totals = [
                {"value": sum(float(word["value"]) for word in words)}
                for _, words in groups
            ]
            child = Trie(letter, [], probweight(totals))
            trie.children.append(buildtrie(child, groups))
        else:
            # we've reached the end of this word so just include the final letter
            # [1] = there is a probability of 1 of reaching this single leaf node,
            # since it is the only possible completion here
            trie.children.append(Trie(letter, [], [1]))
    return trie
|||
|
|||
|
|||
def keyf(x):
    """
    Return the first element of x["key"], or "" when the key is empty.
    """
    key = x["key"]
    return key[0] if key else ""
|||
|
|||
def tails(words):
    """
    Yield a copy of every word with the first element of its key removed;
    the "value" is carried over unchanged.
    """
    for entry in words:
        shortened = dict(key=entry["key"][1:], value=entry["value"])
        yield shortened
|||
|
|||
def partition(words):
    """
    Partition the words into different prefixes based on the first character

    Returns a list of (first character, tails of the words starting with
    it) pairs, ordered by that character.
    """
    ordered = sorted(words, key=keyf)
    groups = []
    for first, members in groupby(ordered, key=keyf):
        groups.append((first, list(tails(members))))
    return groups
|||
|
|||
|
|||
def flatten_helper(letter, trie):
    """
    Pair the children of ``trie`` with their extended prefixes.

    Returns (prefixes, children) where prefixes[i] is ``letter`` followed
    by the letter of children[i].
    """
    children = trie.children
    prefixes = [letter + child.letter for child in children]
    return (prefixes, children)
|||
|
|||
def flatten(trie):
    """
    Expand a trie into the strings it spells.

    A node without children yields its own letter; any other node yields a
    (possibly nested) list built from its children, each prefixed with this
    node's letter.
    """
    if not trie.children:
        return trie.letter
    prefixes, suffixes = flatten_helper(trie.letter, trie)
    expanded = []
    for prefix, node in zip(prefixes, suffixes):
        expanded.append(flatten(Trie(prefix, node.children, node.ws)))
    return expanded
|||
|
|||
def flattenlist(xs):
    """
    Flatten arbitrarily nested lists into one flat list, keeping the
    left-to-right order. Only ``list`` instances are descended into.
    """
    flat = []
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(xs)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            pending.pop()
    return flat
|||
|
|||
def matchc(trie, prefix):
    """
    Return the children of ``trie`` whose letter equals the first element
    of ``prefix`` (or ``prefix`` itself when it has at most one element).
    """
    target = prefix[0] if len(prefix) > 1 else prefix
    return [child for child in trie.children if child.letter == target]
|||
|
|||
def match(trie, word):
    """
    Walk down ``trie`` following ``word`` one element at a time.

    Returns the list of nodes visited; it is shorter than ``word`` when
    the walk stopped at an element with no matching child.
    """
    path = []
    if not word:
        return path
    node, remaining = trie, word
    while remaining:
        hits = matchc(node, remaining[0])
        if not hits:
            break
        node = hits[0]
        path.append(node)
        remaining = remaining[1:]
    return path
|||
|
|||
def complete(trie, word):
    """
    Suggest completions for ``word``.

    Returns a JSON array (string) of at most ten completions, or False when
    ``word`` is empty or is not a prefix stored in ``trie``.
    """
    if not word:
        # Nothing to match; previously this fell through to m[-1] on an
        # empty list and raised IndexError.
        return False
    m = match(trie, word)
    if len(word) != len(m):
        return False
    # Each flattened string starts with the letter of the last matched
    # node, which ``word`` already ends with, so drop that first letter.
    completions = [word + x[1:] for x in flattenlist(flatten(m[-1]))]
    return dumps(completions[0:10])
|||
|
|||
def sortTrie(trie):
    """
    Reorder, in place and recursively, the children of every node so the
    child with the largest weight comes first. ``ws`` is reordered along
    with ``children``.
    """
    if not trie.children:
        return
    ranked = list(zip(trie.children, trie.ws))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    trie.children = [child for child, _ in ranked]
    trie.ws = [weight for _, weight in ranked]
    for child in trie.children:
        sortTrie(child)
|||
|
|||
def toTrie(words):
    """
    Build a sorted completion trie from ``words``.

    words -- list of {"key": ..., "value": ...} dicts. NOTE: the "key" of
             every entry is lower-cased in place.
    Returns the root node; every child of the root gets weight 1.
    """
    for entry in words:
        entry["key"] = entry["key"].lower()
    root = Trie("", [], [1])
    trie = buildtrie(root, partition(words))
    trie.ws = [1] * len(trie.children)
    sortTrie(trie)
    return trie
|||
|
|||
def testkey(w): |
|||
return { |
|||
"key" : w, |
|||
"value" : "1" |
|||
} |
@ -0,0 +1,237 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
import elasticsearch |
|||
|
|||
from elasticsearch_dsl import FacetedSearch, Search, Q |
|||
from elasticsearch_dsl.aggs import Terms, DateHistogram |
|||
from sys import exit, stderr |
|||
from json import dumps, loads |
|||
from itertools import chain, imap |
|||
|
|||
from hashlib import sha1 |
|||
|
|||
from textbookExceptions import UnIndexable |
|||
|
|||
from mcmaster.classes import allCourses |
|||
|
|||
# Generic instance of elasticsearch right now |
|||
es = elasticsearch.Elasticsearch() |
|||
|
|||
def summarize(text):
    """
    Shorten ``text`` to its first four space-separated pieces followed by
    ".."; text with four pieces or fewer is returned unchanged.
    """
    pieces = text.split(" ", 4)
    if len(pieces) <= 4:
        return text
    return " ".join(pieces[:4]) + ".."
|||
|
|||
def sectionToJSON(section):
    """
    Copy the prof, sem and day attributes of ``section`` into a dict.
    """
    return {name: getattr(section, name) for name in ("prof", "sem", "day")}
|||
|
|||
def classToJSON(clss):
    """
    Convert a course object into a plain dict with the keys title,
    sections, dept, code and books (an empty list when there are none).
    """
    books = list(clss.books) if clss.books else []
    sections = map(sectionToJSON, clss.sections)
    return {
        "title" : clss.title,
        "sections" : sections,
        "dept" : clss.dept,
        "code" : clss.code,
        "books" : books
    }
|||
|
|||
|
|||
def truncate(docid):
    """
    Shorten a document id to its first 12 characters and return the
    result as an integer.
    The document ID should be based on a hash of unique identifiers
    """
    leading = str(docid)[:12]
    return int(leading)
|||
|
|||
def hashsec(course):
    """
    Hash a course into a usable id

    course -- dict with "code", "title" and "sections"; the first section
              must provide "sem".

    Returns the SHA-1 of code + title + first section's "sem" as an integer.
    Raises UnIndexable when there are no sections, or when both code and
    title are missing.
    """
    code = course["code"] or ""
    title = course["title"] or ""
    sections = course["sections"]

    # A course without sections cannot be hashed. The old code tried to
    # assign into the empty (or missing) list, which raised
    # IndexError/TypeError instead of the intended UnIndexable.
    if not sections or not (code or title):
        raise UnIndexable(course)

    payload = code + title + sections[0]["sem"]
    if not isinstance(payload, bytes):
        # Text is hashed as UTF-8; identical to the previous implicit
        # encoding for ASCII input.
        payload = payload.encode("utf-8")
    h = sha1()
    h.update(payload)
    return int(h.hexdigest(), 16)
|||
|
|||
def createIndex(name):
    """
    Create an elasticsearch index called ``name`` and install the "course"
    mapping read from ./course.json on it.
    An index is like a schema in a regular database.
    The responses of both calls are printed.
    """
    indices = elasticsearch.client.IndicesClient(es)

    created = indices.create(name)
    print(created)
    with open("./course.json", "r") as mapping:
        course_mapping = loads(mapping.read())
        print(indices.put_mapping("course", course_mapping, name))
|||
|
|||
def indexListing(course):
    """
    Index one course in the "oersearch" index as a "course" document and
    print the response. The document id is hashsec(course).

    example course,
    {
        'books': [],
        'dept': 'COLLAB',
        'code': '2C03',
        'sections': [
            {
                'prof': 'Lisa Pender',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Mo'
            },
            {
                'prof': 'Staff',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Th'
            }
        ],
        'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    courseID = hashsec(course)
    response = es.index(index="oersearch",
                        doc_type="course",
                        id=courseID,
                        body=course)
    print(response)

    # For every course we index, we also create a resource for it
    # This should be an idempotent operation because we're putting it in couchdb
    # And we're using the id obtained from the hash function, so it should just update the document
    # no need to delete anything
    #try:
        #courseDept = course[0]["title"].strip().split(" ")[0].strip()
        #courseCode = course[0]["title"].strip().split(" ")[1].strip()
        #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
        #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
    #except:
        #print "Couldn't create the resource associated with %s" % course
|||
|
|||
def termSearch(field):
    """
    Make a term search (exact match)

    Returns a function that turns a term into a "term" query on
    "sections.<field>".
    """
    def t(term):
        criteria = {"sections." + field: term}
        return Q("term", **criteria)
    return t
|||
|
|||
def search(field):
    """
    Make a match search

    Returns a function that turns a term into a "match" query on ``field``.
    """
    def s(term):
        criteria = {field: term}
        return Q("match", **criteria)
    return s
|||
|
|||
def join(x, y):
    """
    Combine two queries with the ``&`` operator and return the result.
    """
    combined = x & y
    return combined
|||
|
|||
def filterSections(secs):
    """
    Drop the sections whose instructor contains "Staff".
    Tutorial sections almost always have "Staff" as the instructor, so
    this is a heuristic for removing them.

    Returns the remaining sections, or False when none remain.
    """
    kept = []
    for section in secs.sections:
        if "Staff" not in section.prof:
            kept.append(section)
    return kept if kept else False
|||
|
|||
def searchTerms(terms):
    """
    Run a search for courses

    terms -- dict mapping a field name to the text to look for; fields not
             present in ``searchers`` and empty texts are ignored.

    Returns a JSON array (string) with one object per matching course that
    still has a section after filterSections(); at most the first 100 hits
    are considered. "[]" is returned when there is nothing to search for.
    """
    # builtin on Python 2, only available from functools on Python 3
    from functools import reduce

    # A list of all the queries we want to run
    qs = [searchers[field](term)
          for field, term in terms.items()
          if term and field in searchers]

    if not qs:
        # No queries = no results
        return dumps([])

    # Reduce joins all of the queries into one query
    # It will search for the conjunction of all of them
    # So that means it cares about each query equally
    q = reduce(join, qs)

    s = (Search(using=es, index="oersearch")
         .query(q))[0:100]  # only return up to 100 results for now

    results = []
    for obj in s.execute():
        lectures = filterSections(obj)  # get rid of tutorials
        if not lectures:
            continue
        secs = lectures[0].to_dict()
        # Add the truncated course id
        # This is used to point to the resource page for that course
        secs["id"] = truncate(obj.meta.id)
        secs["title"] = obj.title
        if obj["dept"] not in secs["title"]:
            secs["dept"] = obj.dept
        if obj.books:
            # Values are passed to dumps() as they are: forcing them
            # through a strict ASCII encode aborted the whole search on
            # the first non-ASCII title, author or price.
            secs["books"] = [
                {
                    "booktitle" : summarize(book[0]),
                    "bookauthor" : book[1],
                    "bookprice" : book[2]
                }
                for book in obj.books
            ]
        else:
            secs["books"] = ""
        results.append(secs)

    return dumps(results)
|||
|
|||
|
|||
# Maps each searchable field name to the match-query builder for that field.
searchers = {
    field: search(field)
    for field in ("title", "loc", "time", "prof", "day")
}
|||
|
|||
#print searchTerms({"title" : "PHILOS"}) |
|||
|
|||
#for c in imap(classToJSON, allCourses()): |
|||
#try: |
|||
#print indexListing(c) |
|||
#except UnIndexable as e: |
@ -0,0 +1,24 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
class UnIndexable(Exception):
    """
    Raised when a course cannot be indexed.

    course -- the offending course dict (expects the keys "code", "title"
              and "sections"); available as ``self.course``.
    """
    def __init__(self, course):
        super(UnIndexable, self).__init__(course)
        self.course = course

    @property
    def reason(self):
        """
        Human-readable explanation containing the course and a best guess
        at what is missing.
        """
        course = self.course
        # Most specific explanation first. The previous chain of independent
        # ifs let later checks overwrite it and left ``message`` unbound
        # when nothing was missing.
        if not course["code"] and not course["title"]:
            message = "there was no course code and no title defined"
        elif not course["code"]:
            message = "there was no course code defined"
        elif not course["title"]:
            message = "there was no course title defined"
        elif not course["sections"]:
            message = "there were no sections defined"
        else:
            message = "the cause could not be determined"
        return """
There was a problem with indexing this course.
%s
There could be several reasons why, my best guess is that %s
We need at least the course code, title, and one or more sections to index

""" % (course, message)
@ -0,0 +1,97 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from json import loads, load |
|||
from re import sub, split |
|||
from itertools import groupby |
|||
from numpy import mean |
|||
from operator import attrgetter |
|||
|
|||
import pygal |
|||
import csv |
|||
|
|||
class Textbook(object):
    """
    A textbook assigned to a course. ``price`` is converted to float.
    """
    def __init__(self, dept, code, title, author, price):
        self.dept, self.code = dept, code
        self.title, self.author = title, author
        self.price = float(price)

    def __repr__(self):
        details = (self.dept, self.code, self.title, self.author, self.price)
        return "Dept = %s, Code = %s, %s by %s, costs $%s" % details
|||
|
|||
|
|||
def courses():
    """
    Yield every row of ./books.csv as parsed by csv.reader.

    NOTE(review): rows are plain lists, while groupDept() reads a ``dept``
    attribute from each item -- presumably rows were meant to be wrapped in
    Textbook; confirm.
    """
    with open("./books.csv", "r") as handle:
        for record in csv.reader(handle):
            yield record
|||
|
|||
|
|||
def groupDept(courselist):
    """
    Group items by their ``dept`` attribute.

    Yields (dept, list of items) pairs in ascending dept order.
    """
    by_dept = attrgetter("dept")
    ordered = sorted(courselist, key=by_dept)
    for dept, members in groupby(ordered, by_dept):
        yield dept, list(members)
|||
|
|||
def meanPrice(books):
    """
    Return the mean of the ``price`` attribute over ``books``.
    """
    prices = [book.price for book in books]
    return mean(prices)
|||
|
|||
# Questions, |
|||
# mean cost per department |
|||
# mean cost per faculty |
|||
# mean difference between book store copies and other copies per dept and faculty |
|||
# number of overlapping books per faculty, do eng students benefit from that? |
|||
|
|||
# maybe a survey for students to see how often they buy books from other sources |
|||
# correlate with how much they could be saving? |
|||
|
|||
# Display name for each faculty key.
facultyDesc = {
    "hum" : "Humanities",
    "bus" : "Business",
    "hlth" : "Health Science",
    "eng" : "Engineering",
    "sci" : "Science",
    "socsci" : "Social Sciences",
    "artsci" : "Arts & Sciences",
    "meld" : "MELD"
}

# Loaded once at import time; the with-block closes the file, which the
# previous load(open(...)) never did.
# presumably maps a department to one of the faculty keys above -- verify
with open("./faculties.json") as faculties_file:
    faculties = load(faculties_file)
|||
|
|||
def categorize(dept):
    """
    Look ``dept`` up in ``faculties`` and translate the result through
    ``facultyDesc``. Returns False when either lookup misses.
    """
    faculty_key = faculties.get(dept, False)
    return facultyDesc.get(faculty_key, False)
|||
|
|||
def byFaculty():
    """
    Yield (faculty, dept, books) for every department group of courses().
    """
    for dept, books in groupDept(courses()):
        faculty = categorize(dept)
        yield (faculty, dept, books)
|||
|
|||
def meanFacultyCosts():
    """
    Render a bar chart of the mean textbook price per faculty.

    Returns the output of graph.render(transpose=True).
    """
    byfac = sorted(byFaculty(), key=lambda entry: entry[0])
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by faculty"
    for faculty, entries in groupby(byfac, lambda entry: entry[0]):
        # Pool the books of every department in the faculty; previously
        # only the first department of each faculty was averaged.
        books = [book
                 for _, _, dept_books in entries
                 for book in dept_books]
        graph.add(faculty, meanPrice(books))
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render(transpose=True)
|||
|
|||
def meanCosts():
    """
    Build a bar chart of the mean textbook price per department and return
    it rendered as a table (render_table(style=True, transpose=True)).
    """
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by department"
    for dept, books in groupDept(courses()):
        graph.add(dept, meanPrice(books))
    #graph.render_to_file("./test_graph.svg")
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render_table(style=True, transpose=True)
|||
|
|||
# Runs whenever the module is loaded: print every row of the CSV.
for x in courses():
    print(x)
#print meanCosts()
#print meanFacultyCosts()
@ -0,0 +1,148 @@ |
|||
#! /usr/bin/python2 |
|||
from functools import partial |
|||
from couchdb import ResourceConflict |
|||
|
|||
from flask import Flask, render_template, flash, request, send_from_directory |
|||
from flask_bootstrap import Bootstrap |
|||
from flask_appconfig import AppConfig |
|||
from urllib import unquote |
|||
from search import searchTerms |
|||
|
|||
from openlibrary import bookUrls |
|||
|
|||
from archive import searchIA |
|||
from urllib import quote, unquote |
|||
from json import dumps, loads |
|||
|
|||
from werkzeug.contrib.cache import MemcachedCache |
|||
cache = MemcachedCache(['127.0.0.1:11211']) |
|||
|
|||
import os |
|||
|
|||
def predict(fieldtype, term):
    """
    Return the completions for ``term`` as produced by the completer
    registered under ``fieldtype``.

    "[]" is returned for an empty term, when looking up or running the
    completer raises KeyError, and when the completer returns nothing.

    NOTE(review): ``completers`` is not defined or imported in this file as
    shown -- confirm where it comes from.
    """
    print(fieldtype)
    print(term)
    if not term:
        return "[]"
    try:
        cs = completers[fieldtype](term.lower())
    except KeyError:
        return "[]"
    return cs if cs else "[]"
|||
|
|||
def predictor(fieldtype):
    """
    Return a handler that reads the "term" query argument of a request and
    returns predict(fieldtype, term).
    """
    def inner(request):
        term = dict(request.args.items())["term"]
        return predict(fieldtype, term)
    return inner
|||
|
|||
def cacheit(key, thunk):
    """
    Return the value cached under the URL-quoted ``key``.
    When nothing is cached, ``thunk`` (a zero-argument callable returning
    an iterable, e.g. a generator) is evaluated, its items are stored as a
    list under that key, and the list is returned.
    """
    cached = cache.get(quote(key))
    if cached is not None:
        return cached
    result = list(thunk())
    cache.set(quote(key), result)
    return result
|||
|
|||
def ClassSearch(configfile=None):
    """
    Create and configure the Flask application.

    configfile -- passed through to AppConfig.

    Routes: /favicon.ico, the five prediction endpoints (/buildpred,
    /locpred, /daypred, /deptpred, /titlepred), / (search page),
    /fc (course search), /resources (book links) and the static
    /scripts/<filename> and /styles/<filename>.

    Returns the Flask app.
    """
    # Placeholder texts of the search form; a parameter still holding one
    # of them is treated as not filled in.
    defaults = {"Day", "Building", "Exact Location", "Department"}
    app = Flask(__name__)
    AppConfig(app, configfile)  # Flask-Appconfig is not necessary, but
                                # highly recommend =)
                                # https://github.com/mbr/flask-appconfig
    Bootstrap(app)

    app.config["scripts"] = "/home/wes/MGOAL/scripts"
    app.config["styles"] = "/home/wes/MGOAL/styles"

    @app.route('/favicon.ico')
    def favicon():
        # send_from_directory() takes the containing directory first and
        # the file name second; the file path used to be passed as the
        # directory.
        return send_from_directory("/srv/http/goal",
                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')

    # NOTE(review): predictbuild, predictloc, predictday, predictdept and
    # predicttitle are not defined or imported in this file as shown --
    # presumably they are predictor(<fieldtype>) instances; confirm.
    @app.route("/buildpred", methods=("GET", "POST"))
    def buildpred():
        return predictbuild(request)

    @app.route("/locpred", methods=("GET", "POST"))
    def locpred():
        return predictloc(request)

    @app.route("/daypred", methods=("GET", "POST"))
    def daypred():
        return predictday(request)

    @app.route("/deptpred", methods=("GET", "POST"))
    def deptpred():
        return predictdept(request)

    @app.route("/titlepred", methods=("GET", "POST"))
    def titlepred():
        return predicttitle(request)

    @app.route("/", methods=("GET", "POST"))
    def index():
        return render_template("search.html")

    @app.route("/fc", methods=("GET", "POST"))
    def fc():
        """ Filter Courses """
        print("trying to get courses")
        # Build a new dict rather than deleting from the one being
        # iterated, which raised RuntimeError as soon as a default value
        # was present.
        params = {key: val
                  for key, val in request.args.items()
                  if val not in defaults}
        return searchTerms(params)

    @app.route("/resources", methods=("GET", "POST"))
    def resources():
        """ Get Resources """
        params = loads(dict(request.args.items())["data"])
        print(params)
        author = params["author"]
        title = params["title"]

        if ("No Textbooks" in title or
                "No Adoption" in title):
            return dumps("false")

        # Cache the result of the open library search
        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
        print(openlib)

        # cache the result of an internet archive search
        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
        print(iarchive)

        if not (any(openlib) or any(iarchive)):
            # We literally could not find ANYTHING
            return dumps("false")

        return dumps({
            "iarchive" : iarchive,
            "openlib" : openlib
        })

    @app.route("/scripts/<filename>")
    def send_script(filename):
        return send_from_directory(app.config["scripts"], filename)

    @app.route("/styles/<filename>")
    def send_style(filename):
        return send_from_directory(app.config["styles"], filename)
    return app
|||
|
|||
# Start the development server when the file is executed directly.
if __name__ == "__main__":
    application = ClassSearch()
    application.run(port=8001, debug=True)
@ -0,0 +1,24 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
class UnIndexable(Exception):
    """
    Raised when a course cannot be indexed.

    course -- the offending course dict (expects the keys "code", "title"
              and "sections"); available as ``self.course``.
    """
    def __init__(self, course):
        super(UnIndexable, self).__init__(course)
        self.course = course

    @property
    def reason(self):
        """
        Human-readable explanation containing the course and a best guess
        at what is missing.
        """
        course = self.course
        # Most specific explanation first. The previous chain of independent
        # ifs let later checks overwrite it and left ``message`` unbound
        # when nothing was missing.
        if not course["code"] and not course["title"]:
            message = "there was no course code and no title defined"
        elif not course["code"]:
            message = "there was no course code defined"
        elif not course["title"]:
            message = "there was no course title defined"
        elif not course["sections"]:
            message = "there were no sections defined"
        else:
            message = "the cause could not be determined"
        return """
There was a problem with indexing this course.
%s
There could be several reasons why, my best guess is that %s
We need at least the course code, title, and one or more sections to index

""" % (course, message)
@ -0,0 +1,97 @@ |
|||
#! /usr/bin/python2 |
|||
|
|||
from json import loads, load |
|||
from re import sub, split |
|||
from itertools import groupby |
|||
from numpy import mean |
|||
from operator import attrgetter |
|||
|
|||
import pygal |
|||
import csv |
|||
|
|||
class Textbook(object):
    """
    A textbook assigned to a course. ``price`` is converted to float.
    """
    def __init__(self, dept, code, title, author, price):
        self.dept, self.code = dept, code
        self.title, self.author = title, author
        self.price = float(price)

    def __repr__(self):
        details = (self.dept, self.code, self.title, self.author, self.price)
        return "Dept = %s, Code = %s, %s by %s, costs $%s" % details
|||
|
|||
|
|||
def courses():
    """
    Yield every row of ./books.csv as parsed by csv.reader.

    NOTE(review): rows are plain lists, while groupDept() reads a ``dept``
    attribute from each item -- presumably rows were meant to be wrapped in
    Textbook; confirm.
    """
    with open("./books.csv", "r") as handle:
        for record in csv.reader(handle):
            yield record
|||
|
|||
|
|||
def groupDept(courselist):
    """
    Group items by their ``dept`` attribute.

    Yields (dept, list of items) pairs in ascending dept order.
    """
    by_dept = attrgetter("dept")
    ordered = sorted(courselist, key=by_dept)
    for dept, members in groupby(ordered, by_dept):
        yield dept, list(members)
|||
|
|||
def meanPrice(books):
    """
    Return the mean of the ``price`` attribute over ``books``.
    """
    prices = [book.price for book in books]
    return mean(prices)
|||
|
|||
# Questions, |
|||
# mean cost per department |
|||
# mean cost per faculty |
|||
# mean difference between book store copies and other copies per dept and faculty |
|||
# number of overlapping books per faculty, do eng students benefit from that? |
|||
|
|||
# maybe a survey for students to see how often they buy books from other sources |
|||
# correlate with how much they could be saving? |
|||
|
|||
# Display name for each faculty key.
facultyDesc = {
    "hum" : "Humanities",
    "bus" : "Business",
    "hlth" : "Health Science",
    "eng" : "Engineering",
    "sci" : "Science",
    "socsci" : "Social Sciences",
    "artsci" : "Arts & Sciences",
    "meld" : "MELD"
}

# Loaded once at import time; the with-block closes the file, which the
# previous load(open(...)) never did.
# presumably maps a department to one of the faculty keys above -- verify
with open("./faculties.json") as faculties_file:
    faculties = load(faculties_file)
|||
|
|||
def categorize(dept):
    """
    Look ``dept`` up in ``faculties`` and translate the result through
    ``facultyDesc``. Returns False when either lookup misses.
    """
    faculty_key = faculties.get(dept, False)
    return facultyDesc.get(faculty_key, False)
|||
|
|||
def byFaculty():
    """
    Yield (faculty, dept, books) for every department group of courses().
    """
    for dept, books in groupDept(courses()):
        faculty = categorize(dept)
        yield (faculty, dept, books)
|||
|
|||
def meanFacultyCosts():
    """
    Render a bar chart of the mean textbook price per faculty.

    Returns the output of graph.render(transpose=True).
    """
    byfac = sorted(byFaculty(), key=lambda entry: entry[0])
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by faculty"
    for faculty, entries in groupby(byfac, lambda entry: entry[0]):
        # Pool the books of every department in the faculty; previously
        # only the first department of each faculty was averaged.
        books = [book
                 for _, _, dept_books in entries
                 for book in dept_books]
        graph.add(faculty, meanPrice(books))
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render(transpose=True)
|||
|
|||
def meanCosts():
    """
    Build a bar chart of the mean textbook price per department and return
    it rendered as a table (render_table(style=True, transpose=True)).
    """
    graph = pygal.Bar()
    graph.title = "Mean textbook cost by department"
    for dept, books in groupDept(courses()):
        graph.add(dept, meanPrice(books))
    #graph.render_to_file("./test_graph.svg")
    graph.value_formatter = lambda x: '$%.2f' % x if x is not None else "None"
    return graph.render_table(style=True, transpose=True)
|||
|
|||
# Runs whenever the module is loaded: print every row of the CSV.
for x in courses():
    print(x)
#print meanCosts()
#print meanFacultyCosts()
@ -0,0 +1,148 @@ |
|||
#! /usr/bin/python2 |
|||
from functools import partial |
|||
from couchdb import ResourceConflict |
|||
|
|||
from flask import Flask, render_template, flash, request, send_from_directory |
|||
from flask_bootstrap import Bootstrap |
|||
from flask_appconfig import AppConfig |
|||
from urllib import unquote |
|||
from search import searchTerms |
|||
|
|||
from openlibrary import bookUrls |
|||
|
|||
from archive import searchIA |
|||
from urllib import quote, unquote |
|||
from json import dumps, loads |
|||
|
|||
from werkzeug.contrib.cache import MemcachedCache |
|||
cache = MemcachedCache(['127.0.0.1:11211']) |
|||
|
|||
import os |
|||
|
|||
def predict(fieldtype, term):
    """
    Return the completions for ``term`` as produced by the completer
    registered under ``fieldtype``.

    "[]" is returned for an empty term, when looking up or running the
    completer raises KeyError, and when the completer returns nothing.

    NOTE(review): ``completers`` is not defined or imported in this file as
    shown -- confirm where it comes from.
    """
    print(fieldtype)
    print(term)
    if not term:
        return "[]"
    try:
        cs = completers[fieldtype](term.lower())
    except KeyError:
        return "[]"
    return cs if cs else "[]"
|||
|
|||
def predictor(fieldtype):
    """
    Return a handler that reads the "term" query argument of a request and
    returns predict(fieldtype, term).
    """
    def inner(request):
        term = dict(request.args.items())["term"]
        return predict(fieldtype, term)
    return inner
|||
|
|||
def cacheit(key, thunk):
    """
    Return the value cached under the URL-quoted ``key``.
    When nothing is cached, ``thunk`` (a zero-argument callable returning
    an iterable, e.g. a generator) is evaluated, its items are stored as a
    list under that key, and the list is returned.
    """
    cached = cache.get(quote(key))
    if cached is not None:
        return cached
    result = list(thunk())
    cache.set(quote(key), result)
    return result
|||
|
|||
def ClassSearch(configfile=None):
    """
    Create and configure the Flask application.

    configfile -- passed through to AppConfig.

    Routes: /favicon.ico, the five prediction endpoints (/buildpred,
    /locpred, /daypred, /deptpred, /titlepred), / (search page),
    /fc (course search), /resources (book links) and the static
    /scripts/<filename> and /styles/<filename>.

    Returns the Flask app.
    """
    # Placeholder texts of the search form; a parameter still holding one
    # of them is treated as not filled in.
    defaults = {"Day", "Building", "Exact Location", "Department"}
    app = Flask(__name__)
    AppConfig(app, configfile)  # Flask-Appconfig is not necessary, but
                                # highly recommend =)
                                # https://github.com/mbr/flask-appconfig
    Bootstrap(app)

    app.config["scripts"] = "/home/wes/MGOAL/scripts"
    app.config["styles"] = "/home/wes/MGOAL/styles"

    @app.route('/favicon.ico')
    def favicon():
        # send_from_directory() takes the containing directory first and
        # the file name second; the file path used to be passed as the
        # directory.
        return send_from_directory("/srv/http/goal",
                                   'favicon.ico', mimetype='image/vnd.microsoft.icon')

    # NOTE(review): predictbuild, predictloc, predictday, predictdept and
    # predicttitle are not defined or imported in this file as shown --
    # presumably they are predictor(<fieldtype>) instances; confirm.
    @app.route("/buildpred", methods=("GET", "POST"))
    def buildpred():
        return predictbuild(request)

    @app.route("/locpred", methods=("GET", "POST"))
    def locpred():
        return predictloc(request)

    @app.route("/daypred", methods=("GET", "POST"))
    def daypred():
        return predictday(request)

    @app.route("/deptpred", methods=("GET", "POST"))
    def deptpred():
        return predictdept(request)

    @app.route("/titlepred", methods=("GET", "POST"))
    def titlepred():
        return predicttitle(request)

    @app.route("/", methods=("GET", "POST"))
    def index():
        return render_template("search.html")

    @app.route("/fc", methods=("GET", "POST"))
    def fc():
        """ Filter Courses """
        print("trying to get courses")
        # Build a new dict rather than deleting from the one being
        # iterated, which raised RuntimeError as soon as a default value
        # was present.
        params = {key: val
                  for key, val in request.args.items()
                  if val not in defaults}
        return searchTerms(params)

    @app.route("/resources", methods=("GET", "POST"))
    def resources():
        """ Get Resources """
        params = loads(dict(request.args.items())["data"])
        print(params)
        author = params["author"]
        title = params["title"]

        if ("No Textbooks" in title or
                "No Adoption" in title):
            return dumps("false")

        # Cache the result of the open library search
        openlib = cacheit("openlib"+title+author, lambda : bookUrls(title, author))
        print(openlib)

        # cache the result of an internet archive search
        iarchive = cacheit("iarchive"+title+author, lambda : searchIA(title, author))
        print(iarchive)

        if not (any(openlib) or any(iarchive)):
            # We literally could not find ANYTHING
            return dumps("false")

        return dumps({
            "iarchive" : iarchive,
            "openlib" : openlib
        })

    @app.route("/scripts/<filename>")
    def send_script(filename):
        return send_from_directory(app.config["scripts"], filename)

    @app.route("/styles/<filename>")
    def send_style(filename):
        return send_from_directory(app.config["styles"], filename)
    return app
|||
|
|||
# Start the development server when the file is executed directly.
if __name__ == "__main__":
    application = ClassSearch()
    application.run(port=8001, debug=True)
Loading…
Reference in new issue