Find Cheaper University Textbooks
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

229 lines
6.1 KiB

#! /usr/bin/python2
import elasticsearch
from elasticsearch_dsl import FacetedSearch, Search, Q
from elasticsearch_dsl.aggs import Terms, DateHistogram
from sys import exit, stderr
from json import dumps, loads
from itertools import chain, imap
from hashlib import sha1
from textbookExceptions import UnIndexable
# Module-level Elasticsearch client shared by the functions in this file.
# It is constructed with no arguments, so it relies on the client
# library's default connection settings.
es = elasticsearch.Elasticsearch()
def summarize(text, max_words=4):
    """
    Shorten a piece of text to its first few words

    text      -- the string to shorten; it is split on single spaces
    max_words -- how many leading words to keep (defaults to 4)

    Returns the text unchanged when it has at most max_words words,
    otherwise the first max_words words followed by ".."
    """
    words = text.split(" ")
    if len(words) > max_words:
        return " ".join(words[:max_words]) + ".."
    return text
def sectionToJSON(section):
    """
    Turn a section object into a plain dict
    The dict carries the section's "prof", "sem" and "day" attributes
    """
    wanted = ("prof", "sem", "day")
    return dict((attr, getattr(section, attr)) for attr in wanted)
def classToJSON(clss):
    """
    Turn a class (course) object into a plain dict
    Every section is converted with sectionToJSON, and a missing or
    empty book collection becomes an empty list
    """
    title = clss.title
    sections = [sectionToJSON(sec) for sec in clss.sections]
    dept = clss.dept
    code = clss.code
    books = []
    if clss.books:
        books = list(clss.books)
    return {
        "title": title,
        "sections": sections,
        "dept": dept,
        "code": code,
        "books": books,
    }
def truncate(docid):
    """
    Cut a document id down to the first 12 characters of its
    string form and return the result as an integer
    The document ID should be based on a
    hash of unique identifiers
    """
    leading = str(docid)[:12]
    return int(leading)
def hashsec(course):
    """
    Hash a course into a usable id

    course is a dict shaped like the output of classToJSON; only its
    "code", "title" and "sections" entries are read, and the dict is
    never modified.

    The id is the SHA-1 digest, as an integer, of the course code, the
    title and the semester of the first section. Missing values count
    as empty strings, and a course without sections is hashed with an
    empty semester.

    Raises UnIndexable when the course has neither a code nor a title.
    """
    code = course["code"] or ""
    title = course["title"] or ""
    if not (code or title):
        raise UnIndexable(course)
    sections = course["sections"]
    # A course may have no sections at all; fall back to an empty
    # semester instead of indexing into an empty (or missing) list
    sem = sections[0]["sem"] if sections else ""
    key = code + title + (sem or "")
    if not isinstance(key, bytes):
        # sha1 works on bytes, so unicode text is hashed as UTF-8
        # (for pure ASCII text this gives the same bytes as before)
        key = key.encode("utf-8")
    h = sha1()
    h.update(key)
    return int(h.hexdigest(), 16)
def createIndex(name):
    """
    Create a new elasticsearch index called `name`
    An index is like a schema in a regular database
    After the index is created, the "course" mapping stored in
    ../course.json is installed on it
    The response of each request is printed
    """
    client = elasticsearch.client.IndicesClient(es)
    created = client.create(name)
    print(created)
    with open("../course.json", "r") as mappingFile:
        courseMapping = loads(mappingFile.read())
        print(client.put_mapping("course", courseMapping, name))
def indexListing(course):
    """
    Index a single course in elasticsearch
    The course is converted with classToJSON, given an id by hashsec
    and stored in the "oersearch" index under the "course" doc type
    The response of the index request is printed
    A converted course looks like this,
    {
        'books': [],
        'dept': 'COLLAB',
        'code': '2C03',
        'sections': [
            {
                'prof': 'Lisa Pender',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Mo'
            },
            {
                'prof': 'Staff',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Th'
            }
        ],
        'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    document = classToJSON(course)
    docID = hashsec(document)
    response = es.index(index="oersearch",
                        doc_type="course",
                        id=docID,
                        body=document)
    print(response)
    # Currently disabled: creating a resource for every indexed course.
    # The idea was that this stays idempotent, because the resource is
    # stored in couchdb under the id obtained from the hash function, so
    # it would simply update the existing document with nothing to delete.
    #try:
    #courseDept = course[0]["title"].strip().split(" ")[0].strip()
    #courseCode = course[0]["title"].strip().split(" ")[1].strip()
    #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
    #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
    #except:
    #print "Couldn't create the resource associated with %s" % course
def termSearch(field):
    """
    Build a term (exact match) searcher for a field of the sections
    Returns a function that turns a term into a "term" query
    on "sections.<field>"
    """
    def makeQuery(term):
        criteria = {"sections."+field: term}
        return Q("term", **criteria)
    return makeQuery
def search(field):
    """
    Build a match searcher for a field
    Returns a function that turns a term into a "match" query
    on that field
    """
    def makeQuery(term):
        criteria = {field: term}
        return Q("match", **criteria)
    return makeQuery
def join(x, y):
    """
    Combine two queries into a single one
    using the & operator
    """
    combined = x & y
    return combined
def filterSections(secs):
    """
    Drop the sections whose instructor contains "Staff"
    Those are almost always tutorial sections,
    though this is only a heuristic
    Returns the remaining sections as a list,
    or False when none are left
    """
    kept = [section for section in secs.sections
            if "Staff" not in section.prof]
    return kept or False
def searchTerms(terms):
    """
    Run a search for courses

    terms maps field names to search terms. Empty terms and fields
    without an entry in `searchers` are ignored.

    Returns a JSON string holding a list with one object per matching
    course (at most 100). Each object is built from the course's first
    non-"Staff" section, plus "id", "title", "books" and, when the
    title does not already contain it, "dept". Courses that only have
    "Staff" sections are left out. Returns "[]" when there is nothing
    to search for.
    """
    # A list of all the queries we want to run
    qs = [searchers[field](term)
          for field, term in terms.items()
          if term and field in searchers]
    if not qs:
        # No queries = no results
        return dumps([])
    # Reduce joins all of the queries into one query
    # It will search for the conjunction of all of them
    # So that means it cares about each query equally
    q = reduce(join, qs)
    s = (Search(using=es, index="oersearch")
         .query(q))[0:100]  # only return up to 100 results for now
    results = []
    for course in s.execute():
        # Filter once per course (get rid of tutorials)
        sections = filterSections(course)
        if not sections:
            continue
        entry = sections[0].to_dict()
        # Add the truncated course id
        # This is used to point to the resource page for that course
        entry["id"] = truncate(course.meta.id)
        entry["title"] = course.title
        if course["dept"] not in entry["title"]:
            entry["dept"] = course.dept
        if course.books:
            # The text is handed to dumps as-is: forcing it through
            # .encode("ASCII") raised on any non-ASCII character, and
            # dumps already escapes those itself
            entry["books"] = [
                {
                    "booktitle": summarize(book[0]),
                    "bookauthor": book[1],
                    "bookprice": book[2],
                }
                for book in course.books
            ]
        else:
            # Kept as an empty string (not a list) so existing consumers
            # of this JSON keep seeing the same value
            entry["books"] = ""
        results.append(entry)
    return dumps(results)
# Match-query builders keyed by the field names searchTerms accepts
searchers = dict(
    (field, search(field))
    for field in ("title", "loc", "time", "prof", "day")
)