#! /usr/bin/python2
import elasticsearch
from elasticsearch_dsl import FacetedSearch, Search, Q
from elasticsearch_dsl.aggs import Terms, DateHistogram
from sys import exit, stderr
from json import dumps, loads
from itertools import chain, imap
from hashlib import sha1
from textbookExceptions import UnIndexable
from mcmaster.classes import allCourses
# Module-wide Elasticsearch client, constructed with no arguments.
# The index-creation, indexing and search functions below all go through it.
es = elasticsearch.Elasticsearch()
def summarize(text):
    """
    Shorten a piece of text to its first four space-separated words

    Text with more than four words is cut after the fourth word and
    suffixed with ".."; anything shorter is returned unchanged.
    """
    words = text.split(" ")
    if len(words) > 4:
        return " ".join(words[:4]) + ".."
    return text
def sectionToJSON(section):
    """
    Convert a course section object into a plain dict

    The result has the keys "prof", "sem" and "day", each read from the
    attribute of the same name on the section.
    """
    # NOTE(review): the "prof" and "day" values were missing from the source;
    # section.prof / section.day are inferred from the surviving
    # `"sem" : section.sem` entry -- confirm against the section class.
    return {
        "prof": section.prof,
        "sem": section.sem,
        "day": section.day,
    }
def classToJSON(clss):
    """
    Convert a course object into a plain dict that can be indexed

    The result has the keys "title", "sections", "dept", "code" and "books".
    Sections are converted with sectionToJSON; "books" is always a list and
    is empty when the course has no books.
    """
    return {
        "title": clss.title,
        # A list comprehension (rather than map) always yields a real list,
        # which is what JSON serialization needs.
        "sections": [sectionToJSON(section) for section in clss.sections],
        "dept": clss.dept,
        "code": clss.code,
        "books": list(clss.books) if clss.books else [],
    }
def truncate(docid):
    """
    Truncate a document id to 12 digits

    The document ID should be based on a
    hash of unique identifiers

    Returns the first 12 characters of str(docid) converted to an int.
    """
    return int(str(docid)[:12])
def hashsec(course):
    """
    Hash a course into a usable id

    The id is the SHA-1 digest, read as an integer, of the course code, the
    course title and the "sem" value of the first section, concatenated in
    that order. Empty or None values contribute an empty string. The course
    dict itself is never modified.

    Raises UnIndexable when the course has neither a code nor a title.
    """
    code = course["code"] or ""
    title = course["title"] or ""
    if not (code or title):
        raise UnIndexable(course)

    # The previous fallback assigned into course["sections"][0], which raises
    # on an empty list and then indexes a string with "sem". A course without
    # sections now simply contributes an empty semester.
    sections = course["sections"]
    sem = (sections[0]["sem"] or "") if sections else ""

    payload = code + title + sem
    if not isinstance(payload, bytes):
        # Hash a well-defined byte sequence; byte strings pass through as-is.
        payload = payload.encode("utf-8")
    return int(sha1(payload).hexdigest(), 16)
def createIndex(name):
    """
    Create an elasticsearch index

    An index is like a schema in a regular database. After creating the
    index called `name`, the mapping stored in ./course.json is applied to
    it under the "course" document type. The response of each call is
    printed.
    """
    indices = elasticsearch.client.IndicesClient(es)
    print(indices.create(name))
    with open("./course.json", "r") as mapping:
        # NOTE(review): the argument of loads() was missing from the source;
        # reading the opened mapping file is the evident intent -- confirm.
        print(indices.put_mapping("course", loads(mapping.read()), name))
def indexListing(course):
    """
    Index a specific course in the database (using the "oersearch" index)

    The document id is the hash computed by hashsec, and the response of the
    index call is printed.

    Example course:
    {
        'books': [],
        'dept': 'COLLAB',
        'code': '2C03',
        'sections': [
            {
                'prof': 'Lisa Pender',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Mo'
            },
            {
                'prof': 'Staff',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Th'
            }
        ],
        'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    courseID = hashsec(course)
    # NOTE(review): everything after index="oersearch" was missing from the
    # source. doc_type="course" mirrors the mapping name used in createIndex,
    # id is the hash computed above, body is the course itself -- confirm.
    print(es.index(index="oersearch",
                   doc_type="course",
                   id=courseID,
                   body=course))
    # For every course we index, we also create a resource for it
    # This should be an idempotent operation because we're putting it in couchdb
    # And we're using the id obtained from the hash function, so it should just update the document
    # no need to delete anything
    #courseDept = course[0]["title"].strip().split(" ")[0].strip()
    #courseCode = course[0]["title"].strip().split(" ")[1].strip()
    #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
    #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
    #print "Couldn't create the resource associated with %s" % course
def termSearch(field):
    """
    Make a term search (exact match)

    Returns a function that takes a term and builds a "term" query on the
    "sections.<field>" path.
    """
    def t(term):
        # The field name is computed, so it is passed as an expanded dict
        # rather than as a literal keyword argument.
        return Q("term", **{"sections." + field: term})
    return t
def search(field):
    """
    Make a match search

    Returns a function that takes a term and builds a "match" query on
    `field`.
    """
    # NOTE(review): unlike termSearch, the field is not prefixed with
    # "sections." here -- confirm that section-level fields such as "prof"
    # and "day" are reachable under their bare names in the mapping.
    def s(term):
        # The field name is computed, so it is passed as an expanded dict
        # rather than as a literal keyword argument.
        return Q("match", **{field: term})
    return s
def join(x, y):
    """
    Join two queries

    Combines x and y with the & operator and returns the result.
    """
    return x & y
def filterSections(secs):
    """
    Get rid of tutorial sections
    because they almost always have "Staff" as the instructor
    This is just a heuristic of course

    Returns the list of sections of `secs` whose instructor does not contain
    "Staff", or False when no section is left.
    """
    # NOTE(review): the right-hand side of `"Staff" not in` was missing from
    # the source; s.prof is inferred from the docstring, which names "Staff"
    # as the instructor -- confirm.
    filtered = [s for s in secs.sections if "Staff" not in s.prof]
    return filtered or False
def searchTerms(terms):
    """
    Run a search for courses

    `terms` maps field names to search terms. Fields that have no entry in
    `searchers`, and fields whose term is empty, are ignored. The remaining
    queries are combined with `join`, so a course has to satisfy all of them.

    Returns a JSON string: a list with one object per matching course, built
    from the first section kept by filterSections plus "id", "title", "books"
    and, when the department is not already part of the title, "dept". The
    list is empty when there is nothing to search for.
    """
    # reduce is a builtin on Python 2 only; functools has it on 2 and 3.
    from functools import reduce

    # A list of all the queries we want to run
    qs = [searchers[field](term)
          for field, term in terms.items()
          if term and field in searchers]

    if not qs:
        # No queries = no results
        return dumps([])

    # Reduce joins all of the queries into one query
    # It will search for the conjunction of all of them
    # So that means it cares about each query equally
    q = reduce(join, qs)

    # only return up to 100 results for now
    s = Search(using=es, index="oersearch").query(q)[0:100]

    results = []
    for obj in s.execute():
        # get rid of tutorials; courses with nothing left are skipped
        sections = filterSections(obj)
        if not sections:
            continue
        secs = sections[0].to_dict()

        # Add the truncated course id
        # This is used to point to the resource page for that course
        # NOTE(review): the argument of truncate() was missing from the
        # source; obj.meta.id (the hit's document id) is inferred -- confirm.
        secs["id"] = truncate(obj.meta.id)
        secs["title"] = obj.title
        if obj["dept"] not in secs["title"]:
            secs["dept"] = obj.dept

        if obj.books:
            # dumps() escapes non-ASCII text by itself, so the book fields are
            # passed through as they are; encoding them to ASCII first raised
            # on any non-ASCII title, author or price.
            secs["books"] = [
                {
                    "booktitle": summarize(book[0]),
                    "bookauthor": book[1],
                    "bookprice": book[2],
                }
                for book in obj.books
            ]
        else:
            secs["books"] = ""

        results.append(secs)

    return dumps(results)
# Maps each searchable field name to the function that builds its query.
# searchTerms ignores any field that has no entry here.
searchers = {
    "title": search("title"),
    "loc": search("loc"),
    "time": search("time"),
    "prof": search("prof"),
    "day": search("day"),
}
#print searchTerms({"title" : "PHILOS"})
#for c in imap(classToJSON, allCourses()):
#print indexListing(c)
#except UnIndexable as e: