You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
238 lines
6.3 KiB
238 lines
6.3 KiB
#! /usr/bin/python2
|
|
|
|
import elasticsearch
|
|
|
|
from elasticsearch_dsl import FacetedSearch, Search, Q
|
|
from elasticsearch_dsl.aggs import Terms, DateHistogram
|
|
from sys import exit, stderr
|
|
from json import dumps, loads
|
|
from itertools import chain, imap
|
|
|
|
from hashlib import sha1
|
|
|
|
from textbookExceptions import UnIndexable
|
|
|
|
from mcmaster.classes import allCourses
|
|
|
|
# Generic instance of elasticsearch right now
|
|
# Module-level client used by createIndex, indexListing and searchTerms.
# It is constructed with no arguments, so whatever connection defaults the
# elasticsearch library applies are in effect.
es = elasticsearch.Elasticsearch()
|
|
|
|
def summarize(text):
    """
    Shorten text to its first four space-separated words

    Text with more than four words is cut after the fourth word and
    suffixed with ".."; anything shorter is returned unchanged
    """
    words = text.split(" ")
    if len(words) <= 4:
        return text
    return " ".join(words[:4]) + ".."
|
|
|
|
def sectionToJSON(section):
    """
    Turn a section object into a plain dict

    Copies the prof, sem and day attributes of the section into a
    dict keyed by the same names
    """
    return dict(
        (attr, getattr(section, attr))
        for attr in ("prof", "sem", "day")
    )
|
|
|
|
def classToJSON(clss):
    """
    Turn a course object into a plain dict

    The result carries the title, dept and code of the course, its
    sections converted with sectionToJSON, and its books as a list
    (an empty list when the course has no books)
    """
    doc = {}
    doc["title"] = clss.title
    doc["sections"] = [sectionToJSON(sec) for sec in clss.sections]
    doc["dept"] = clss.dept
    doc["code"] = clss.code
    doc["books"] = list(clss.books) if clss.books else []
    return doc
|
|
|
|
|
|
def truncate(docid):
    """
    Keep only the first 12 characters of a document id

    The id is converted to its string form, cut to 12 characters
    and converted back to an integer. The document ID should be
    based on a hash of unique identifiers
    """
    digits = str(docid)
    return int(digits[:12])
|
|
|
|
def hashsec(course):
    """
    Hash a course into a usable id

    The id is the SHA-1 digest, read as an integer, of the course
    code, the course title and the semester ("sem") of the first
    section, concatenated in that order. A course without sections
    is hashed with an empty semester.

    course -- dict with "code" and "title" keys and an optional
              "sections" list of section dicts
    Returns the digest as an integer
    Raises UnIndexable when both the code and the title are empty
    """
    code = course["code"] or ""
    title = course["title"] or ""

    # Reject unidentifiable courses before touching the sections
    if not (code or title):
        raise UnIndexable(course)

    # Never write a placeholder back into the caller's dict: a missing
    # or empty section list simply contributes an empty semester
    sections = course.get("sections") or []
    sem = ""
    if sections:
        sem = sections[0]["sem"] or ""

    payload = code + title + sem
    # hashlib works on bytes; byte strings pass through untouched and
    # text is encoded explicitly (identical bytes for ASCII input)
    if not isinstance(payload, bytes):
        payload = payload.encode("utf-8")

    h = sha1()
    h.update(payload)
    return int(h.hexdigest(), 16)
|
|
|
|
def createIndex(name):
    """
    Create an elasticsearch index and install the course mapping

    An index is like a schema in a regular database. After the index
    called name is created, the mapping for the "course" document
    type is read from ../course.json and put on that index. The
    response of each call is printed.
    """
    client = elasticsearch.client.IndicesClient(es)
    print(client.create(name))

    with open("../course.json", "r") as handle:
        schema = loads(handle.read())
        print(client.put_mapping("course", schema, name))
|
|
|
|
def indexListing(course):
    """
    Store one course document in the "oersearch" index

    The document id comes from hashsec(course), the document type is
    "course" and the course dict itself is the body. The response of
    the index call is printed.

    A course looks like,
    {
        'books': [],
        'dept': 'COLLAB',
        'code': '2C03',
        'sections': [
            {
                'prof': 'Lisa Pender',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Mo'
            },
            {
                'prof': 'Staff',
                'sem': '2015/09/08 - 2015/12/08',
                'day': 'Th'
            }
        ],
        'title': 'COLLAB 2C03 - Sociology I'
    }
    """
    docId = hashsec(course)
    response = es.index(
        index="oersearch",
        doc_type="course",
        id=docId,
        body=course,
    )
    print(response)
|
|
|
|
# For every course we index, we also create a resource for it
|
|
# This should be an idempotent operation because we're putting it in couchdb
|
|
# And we're using the id obtained from the hash function, so it should just update the document
|
|
# no need to delete anything
|
|
#try:
|
|
#courseDept = course[0]["title"].strip().split(" ")[0].strip()
|
|
#courseCode = course[0]["title"].strip().split(" ")[1].strip()
|
|
#print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)
|
|
#print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)
|
|
#except:
|
|
#print "Couldn't create the resource associated with %s" % course
|
|
|
|
def termSearch(field):
    """
    Build a term (exact match) query factory for a section field

    The returned function takes a term and produces a "term" query
    against "sections.<field>"
    """
    def exact(term):
        criteria = {"sections." + field: term}
        return Q("term", **criteria)

    return exact
|
|
|
|
def search(field):
    """
    Build a match query factory for a field

    The returned function takes a term and produces a "match" query
    of that term against the given field
    """
    def matcher(term):
        criteria = {field: term}
        return Q("match", **criteria)

    return matcher
|
|
|
|
def join(x, y):
    """
    Combine two queries with the & operator

    Used as the folding step when several queries are reduced
    into a single one
    """
    both = x & y
    return both
|
|
|
|
def filterSections(secs):
    """
    Drop the sections that look like tutorials

    Tutorials almost always list "Staff" as the instructor, so any
    section whose prof contains "Staff" is removed. This is only a
    heuristic.

    Returns the remaining sections as a list, or False when none
    are left
    """
    kept = []
    for section in secs.sections:
        if "Staff" in section.prof:
            continue
        kept.append(section)
    return kept or False
|
|
|
|
def _asciiText(value):
    """
    Encode text as ASCII, putting "?" in place of anything unencodable
    """
    return value.encode("ASCII", "replace")


def searchTerms(terms):
    """
    Run a search for courses

    terms -- dict mapping a search field to the text to look for;
             empty values and fields without an entry in searchers
             are ignored

    Returns a JSON string holding a list with one entry per matching
    course (at most 100 hits are requested). Each entry is the first
    non-tutorial section of the course plus the truncated course id,
    the title, the dept (only when the title does not already contain
    it) and the books ("" when the course has none). Courses left
    without sections by filterSections are skipped. With no usable
    terms the result is "[]".
    """
    # One query per recognised, non-empty search field
    qs = [searchers[field](term)
          for field, term in terms.items()
          if term and field in searchers]

    if not qs:
        # No queries = no results
        return dumps([])

    # Reduce joins all of the queries into one conjunction,
    # so every query counts equally
    q = reduce(join, qs)

    # only return up to 100 results for now
    s = Search(using=es, index="oersearch").query(q)[0:100]

    results = []
    for hit in s.execute():
        # get rid of tutorials; filter once and reuse the outcome
        lectures = filterSections(hit)
        if not lectures:
            continue

        entry = lectures[0].to_dict()
        # The truncated course id points to the resource page
        # for that course
        entry["id"] = truncate(hit.meta.id)
        entry["title"] = hit.title
        if hit["dept"] not in entry["title"]:
            entry["dept"] = hit.dept

        if hit.books:
            # A non-ASCII character in a book must not abort the
            # whole search, so encoding substitutes instead of raising
            entry["books"] = [
                {
                    "booktitle": summarize(_asciiText(book[0])),
                    "bookauthor": _asciiText(book[1]),
                    "bookprice": _asciiText(book[2]),
                }
                for book in hit.books
            ]
        else:
            entry["books"] = ""
        results.append(entry)

    return dumps(results)
|
|
|
|
|
|
# Query factories keyed by the search fields that searchTerms accepts;
# every field gets a match query against the field of the same name.
# NOTE(review): classToJSON nests prof and day inside "sections", while
# these queries target the bare field names -- confirm against the mapping.
searchers = dict(
    (field, search(field))
    for field in ("title", "loc", "time", "prof", "day")
)
|
|
|
|
#print searchTerms({"title" : "PHILOS"})
|
|
#createIndex("oersearch")
|
|
#for c in imap(classToJSON, allCourses()):
|
|
#try:
|
|
#print indexListing(c)
|
|
#except UnIndexable as e:
|
|
##print e
|
|
|