TextbookEngine/src/search.py


								#! /usr/bin/python2


								import elasticsearch


								from elasticsearch_dsl import FacetedSearch, Search, Q

								from elasticsearch_dsl.aggs import Terms, DateHistogram

								from sys import exit, stderr

								from json import dumps, loads

								from itertools import chain, imap


								from hashlib import sha1


								from textbookExceptions import UnIndexable


								# Module-wide elasticsearch client, shared by createIndex, indexListing and searchTerms
								# It is constructed without arguments, so the client library's default connection settings apply
								es = elasticsearch.Elasticsearch()


								def summarize(text):
								    """
								    Shorten a text to its first four space-separated words
								    A text with more than four words gets ".." appended to the
								    shortened version, anything else is returned untouched
								    """
								    words = text.split(" ")
								    if len(words) <= 4:
								        return text
								    return " ".join(words[:4]) + ".."


								def sectionToJSON(section):
								    """
								    Turn a section object into a plain dictionary
								    Only the prof, sem and day attributes are copied over
								    """
								    wanted = ("prof", "sem", "day")
								    return dict((name, getattr(section, name)) for name in wanted)


								def classToJSON(clss):
								    """
								    Turn a course object into a plain dictionary
								    Every section goes through sectionToJSON, and a missing or
								    empty book collection is stored as an empty list
								    """
								    course = {"title": clss.title}
								    course["sections"] = map(sectionToJSON, clss.sections)
								    course["dept"] = clss.dept
								    course["code"] = clss.code
								    if clss.books:
								        course["books"] = list(clss.books)
								    else:
								        course["books"] = []
								    return course


								def truncate(docid):
								    """
								    Cut a document id down to its first 12 characters
								    and give the result back as an integer
								    The document ID should be based on a
								    hash of unique identifiers
								    """
								    digits = str(docid)
								    return int(digits[:12])


								def hashsec(course):
								    """
								    Hash a course into a usable id

								    The id is the SHA-1 digest, read as an integer, of the course code,
								    the title and the "sem" value of the first section (in that order)

								    course   -- dictionary with "code", "title" and "sections" entries;
								                "sections" is a list of dictionaries that carry a "sem" key
								    returns  -- the digest as an integer
								    raises   -- UnIndexable when the course has neither a code nor a title
								    """
								    code = course.get("code") or ""
								    title = course.get("title") or ""

								    if not (code or title):
								        raise UnIndexable(course)

								    # A course without sections contributes an empty semester instead of
								    # failing on sections[0]; the caller's dictionary is never modified
								    sections = course.get("sections") or []
								    sem = ""
								    if sections:
								        sem = sections[0]["sem"] or ""

								    key = code + title + sem
								    # sha1 consumes bytes: encode text explicitly so that non-ASCII
								    # titles hash instead of raising (ASCII input hashes as before)
								    if not isinstance(key, bytes):
								        key = key.encode("utf-8")
								    return int(sha1(key).hexdigest(), 16)


								def createIndex(name):
								    """
								    Create a new elasticsearch index called `name`
								    (an index is like a schema in a regular database)
								    and install the "course" mapping on it

								    The mapping is read from ../course.json, a path that is resolved
								    against the current working directory
								    The responses of both elasticsearch calls are printed
								    """
								    client = elasticsearch.client.IndicesClient(es)

								    created = client.create(name)
								    print(created)

								    with open("../course.json", "r") as handle:
								        course_mapping = loads(handle.read())
								        print(client.put_mapping("course", course_mapping, name))


								def indexListing(course):
								    """
								    Index a specific course in the "oersearch" index

								    The course object is first converted with classToJSON, which
								    gives a document such as
								    {
								     'books': [],
								     'dept': 'COLLAB',
								     'code': '2C03',
								     'sections': [
								                    {
								                     'prof': 'Lisa Pender',
								                     'sem': '2015/09/08 - 2015/12/08',
								                     'day': 'Mo'
								                     },
								                     {
								                      'prof': 'Staff',
								                      'sem': '2015/09/08 - 2015/12/08',
								                      'day': 'Th'
								                      }
								                  ],
								     'title': 'COLLAB 2C03 - Sociology I'
								     }

								    The document id is the value hashsec computes for that document
								    The response of the index call is printed
								    """
								    document = classToJSON(course)
								    document_id = hashsec(document)
								    response = es.index(index="oersearch",
								                        doc_type="course",
								                        id=document_id,
								                        body=document)
								    print(response)


								    # For every course we index, we also create a resource for it

								    # This should be an idempotent operation because we're putting it in couchdb

								    # And we're using the id obtained from the hash function, so it should just update the document

								    # no need to delete anything

								    #try:

								        #courseDept = course[0]["title"].strip().split(" ")[0].strip()

								        #courseCode = course[0]["title"].strip().split(" ")[1].strip()

								        #print "DEPARTMENT = \"%s\", COURSECODE = \"%s\"" % (courseDept, courseCode)

								        #print createResource(textbookInfo, course[0], courseDept, courseCode, courseID)

								    #except:

								        #print "Couldn't create the resource associated with %s" % course


								def termSearch(field):
								    """
								    Build a term (exact match) query maker for one section field
								    The returned function takes the term to look for and gives
								    back a "term" query on "sections.<field>"
								    """
								    def exact(term):
								        clause = {"sections." + field: term}
								        return Q("term", **clause)
								    return exact


								def search(field):
								    """
								    Build a match query maker for one field
								    The returned function takes the term to look for and gives
								    back a "match" query on that field
								    """
								    def matcher(term):
								        clause = {field: term}
								        return Q("match", **clause)
								    return matcher


								def join(x, y):
								    """
								    Combine two queries with the & operator
								    and hand back the combined query
								    """
								    both = x & y
								    return both


								def filterSections(secs):
								    """
								    Drop the sections whose prof contains "Staff"
								    Tutorial sections almost always have "Staff" as the
								    instructor, so this heuristic gets rid of them

								    Gives back the list of remaining sections,
								    or False when none are left
								    """
								    kept = []
								    for section in secs.sections:
								        if "Staff" in section.prof:
								            continue
								        kept.append(section)
								    return kept or False


								def searchTerms(terms):
								    """
								    Run a search for courses

								    terms    -- dictionary mapping field names to search terms; empty
								                terms and fields without an entry in `searchers` are ignored
								    returns  -- a JSON string holding a list of results. At most 100 hits
								                are requested and a hit is only kept when filterSections
								                leaves it at least one section. Each result is the first
								                such section as a dictionary, extended with "id", "title",
								                "books" and, when the department is not already part of the
								                title, "dept"
								    """
								    # reduce is a builtin on Python 2 only; functools provides it everywhere
								    from functools import reduce

								    # A list of all the queries we want to run
								    qs = [searchers[field](term)
								          for field, term in terms.items()
								          if term and field in searchers]

								    if not qs:
								        # No queries = no results
								        return dumps([])

								    # Reduce joins all of the queries into one query
								    # It will search for the conjunction of all of them
								    # So that means it cares about each query equally
								    q = reduce(join, qs)

								    s = (Search(using=es, index="oersearch")
								         .query(q))[0:100]  # only return up to 100 results for now

								    results = []
								    for obj in s.execute():
								        # Filter once per course to get rid of tutorials
								        lectures = filterSections(obj)
								        if not lectures:
								            continue
								        sec = lectures[0].to_dict()

								        # Add the truncated course id
								        # This is used to point to the resource page for that course
								        sec["id"] = truncate(obj.meta.id)
								        sec["title"] = obj.title
								        if obj["dept"] not in sec["title"]:
								            sec["dept"] = obj.dept

								        if obj.books:
								            # Book fields are passed through unchanged: dumps escapes
								            # non-ASCII characters on its own, whereas encoding them to
								            # ASCII here raised UnicodeEncodeError for such books
								            sec["books"] = [
								                {
								                    "booktitle": summarize(book[0]),
								                    "bookauthor": book[1],
								                    "bookprice": book[2],
								                }
								                for book in obj.books
								            ]
								        else:
								            # Kept as an empty string (not a list) so the output
								            # stays the same for existing consumers
								            sec["books"] = ""
								        results.append(sec)

								    return dumps(results)


								# Maps every searchable field name to the match query maker for that field
								searchers = dict(
								    (field, search(field))
								    for field in ("title", "loc", "time", "prof", "day")
								)