TextbookEngine/crawler/classes.py


								#! /usr/bin/env python2


								from mapping import indexCourse

								from sys import argv

								from itertools import chain, islice

								from re import search, sub

								from functools import total_ordering


								from books import textbookInfo


								import datetime as dt

								import lxml.html as lxh

								import requests

								import logging

								import sys

								import copy


								# threading imports

								import Queue as q

								import threading as thd


								# Codes for semesters

								# Each code is the first digit of the year, the last two digits of the year,

								# and then the month the semester starts; this is the same layout that

								# parse_semester() builds (e.g. "2017/09" becomes "2179")

								fall = "2179"

								spring_summer = "2175"

								winter = "2181"


								# Page that MosReq.__init__ fetches once through a session so the server can set cookies

								baseurl = "https://applicants.mcmaster.ca/psp/prepprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL"


								# Endpoint that MosReq sends all of its search GET and POST requests to

								searchurl = "https://csprd.mcmaster.ca/psc/prcsprd/EMPLOYEE/PSFT_LS/c/COMMUNITY_ACCESS.CLASS_SEARCH.GBL"


								# Headers passed with the initial cookie request and with every POST made by MosReq

								custom_headers = {

								        "User-Agent" : "Mozilla/5.0 (X11; Linux x86_64; rv:41.0) Gecko/20100101 Firefox/41.0",

								        "Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8",

								        }


								# format strings used as the form-encoded bodies of the POST requests made by MosReq,

								# taken from an actual browser session

								# In every string {0} is the ICStateNum and the last placeholder is the semester code


								# Sent first by MosReq.getCodes: {0} = state num, {1} = semester

								courseCodes1 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_SUBJ_SRCH%240&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=5tq9x%2Fjt42mf62Sh5z%2BrjxT0gT15kiIyQ2cecCSmRB4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}"


								# Sent second by MosReq.getCodes: {0} = state num, {1} = letter, {2} = semester

								courseCodes2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=SSR_CLSRCH_WRK2_SSR_ALPHANUM_{1}&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=vIUgl6ZXw045S07EPbQw4RDzv7NmKCDdJFdT4CTRQNM%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={2}"


								# Follow-up sent by MosReq.getlist after the "Your search will return over" notice: {0} = state num, {1} = semester

								payload2 = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=%23ICSave&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&CLASS_SRCH_WRK2_STRM$45$={1}"


								# Class search sent by MosReq.getlist: {0} = state num, {1} = subject, {2} = semester

								payload = "ICAJAX=1&ICNAVTYPEDROPDOWN=1&ICType=Panel&ICElementNum=0&ICStateNum={0}&ICAction=CLASS_SRCH_WRK2_SSR_PB_CLASS_SRCH&ICXPos=0&ICYPos=0&ResponsetoDiffFrame=-1&TargetFrameName=None&FacetPath=None&ICFocus=&ICSaveWarningFilter=0&ICChanged=-1&ICResubmit=0&ICSID=aWx3w6lJ6d2wZui6hwRVSEnzsPgCA3afYJEFBLLkxe4%3D&ICActionPrompt=false&ICFind=&ICAddCount=&ICAPPCLSDATA=&SSR_CLSRCH_WRK_SUBJECT$75$$0={1}&CLASS_SRCH_WRK2_STRM$45$={2}"


								# Two-letter weekday abbreviations mapped to their position in the week (Monday is 0)
								days = dict(
								    (abbreviation, position)
								    for position, abbreviation in enumerate(
								        ("Mo", "Tu", "We", "Th", "Fr", "Sa", "Su")
								    )
								)


								def parse_semester(sem):
								    """
								    Turn a "year/month/..." semester string into the numeral semester code

								    The code is built from the first character of the year, its third and
								    fourth characters, and the second character of the month. A value that
								    cannot be indexed that way (for example one that is already a code) is
								    handed back unchanged.
								    """
								    try:
								        pieces = sem.split("/")
								        year, month = pieces[0], pieces[1]
								        code = "".join((year[0], year[2:4], month[1]))
								    except IndexError:
								        # Not in "year/month" form, leave it as it came in
								        return sem
								    return code


								def timeparse(time):
								    """
								    Parse a 12-hour clock string such as "9:30AM" or "12:05PM" into numbers

								    Returns a tuple of strings (hour, minutes, half): hour is on the
								    24-hour clock, minutes carries no zero padding, and half is the two
								    characters following the minutes ("AM" or "PM").

								    Raises ValueError if the hour or minute part is not a number.
								    """
								    # Split on the colon instead of guessing offsets from the string length,
								    # so one and two digit hours go through the same code path
								    hour_text, _, rest = time.partition(":")
								    hour = int(hour_text)
								    minutes = int(rest[0:2])
								    half = rest[2:4]

								    if half == "PM":
								        # 12:xxPM is already noon, only 1PM-11PM move to the afternoon
								        if hour < 12:
								            hour = hour + 12
								    elif half == "AM" and hour == 12:
								        # 12:xxAM is just after midnight, which is hour 0 on the 24-hour clock
								        hour = 0

								    return (str(hour), str(minutes), half)


								class Class(object):
								    """
								    A course: the department it was found under, its title and its sections
								    """
								    def __init__(self, dept, title, sections):
								        self.dept = dept
								        self.sections = sections
								        # The title is stored UTF-8 encoded
								        self.title = title.encode("UTF-8")

								    def __repr__(self):
								        summary = (self.title, self.sections)
								        return repr(summary)

								    def __iter__(self):
								        # One (title, section) pair per section
								        pairs = ((self.title, section) for section in self.sections)
								        return iter(pairs)

								    def hasCode(self):
								        """
								        Heuristic for checking if a course has a code associated with it

								        True when the title has at least two space-separated words and the
								        first two are unchanged by upper-casing
								        """
								        words = self.title.strip().split(" ")
								        if len(words) < 2:
								            return False
								        first, second = words[0], words[1]
								        return (first == first.upper()) and (second == second.upper())

								    @property
								    def code(self):
								        """
								        The second word of the title when hasCode() holds, otherwise False
								        """
								        if not self.hasCode():
								            return False
								        words = self.title.strip().split(" ")
								        return words[1].strip()

								    @property
								    def books(self):
								        """
								        Get textbooks for the course

								        Returns False when the department or the course code is missing
								        """
								        if not self.dept:
								            return False
								        course_code = self.code
								        if not course_code:
								            return False
								        return textbookInfo(self.dept, course_code, withPrices=True)


								@total_ordering
								class Section(dict):
								    """
								    This represents a section of a course

								    Keeps the scraped time, location, instructor and semester text and
								    exposes parsed views of them through properties. Sections are ordered
								    by weekday and then by start time once they carry a single day.
								    """
								    def __init__(self, time, loc, prof, sem):
								        # Meeting time text: "<days> <start> <separator> <end>", or "TBA"
								        self.time = time

								        # Location of the course (building)
								        self.loc = loc

								        self.prof = prof
								        self._sem = sem
								        self._date = False
								        # Single weekday, filled in when the section is split per day
								        self._day = False

								    @property
								    def sem(self):
								        """
								        Return the semester the course runs

								        Anything that is neither the fall nor the winter code is reported
								        as "Spring/Summer"
								        """
								        parsed = parse_semester(self._sem)
								        if parsed == fall:
								            return "Fall"
								        elif parsed == winter:
								            return "Winter"
								        else:
								            return "Spring/Summer"

								    @property
								    def date(self):
								        """
								        Return the day(s) of the week the section runs and the start and end times

								        The result is (list of days, parsed start, parsed end), or the raw
								        time text when that text is "TBA"
								        """
								        if self.time != "TBA":
								            day, start, _, end = self.time.split()

								            # Assuming that each day is two characters, create a list of them
								            day = [day[n:n+2] for n in range(0, len(day)-1, 2)]

								            self._date = (day, timeparse(start), timeparse(end))

								            return self._date
								        return self.time

								    @property
								    def day(self):
								        """
								        Return just the day(s) the section runs
								        """

								        # This is set when the section is duplicated (then it would have a single day)
								        if self._day:
								            return self._day

								        # Otherwise return the list of days from the date property
								        if self.date != "TBA":
								            return self.date[0]
								        return "TBA"

								    @property
								    def start(self):
								        """
								        Return the starting time of this section

								        This is the hour and minute strings joined together without any
								        padding, so it is only meant for display, not for comparison
								        """
								        if self.date != "TBA":
								            return self.date[1][0] + self.date[1][1]
								        return "TBA"

								    def _start_key(self):
								        """
								        Return the start time as an (hour, minute) tuple of ints for ordering
								        """
								        hour, minute, _ = self.date[1]
								        return (int(hour), int(minute))

								    def __repr__(self):
								        return ("""

								                Time = %s, Location = %s, Instructor = %s, Semester Running = %s

								                 """ % (self.date, self.loc, self.prof, self.sem))

								    def __gt__(self, x):
								        """
								        Order by weekday first and by start time within the same day

								        Raises NotImplementedError while this section still has a list of
								        days; a section involving "TBA" is never greater than another
								        """
								        if isinstance(self.day, list):
								            raise NotImplementedError

								        if (self.date == "TBA" or
								            x.date == "TBA"):
								            return False

								        if self.day != x.day:
								            return days[self.day] > days[x.day]

								        # Compare numerically: the unpadded strings from `start` would put
								        # "930" after "1030"
								        return self._start_key() > x._start_key()

								    def __eq__(self, x):
								        """
								        Two sections are equal when date, instructor, location and semester match
								        """
								        if not isinstance(x, Section):
								            return NotImplemented
								        return (x.date == self.date and
								                x.prof == self.prof and
								                x.loc == self.loc and
								                x.sem == self.sem)

								    def __ne__(self, x):
								        # dict brings its own __ne__ that looks at the mapping contents,
								        # so spell out the inverse of __eq__ to keep != consistent with ==
								        result = self.__eq__(x)
								        if result is NotImplemented:
								            return result
								        return not result


								def getStateNum(html):
								    """
								    Pull the ICStateNum value out of a Mosaic page

								    This is unique to each requester. Raises IndexError when the page has
								    no input named ICStateNum.
								    """
								    document = lxh.fromstring(html)
								    state_inputs = document.xpath(".//input[@name=\"ICStateNum\"]")
								    first_input = state_inputs[0]
								    return first_input.value


								def parseSection(section):
								    """
								    Build a Section from one results row

								    The row must hold exactly four cells, read in order as time,
								    location, instructor and semester
								    """
								    cells = section.xpath(".//td")
								    assert len(cells) == 4

								    texts = []
								    for cell in cells:
								        texts.append(cell.text_content().encode("UTF-8").strip())

								    time, loc, prof, sem = texts
								    return Section(time, loc, prof, sem)


								def getSectionInfo(table):
								    """
								    Yield a parsed Section for every row of the table whose id contains SSR_CLSRCH
								    """
								    for row in table.xpath(".//tr"):
								        row_ids = row.xpath("@id")
								        # Rows without an id are not section rows
								        if not row_ids:
								            continue
								        if search(r"SSR_CLSRCH", row_ids[0]):
								            yield parseSection(row)


								def parseColumns(subject, html):
								    """
								    Pair every class title on a results page with its list of sections

								    Returns a list of ((subject, title), [Section, ...]) tuples. Titles and
								    section tables are matched up by their order on the page.
								    """
								    document = lxh.fromstring(html)

								    def scroll_tables():
								        # Tables whose id looks like ICField<number>$scroll
								        for table in document.xpath(".//table"):
								            if (table.xpath("@id") and
								                    search(r"ICField[0-9]+\$scroll", table.xpath("@id")[0])):
								                yield table

								    def section_lists():
								        # The first matching table is skipped, the rest hold the sections
								        for table in islice(scroll_tables(), 1, sys.maxsize):
								            yield list(getSectionInfo(table))

								    def class_names():
								        # Spans whose id contains DERIVED_CLSRCH_DESCR hold the titles
								        for span in document.xpath(".//span"):
								            if (span.xpath("@id") and
								                    search(r"DERIVED_CLSRCH_DESCR", span.xpath("@id")[0])):
								                yield (subject, span.text_content().strip())

								    paired = zip(class_names(), section_lists())
								    return list(paired)


								def getCodes(html):
								    """
								    Return a generator of the UTF-8 encoded subject codes listed on a page

								    A subject code is the text of any span whose id matches
								    SSR_CLSRCH_SUBJ_SUBJECT$<number>
								    """
								    document = lxh.fromstring(html)
								    spans = document.xpath("//span")

								    return (span.text_content().encode("UTF-8")
								            for span in spans
								            if span.xpath("@id") and
								               search(r"SSR_CLSRCH_SUBJ_SUBJECT\$[0-9]+", span.xpath("@id")[0]))


								class MosReq(object):
								    """
								    A Mosaic class-search requester bound to one semester

								    Creating one fetches the base page to collect cookies; those cookies
								    are then sent along with every search request.
								    """
								    def __init__(self, semester):
								        self.semester = semester

								        # Let the server set some cookies before doing the searching;
								        # the session is only needed for that, so close it right after
								        with requests.Session() as s:
								            s.get(baseurl, allow_redirects=True, headers=custom_headers)
								            self.cookies = dict(s.cookies.items())

								        # Latest ICStateNum handed out by the server, False until the first request
								        self.statenum = False
								        # Cache for the `codes` property
								        self.codes_ = []

								    def getlist(self, subject):
								        """
								        Return the raw search-results page for one subject code

								        subject is a UTF-8 encoded subject code. If the server answers with
								        its "Your search will return over ..." notice, the confirmation
								        request is sent and its response is returned instead.
								        """
								        sys.stderr.write("Getting %s\n" % subject.decode("UTF-8"))

								        # for some reason Mosaic wants us to request it twice
								        first_req = requests.get(searchurl, cookies=self.cookies)
								        self.statenum = getStateNum(first_req.content)
								        first_req = requests.post(searchurl,
								                                  data=payload.format(self.statenum, subject, self.semester),
								                                  cookies=self.cookies,
								                                  allow_redirects=False,
								                                  headers=custom_headers)

								        # Pick up the new ICStateNum in case it thinks there are too many
								        # results; a response without one keeps the previous value
								        try:
								            self.statenum = getStateNum(first_req.content)
								        except IndexError:
								            pass

								        # Search the response body: membership on the Response object itself
								        # iterates over body chunks and is not a substring test
								        if b"Your search will return over" in first_req.content:
								            return requests.post(searchurl,
								                                 data=payload2.format(self.statenum, self.semester),
								                                 cookies=self.cookies,
								                                 allow_redirects=False,
								                                 headers=custom_headers).content

								        return first_req.content

								    def classes(self, subject):
								        """
								        Return the parsed (name, sections) pairs for one subject code
								        """
								        return list(parseColumns(subject, self.getlist(subject)))

								    def getCodes(self, letter):
								        """
								        Return a generator of the subject codes listed under one letter
								        """
								        sys.stderr.write("Getting letter " + letter + "\n")

								        first_req = requests.get(searchurl, cookies=self.cookies).content
								        self.statenum = getStateNum(first_req)

								        # First POST opens the subject search and yields the next ICStateNum
								        self.statenum = getStateNum(requests.post(searchurl,
								                                    data=courseCodes1.format(self.statenum, self.semester),
								                                    cookies=self.cookies,
								                                    headers=custom_headers).content)

								        # Second POST asks for the subjects under the given letter
								        return getCodes(requests.post(searchurl,
								                             data=courseCodes2.format(self.statenum, letter, self.semester),
								                             cookies=self.cookies,
								                             allow_redirects=False,
								                             headers=custom_headers).content)

								    @property
								    def codes(self):
								        """
								        Gets a list of all course codes available

								        Walks the letters A to Z once and caches the combined result
								        """
								        if not self.codes_:
								            self.codes_ = list(
								                            chain.from_iterable(
								                                self.getCodes(chr(l)) for l in range(65, 91)))
								        return self.codes_


								def request(codes, lists, semester):
								    """
								    Worker body: drain subject codes from one queue into another as class lists

								    codes is the queue of subject codes to fetch, lists receives the
								    result of MosReq.classes() for each code, and semester is the semester
								    code given to the worker's own MosReq. Returns once codes is empty.
								    """
								    requester = MosReq(semester)
								    while True:
								        # Checking empty() and then calling get() lets another worker take
								        # the last item in between, leaving this one blocked forever
								        try:
								            code = codes.get_nowait()
								        except q.Empty:
								            return

								        try:
								            lists.put(requester.classes(code))
								        except Exception:
								            # One failing subject should not stop the worker; report it
								            # and carry on with the next code
								            logging.exception("Could not fetch classes for %r", code)
								        finally:
								            # Always acknowledge the item, otherwise join() on the queue
								            # never returns
								            codes.task_done()


								class CourseInfo(object):
								    """
								    Crawls every subject of one semester using a pool of worker threads
								    """
								    def __init__(self, threadcount, semester):
								        # Cached subject codes, False until first asked for
								        self._codes = False
								        self.threadcount = threadcount
								        self.semester = semester

								    @property
								    def codes(self):
								        """
								        Return the subject codes for the semester, fetching them on first use
								        """
								        if not self._codes:
								            req = MosReq(self.semester)
								            self._codes = req.codes
								        return self._codes

								    def classes(self):
								        """
								        Returns a generator of Class objects, one per course found

								        Every section of a yielded Class runs on a single day (sections
								        meeting on several days are duplicated once per day) and the
								        sections are sorted. Sections whose time is "TBA" are kept as is.
								        """

								        # Queue of subject codes to process
								        course_codes = q.Queue()

								        # Initialize the queue with all codes
								        for code in self.codes:
								            course_codes.put(code)

								        lists = q.Queue()
								        threads = []

								        # Spawn threads that pull from the queue of course codes
								        for _ in range(self.threadcount):
								            thread = thd.Thread(group=None, target=request, args=(course_codes, lists, self.semester))
								            threads.append(thread)
								            thread.start()

								        # Block until all queue tasks are done
								        course_codes.join()

								        # Block until all threads have exited
								        for t in threads:
								            t.join()

								        sections = []

								        # Empty the queue of sections and put it into the list
								        while not lists.empty():
								            sections.append(lists.get())

								        # This creates a section for each day, so that each section has only one day
								        for cl in chain.from_iterable(sections):
								            new_sections = []
								            for sec in cl[1]:
								                # sec.day is a list of days, or the string "TBA"
								                section_days = sec.day

								                if not isinstance(section_days, list):
								                    # "TBA" has no days to split on; iterating over the
								                    # string would create one bogus section per letter
								                    new_sections.append(sec)
								                elif len(section_days) > 1:
								                    # more than one day, split this up into multiple sections
								                    for day in section_days:
								                        duplicate = copy.deepcopy(sec)
								                        duplicate._day = day
								                        new_sections.append(duplicate)
								                else:
								                    sec._day = section_days[0]
								                    new_sections.append(sec)

								            # cl[0][0] is the subject code/department scraped from the page
								            # cl[0][1] is the subject name scraped from the page
								            # regex substitution is to get rid of erroneous characters (due to an encoding problem with the page)
								            yield Class(cl[0][0], sub("\xa0+", "", cl[0][1]), sorted(new_sections))


								def getCourses(semester, threadcount=10):
								    """
								    Return the generator of courses for one semester

								    threadcount is the number of worker threads the crawl uses
								    """
								    crawler = CourseInfo(threadcount, semester)
								    return crawler.classes()


								def allCourses():
								    """
								    Chain together the courses of all three semesters

								    The semesters are visited in the order spring/summer, fall, winter
								    """
								    per_semester = [getCourses(term) for term in (spring_summer, fall, winter)]
								    return chain.from_iterable(per_semester)


								if __name__ == "__main__":
								    # Run as a script: crawl every semester and print each course found
								    for found in allCourses():
								        print(found)
								        #indexCourse(found)