
update crawler

master
wes, 8 years ago
commit 348057fbf0
  1. crawler/__init__.py (0)
  2. crawler/classes.py (29)
  3. crawler/mapping.py (73)
  4. crawler/mcmaster.rkt (6)
  5. crawler/output (8743)
  6. crawler/requirements.txt (22)
  7. crawler/site.py (0)
  8. crawler/spreadsheet.py (2)
  9. crawler/sylla.py (0)
  10. src/mcmaster/courses.csv (3851)
  11. src/mcmaster/courses.ods (BIN)
  12. src/mcmaster/fall_course_schedule.ods (BIN)
  13. src/mcmaster/schedule.csv (0)
  14. src/mcmaster/texbooks.csv (0)
  15. src/mcmaster/textbooks.csv (0)

src/mcmaster/__init__.py → crawler/__init__.py (0)

src/mcmaster/classes.py → crawler/classes.py (29)

@@ -1,11 +1,12 @@
-#! /usr/bin/python2
+#! /usr/bin/env python2
+from mapping import indexCourse
 from sys import argv
 from itertools import chain, islice
 from re import search, sub
 from functools import total_ordering
-from sylla2 import textbookInfo
+from sylla import textbookInfo
 from collections import MutableMapping
 import datetime as dt
@@ -110,7 +111,6 @@ class Class(object):
     @property
     def books(self):
         if self.dept and self.code:
-            print "tryna get some textbooks man"
             return textbookInfo(self.dept, self.code, withPrices=True)
         return False
@@ -204,7 +204,6 @@ def getSectionInfo(table):
         yield parseSection(tr)

 def parseColumns(subject, html):
-    print type(html)
     parsed = lxh.fromstring(html)
     classInfo = (list(getSectionInfo(table)) for table in
@@ -288,9 +287,11 @@ class MosReq(object):
     def codes(self):
         if not self.codes_:
             self.codes_ = list(chain.from_iterable(
-                list(map((lambda l:
-                          self.getCodes(chr(l))),
-                         range(65, 91)))))
+                list(
+                    map(
+                        (lambda l:
+                            self.getCodes(chr(l))),
+                        range(65, 91)))))
         return self.codes_

 def request(codes, lists, semester):
@@ -299,8 +300,6 @@ def request(codes, lists, semester):
         code = codes.get()
         lists.put(requester.classes(code))
         codes.task_done()
-        print "WHUT"
-    print "DONE"

 class CourseInfo(object):
     def __init__(self, threadcount, semester):
@@ -328,11 +327,8 @@ class CourseInfo(object):
             thread.start()
         qcodes.join()
         for t in threads:
-            print t
             t.join()

-        print "finished getting courses"
-
         sections = []
         while not lists.empty():
             sections.append(lists.get())
@@ -349,14 +345,15 @@ class CourseInfo(object):
                 new_sections.append(sec)
             yield Class(cl[0][0], sub("\xa0+", "", cl[0][1]), sorted(new_sections))

-def getCourses(semester, threadcount=10):
+def getCourses(semester, threadcount=5):
     return CourseInfo(threadcount, semester).classes()

 def allCourses():
     return chain.from_iterable(
-        (getCourses(sem, threadcount=25)
-         for sem in (fall, winter, spring_summer)))
+        (getCourses(sem, threadcount=5)
+         for sem in [spring_summer, fall, winter]))

 if __name__ == "__main__":
     for course in allCourses():
-        sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))
+        indexCourse(course)
+        #sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))
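With the stdout line commented out, the entry point now feeds every crawled course straight into Elasticsearch through indexCourse. A minimal sketch of how this might be driven end to end, assuming an Elasticsearch node on localhost (as configured in mapping.py) and that the course_test mapping has been created once via the Course.init() call hinted at in mapping.py; the sketch is illustrative, not part of the diff:

    # Illustrative only: drive the crawler and index into Elasticsearch.
    from classes import allCourses
    from mapping import Course, indexCourse

    Course.init()                # one-time: create the "course_test" index and mapping
    for course in allCourses():  # spring/summer, fall and winter, 5 worker threads each
        indexCourse(course)      # one Elasticsearch document per course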

crawler/mapping.py (73)

@@ -0,0 +1,73 @@
#! /usr/bin/env python2
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Object
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])

class TextBook(InnerObjectWrapper):
    pass

class Section(InnerObjectWrapper):
    pass

class Course(DocType):
    title = Text()
    dept = Text()
    code = Keyword()
    books = Object(
        doc_class=TextBook,
        properties = {
            "author" : Text(),
            "title" : Text(),
            "price" : Text()
        }
    )
    sections = Object(
        doc_class=Section,
        properties = {
            "sem" : Keyword(),
            "prof" : Text(),
            "loc" : Text(),
            "time" : Text(),
            "day" : Text()
        }
    )

    class Meta:
        index = "course_test"

def toSection(section):
    return {
        "sem" : section.sem,
        "prof" : section.prof,
        "loc" : section.loc,
        "time" : section.time,
        "day" : section.day
    }

def toBook(book):
    title, author, price = book
    return {
        "title" : title,
        "author" : author,
        "price" : price
    }

def indexCourse(course):
    print "Trying to index course %s" % course
    new_course = Course(sections=map(toSection, course.sections),
                        books=map(toBook, course.books),
                        title=course.title,
                        dept=course.dept,
                        code=course.code)
    new_course.save()

#if __name__ == "__main__":
    #Course.init()
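Once documents are indexed, the same mapping can be read back through elasticsearch_dsl's search API. A small hypothetical example (the search term and the printed fields are only illustrative):

    # Hypothetical usage of the Course DocType defined above.
    from mapping import Course

    # DocType.search() returns a Search bound to the course_test index.
    for hit in Course.search().query("match", title="calculus").execute():
        print hit.title, hit.dept, hit.code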

crawler/mcmaster.rkt (6)

@@ -0,0 +1,6 @@
#! /usr/bin/env racket
#lang racket
(require net/url)

crawler/output (8743)

File diff suppressed because it is too large

crawler/requirements.txt (22)

@@ -0,0 +1,22 @@
appdirs==1.4.0
click==6.7
dominate==2.3.1
elasticsearch==5.1.0
elasticsearch-dsl==5.1.0
Flask==0.12
flask-appconfig==0.11.1
Flask-Bootstrap==3.3.7.1
itsdangerous==0.24
Jinja2==2.9.4
lxml==3.7.2
MarkupSafe==0.23
packaging==16.8
pyparsing==2.1.10
python-dateutil==2.6.0
python-memcached==1.58
requests==2.13.0
six==1.10.0
urllib3==1.20
uWSGI==2.0.14
visitor==0.1.3
Werkzeug==0.11.15

src/mcmaster/site.py → crawler/site.py (0)

src/mcmaster/spreadsheet.py → crawler/spreadsheet.py (2)

@@ -1,4 +1,4 @@
-#! /usr/bin/python3
+#! /usr/bin/python2
 from classes import fallCourses
 import csv

src/mcmaster/sylla.py → crawler/sylla.py (0)

src/mcmaster/courses.csv (3851)

File diff suppressed because it is too large

src/mcmaster/courses.ods (BIN)

Binary file not shown.

src/mcmaster/fall_course_schedule.ods (BIN)

Binary file not shown.

src/mcmaster/schedule.csv (0)

src/mcmaster/texbooks.csv (0)

src/mcmaster/textbooks.csv (0)
