Browse Source

update crawler

master
wes 7 years ago
parent
commit
348057fbf0
  1. 0
      crawler/__init__.py
  2. 29
      crawler/classes.py
  3. 73
      crawler/mapping.py
  4. 6
      crawler/mcmaster.rkt
  5. 8743
      crawler/output
  6. 22
      crawler/requirements.txt
  7. 0
      crawler/site.py
  8. 2
      crawler/spreadsheet.py
  9. 0
      crawler/sylla.py
  10. 3851
      src/mcmaster/courses.csv
  11. BIN
      src/mcmaster/courses.ods
  12. BIN
      src/mcmaster/fall_course_schedule.ods
  13. 0
      src/mcmaster/schedule.csv
  14. 0
      src/mcmaster/texbooks.csv
  15. 0
      src/mcmaster/textbooks.csv

0
src/mcmaster/__init__.py → crawler/__init__.py

29
src/mcmaster/classes.py → crawler/classes.py

@@ -1,11 +1,12 @@
#! /usr/bin/python2
#! /usr/bin/env python2
from mapping import indexCourse
from sys import argv
from itertools import chain, islice
from re import search, sub
from functools import total_ordering
from sylla2 import textbookInfo
from sylla import textbookInfo
from collections import MutableMapping
import datetime as dt
@@ -110,7 +111,6 @@ class Class(object):
@property
def books(self):
if self.dept and self.code:
print "tryna get some textbooks man"
return textbookInfo(self.dept, self.code, withPrices=True)
return False
@@ -204,7 +204,6 @@ def getSectionInfo(table):
yield parseSection(tr)
def parseColumns(subject, html):
print type(html)
parsed = lxh.fromstring(html)
classInfo = (list(getSectionInfo(table)) for table in
@@ -288,9 +287,11 @@ class MosReq(object):
def codes(self):
if not self.codes_:
self.codes_ = list(chain.from_iterable(
list(map((lambda l:
self.getCodes(chr(l))),
range(65, 91)))))
list(
map(
(lambda l:
self.getCodes(chr(l))),
range(65, 91)))))
return self.codes_
def request(codes, lists, semester):
@@ -299,8 +300,6 @@ def request(codes, lists, semester):
code = codes.get()
lists.put(requester.classes(code))
codes.task_done()
print "WHUT"
print "DONE"
class CourseInfo(object):
def __init__(self, threadcount, semester):
@@ -328,11 +327,8 @@ class CourseInfo(object):
thread.start()
qcodes.join()
for t in threads:
print t
t.join()
print "finished getting courses"
sections = []
while not lists.empty():
sections.append(lists.get())
@@ -349,14 +345,15 @@ class CourseInfo(object):
new_sections.append(sec)
yield Class(cl[0][0], sub("\xa0+", "", cl[0][1]), sorted(new_sections))
def getCourses(semester, threadcount=10):
def getCourses(semester, threadcount=5):
return CourseInfo(threadcount, semester).classes()
def allCourses():
return chain.from_iterable(
(getCourses(sem, threadcount=25)
for sem in (fall, winter, spring_summer)))
(getCourses(sem, threadcount=5)
for sem in [spring_summer, fall, winter]))
if __name__ == "__main__":
for course in allCourses():
sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))
indexCourse(course)
#sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))

73
crawler/mapping.py

@@ -0,0 +1,73 @@
#! /usr/bin/env python2
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
analyzer, InnerObjectWrapper, Completion, Keyword, Text, Object
from elasticsearch_dsl.connections import connections
# Module-level side effect: registers the default Elasticsearch connection
# (host "localhost") that every DocType operation in this module uses.
# NOTE(review): runs on import — any importer needs a reachable ES instance.
connections.create_connection(hosts=["localhost"])
class TextBook(InnerObjectWrapper):
    """Inner-document wrapper for one textbook record; adds no behavior."""
class Section(InnerObjectWrapper):
    """Inner-document wrapper for one course-section record; adds no behavior."""
class Course(DocType):
    """Elasticsearch document mapping for one crawled university course."""
    title = Text()
    dept = Text()
    code = Keyword()
    # Embedded textbook records; price kept as Text since the crawler
    # scrapes it as a display string, not a number.
    books = Object(
        doc_class=TextBook,
        properties = {
            "author" : Text(),
            "title" : Text(),
            "price" : Text()
        }
    )
    # Embedded section records; "sem" is Keyword for exact-match filtering.
    sections = Object(
        doc_class=Section,
        properties = {
            "sem" : Keyword(),
            "prof" : Text(),
            "loc" : Text(),
            "time" : Text(),
            "day" : Text()
        }
    )
    class Meta:
        # Target index name for saved documents.
        index = "course_test"
def toSection(section):
    """Flatten a section object into a plain dict matching the Section mapping.

    Reads the sem/prof/loc/time/day attributes; an AttributeError propagates
    if the object lacks any of them.
    """
    fields = ("sem", "prof", "loc", "time", "day")
    return {name: getattr(section, name) for name in fields}
def toBook(book):
    """Unpack a (title, author, price) triple into a dict for the books mapping."""
    # Tuple unpacking deliberately raises ValueError on a wrong-sized record.
    title, author, price = book
    return dict(title=title, author=author, price=price)
def indexCourse(course):
    # Persist one crawled course into Elasticsearch via the Course mapping.
    # Python 2 print statement — matches the module's python2 shebang.
    print "Trying to index course %s" % course
    # NOTE(review): elsewhere the crawler's Class.books can be False when
    # dept/code are missing; map() over False would raise TypeError here —
    # confirm callers only pass courses with real book lists.
    new_course = Course(sections=map(toSection, course.sections),
                        books=map(toBook, course.books),
                        title=course.title,
                        dept=course.dept,
                        code=course.code)
    # Writes to the "course_test" index using the module-level connection.
    new_course.save()
#if __name__ == "__main__":
#Course.init()

6
crawler/mcmaster.rkt

@@ -0,0 +1,6 @@
#! /usr/bin/env racket
#lang racket
(require net/url)

8743
crawler/output

File diff suppressed because it is too large

22
crawler/requirements.txt

@@ -0,0 +1,22 @@
appdirs==1.4.0
click==6.7
dominate==2.3.1
elasticsearch==5.1.0
elasticsearch-dsl==5.1.0
Flask==0.12
flask-appconfig==0.11.1
Flask-Bootstrap==3.3.7.1
itsdangerous==0.24
Jinja2==2.9.4
lxml==3.7.2
MarkupSafe==0.23
packaging==16.8
pyparsing==2.1.10
python-dateutil==2.6.0
python-memcached==1.58
requests==2.13.0
six==1.10.0
urllib3==1.20
uWSGI==2.0.14
visitor==0.1.3
Werkzeug==0.11.15

0
src/mcmaster/site.py → crawler/site.py

2
src/mcmaster/spreadsheet.py → crawler/spreadsheet.py

@@ -1,4 +1,4 @@
#! /usr/bin/python3
#! /usr/bin/python2
from classes import fallCourses
import csv

0
src/mcmaster/sylla.py → crawler/sylla.py

3851
src/mcmaster/courses.csv

File diff suppressed because it is too large

BIN
src/mcmaster/courses.ods

Binary file not shown.

BIN
src/mcmaster/fall_course_schedule.ods

Binary file not shown.

0
src/mcmaster/schedule.csv

0
src/mcmaster/texbooks.csv

0
src/mcmaster/textbooks.csv

Loading…
Cancel
Save