
update crawler

master
wes, 8 years ago
commit 348057fbf0
  1. crawler/__init__.py (0)
  2. crawler/classes.py (29)
  3. crawler/mapping.py (73)
  4. crawler/mcmaster.rkt (6)
  5. crawler/output (8743)
  6. crawler/requirements.txt (22)
  7. crawler/site.py (0)
  8. crawler/spreadsheet.py (2)
  9. crawler/sylla.py (0)
  10. src/mcmaster/courses.csv (3851)
  11. src/mcmaster/courses.ods (BIN)
  12. src/mcmaster/fall_course_schedule.ods (BIN)
  13. src/mcmaster/schedule.csv (0)
  14. src/mcmaster/texbooks.csv (0)
  15. src/mcmaster/textbooks.csv (0)

src/mcmaster/__init__.py → crawler/__init__.py (0)

src/mcmaster/classes.py → crawler/classes.py (29)

@@ -1,11 +1,12 @@
-#! /usr/bin/python2
+#! /usr/bin/env python2
+from mapping import indexCourse
 from sys import argv
 from itertools import chain, islice
 from re import search, sub
 from functools import total_ordering
-from sylla2 import textbookInfo
+from sylla import textbookInfo
 from collections import MutableMapping
 import datetime as dt
@@ -110,7 +111,6 @@ class Class(object):
     @property
     def books(self):
         if self.dept and self.code:
-            print "tryna get some textbooks man"
             return textbookInfo(self.dept, self.code, withPrices=True)
         return False
@@ -204,7 +204,6 @@ def getSectionInfo(table):
         yield parseSection(tr)

 def parseColumns(subject, html):
-    print type(html)
     parsed = lxh.fromstring(html)
     classInfo = (list(getSectionInfo(table)) for table in
@@ -288,9 +287,11 @@ class MosReq(object):
     def codes(self):
         if not self.codes_:
             self.codes_ = list(chain.from_iterable(
-                list(map((lambda l:
-                          self.getCodes(chr(l))),
-                         range(65, 91)))))
+                list(
+                    map(
+                        (lambda l:
+                            self.getCodes(chr(l))),
+                        range(65, 91)))))
         return self.codes_

 def request(codes, lists, semester):
@@ -299,8 +300,6 @@ def request(codes, lists, semester):
         code = codes.get()
         lists.put(requester.classes(code))
         codes.task_done()
-        print "WHUT"
-    print "DONE"

 class CourseInfo(object):
     def __init__(self, threadcount, semester):
@@ -328,11 +327,8 @@ class CourseInfo(object):
             thread.start()
         qcodes.join()
         for t in threads:
-            print t
             t.join()

-        print "finished getting courses"
-
         sections = []
         while not lists.empty():
             sections.append(lists.get())
@@ -349,14 +345,15 @@ class CourseInfo(object):
                 new_sections.append(sec)
             yield Class(cl[0][0], sub("\xa0+", "", cl[0][1]), sorted(new_sections))

-def getCourses(semester, threadcount=10):
+def getCourses(semester, threadcount=5):
     return CourseInfo(threadcount, semester).classes()

 def allCourses():
     return chain.from_iterable(
-        (getCourses(sem, threadcount=25)
-         for sem in (fall, winter, spring_summer)))
+        (getCourses(sem, threadcount=5)
+         for sem in [spring_summer, fall, winter]))

 if __name__ == "__main__":
     for course in allCourses():
-        sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))
+        indexCourse(course)
+        #sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))
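With the stdout line commented out, the entry point now feeds every crawled course straight into Elasticsearch through indexCourse. A minimal sketch of how this might be driven end to end, assuming an Elasticsearch node on localhost (as configured in mapping.py) and that the course_test mapping has been created once via the Course.init() call hinted at in mapping.py; the sketch is illustrative, not part of the diff:

    # Illustrative only: drive the crawler and index into Elasticsearch.
    from classes import allCourses
    from mapping import Course, indexCourse

    Course.init()                # one-time: create the "course_test" index and mapping
    for course in allCourses():  # spring/summer, fall and winter, 5 worker threads each
        indexCourse(course)      # one Elasticsearch document per course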

crawler/mapping.py (73)

@@ -0,0 +1,73 @@
#! /usr/bin/env python2
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
    analyzer, InnerObjectWrapper, Completion, Keyword, Text, Object
from elasticsearch_dsl.connections import connections

connections.create_connection(hosts=["localhost"])

class TextBook(InnerObjectWrapper):
    pass

class Section(InnerObjectWrapper):
    pass

class Course(DocType):
    title = Text()
    dept = Text()
    code = Keyword()
    books = Object(
        doc_class=TextBook,
        properties = {
            "author" : Text(),
            "title" : Text(),
            "price" : Text()
        }
    )
    sections = Object(
        doc_class=Section,
        properties = {
            "sem" : Keyword(),
            "prof" : Text(),
            "loc" : Text(),
            "time" : Text(),
            "day" : Text()
        }
    )

    class Meta:
        index = "course_test"

def toSection(section):
    return {
        "sem" : section.sem,
        "prof" : section.prof,
        "loc" : section.loc,
        "time" : section.time,
        "day" : section.day
    }

def toBook(book):
    title, author, price = book
    return {
        "title" : title,
        "author" : author,
        "price" : price
    }

def indexCourse(course):
    print "Trying to index course %s" % course
    new_course = Course(sections=map(toSection, course.sections),
                        books=map(toBook, course.books),
                        title=course.title,
                        dept=course.dept,
                        code=course.code)
    new_course.save()

#if __name__ == "__main__":
    #Course.init()
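Once documents are indexed, the same mapping can be read back through elasticsearch_dsl's search API. A small hypothetical example (the search term and the printed fields are only illustrative):

    # Hypothetical usage of the Course DocType defined above.
    from mapping import Course

    # DocType.search() returns a Search bound to the course_test index.
    for hit in Course.search().query("match", title="calculus").execute():
        print hit.title, hit.dept, hit.code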

crawler/mcmaster.rkt (6)

@@ -0,0 +1,6 @@
#! /usr/bin/env racket
#lang racket
(require net/url)

crawler/output (8743)

File diff suppressed because it is too large

crawler/requirements.txt (22)

@@ -0,0 +1,22 @@
appdirs==1.4.0
click==6.7
dominate==2.3.1
elasticsearch==5.1.0
elasticsearch-dsl==5.1.0
Flask==0.12
flask-appconfig==0.11.1
Flask-Bootstrap==3.3.7.1
itsdangerous==0.24
Jinja2==2.9.4
lxml==3.7.2
MarkupSafe==0.23
packaging==16.8
pyparsing==2.1.10
python-dateutil==2.6.0
python-memcached==1.58
requests==2.13.0
six==1.10.0
urllib3==1.20
uWSGI==2.0.14
visitor==0.1.3
Werkzeug==0.11.15

src/mcmaster/site.py → crawler/site.py (0)

src/mcmaster/spreadsheet.py → crawler/spreadsheet.py (2)

@@ -1,4 +1,4 @@
-#! /usr/bin/python3
+#! /usr/bin/python2
 from classes import fallCourses
 import csv

src/mcmaster/sylla.py → crawler/sylla.py (0)

src/mcmaster/courses.csv (3851)

File diff suppressed because it is too large

src/mcmaster/courses.ods (BIN)

Binary file not shown.

src/mcmaster/fall_course_schedule.ods (BIN)

Binary file not shown.

src/mcmaster/schedule.csv (0)

src/mcmaster/texbooks.csv (0)

src/mcmaster/textbooks.csv (0)
