Browse Source

update crawler

master
wes 7 years ago
parent
commit
348057fbf0
  1. 0
      crawler/__init__.py
  2. 29
      crawler/classes.py
  3. 73
      crawler/mapping.py
  4. 6
      crawler/mcmaster.rkt
  5. 8743
      crawler/output
  6. 22
      crawler/requirements.txt
  7. 0
      crawler/site.py
  8. 2
      crawler/spreadsheet.py
  9. 0
      crawler/sylla.py
  10. 3851
      src/mcmaster/courses.csv
  11. BIN
      src/mcmaster/courses.ods
  12. BIN
      src/mcmaster/fall_course_schedule.ods
  13. 0
      src/mcmaster/schedule.csv
  14. 0
      src/mcmaster/texbooks.csv
  15. 0
      src/mcmaster/textbooks.csv

0
src/mcmaster/__init__.py → crawler/__init__.py

29
src/mcmaster/classes.py → crawler/classes.py

@@ -1,11 +1,12 @@
#! /usr/bin/python2
#! /usr/bin/env python2
from mapping import indexCourse
from sys import argv
from itertools import chain, islice
from re import search, sub
from functools import total_ordering
from sylla2 import textbookInfo
from sylla import textbookInfo
from collections import MutableMapping
import datetime as dt
@@ -110,7 +111,6 @@ class Class(object):
@property
def books(self):
if self.dept and self.code:
print "tryna get some textbooks man"
return textbookInfo(self.dept, self.code, withPrices=True)
return False
@@ -204,7 +204,6 @@ def getSectionInfo(table):
yield parseSection(tr)
def parseColumns(subject, html):
print type(html)
parsed = lxh.fromstring(html)
classInfo = (list(getSectionInfo(table)) for table in
@@ -288,9 +287,11 @@ class MosReq(object):
def codes(self):
if not self.codes_:
self.codes_ = list(chain.from_iterable(
list(map((lambda l:
self.getCodes(chr(l))),
range(65, 91)))))
list(
map(
(lambda l:
self.getCodes(chr(l))),
range(65, 91)))))
return self.codes_
def request(codes, lists, semester):
@@ -299,8 +300,6 @@ def request(codes, lists, semester):
code = codes.get()
lists.put(requester.classes(code))
codes.task_done()
print "WHUT"
print "DONE"
class CourseInfo(object):
def __init__(self, threadcount, semester):
@@ -328,11 +327,8 @@ class CourseInfo(object):
thread.start()
qcodes.join()
for t in threads:
print t
t.join()
print "finished getting courses"
sections = []
while not lists.empty():
sections.append(lists.get())
@@ -349,14 +345,15 @@ class CourseInfo(object):
new_sections.append(sec)
yield Class(cl[0][0], sub("\xa0+", "", cl[0][1]), sorted(new_sections))
def getCourses(semester, threadcount=10):
def getCourses(semester, threadcount=5):
return CourseInfo(threadcount, semester).classes()
def allCourses():
return chain.from_iterable(
(getCourses(sem, threadcount=25)
for sem in (fall, winter, spring_summer)))
(getCourses(sem, threadcount=5)
for sem in [spring_summer, fall, winter]))
if __name__ == "__main__":
for course in allCourses():
sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))
indexCourse(course)
#sys.stdout.write("%s, %s, %s, %s\n" % (course.title, course.code, course.dept, list(chain.from_iterable(course.books) if course.books else [])))

73
crawler/mapping.py

@@ -0,0 +1,73 @@
#! /usr/bin/env python2
from elasticsearch_dsl import DocType, Date, Nested, Boolean, \
analyzer, InnerObjectWrapper, Completion, Keyword, Text, Object
from elasticsearch_dsl.connections import connections
# Module-level side effect: registers the default Elasticsearch connection
# (host "localhost") that every DocType operation in this module uses.
# NOTE(review): runs on import — any importer needs a reachable ES instance.
connections.create_connection(hosts=["localhost"])
class TextBook(InnerObjectWrapper):
    """Inner-document wrapper for one textbook record; adds no behavior."""
class Section(InnerObjectWrapper):
    """Inner-document wrapper for one course-section record; adds no behavior."""
class Course(DocType):
    """Elasticsearch document mapping for one crawled university course."""
    title = Text()
    dept = Text()
    code = Keyword()
    # Embedded textbook records; price kept as Text since the crawler
    # scrapes it as a display string, not a number.
    books = Object(
        doc_class=TextBook,
        properties = {
            "author" : Text(),
            "title" : Text(),
            "price" : Text()
        }
    )
    # Embedded section records; "sem" is Keyword for exact-match filtering.
    sections = Object(
        doc_class=Section,
        properties = {
            "sem" : Keyword(),
            "prof" : Text(),
            "loc" : Text(),
            "time" : Text(),
            "day" : Text()
        }
    )
    class Meta:
        # Target index name for saved documents.
        index = "course_test"
def toSection(section):
    """Flatten a section object into a plain dict matching the Section mapping.

    Reads the sem/prof/loc/time/day attributes; an AttributeError propagates
    if the object lacks any of them.
    """
    fields = ("sem", "prof", "loc", "time", "day")
    return {name: getattr(section, name) for name in fields}
def toBook(book):
    """Unpack a (title, author, price) triple into a dict for the books mapping."""
    # Tuple unpacking deliberately raises ValueError on a wrong-sized record.
    title, author, price = book
    return dict(title=title, author=author, price=price)
def indexCourse(course):
    # Persist one crawled course into Elasticsearch via the Course mapping.
    # Python 2 print statement — matches the module's python2 shebang.
    print "Trying to index course %s" % course
    # NOTE(review): elsewhere the crawler's Class.books can be False when
    # dept/code are missing; map() over False would raise TypeError here —
    # confirm callers only pass courses with real book lists.
    new_course = Course(sections=map(toSection, course.sections),
                        books=map(toBook, course.books),
                        title=course.title,
                        dept=course.dept,
                        code=course.code)
    # Writes to the "course_test" index using the module-level connection.
    new_course.save()
#if __name__ == "__main__":
#Course.init()

6
crawler/mcmaster.rkt

@@ -0,0 +1,6 @@
#! /usr/bin/env racket
#lang racket
(require net/url)

8743
crawler/output

File diff suppressed because it is too large

22
crawler/requirements.txt

@@ -0,0 +1,22 @@
appdirs==1.4.0
click==6.7
dominate==2.3.1
elasticsearch==5.1.0
elasticsearch-dsl==5.1.0
Flask==0.12
flask-appconfig==0.11.1
Flask-Bootstrap==3.3.7.1
itsdangerous==0.24
Jinja2==2.9.4
lxml==3.7.2
MarkupSafe==0.23
packaging==16.8
pyparsing==2.1.10
python-dateutil==2.6.0
python-memcached==1.58
requests==2.13.0
six==1.10.0
urllib3==1.20
uWSGI==2.0.14
visitor==0.1.3
Werkzeug==0.11.15

0
src/mcmaster/site.py → crawler/site.py

2
src/mcmaster/spreadsheet.py → crawler/spreadsheet.py

@@ -1,4 +1,4 @@
#! /usr/bin/python3
#! /usr/bin/python2
from classes import fallCourses
import csv

0
src/mcmaster/sylla.py → crawler/sylla.py

3851
src/mcmaster/courses.csv

File diff suppressed because it is too large

BIN
src/mcmaster/courses.ods

Binary file not shown.

BIN
src/mcmaster/fall_course_schedule.ods

Binary file not shown.

0
src/mcmaster/schedule.csv

0
src/mcmaster/texbooks.csv

0
src/mcmaster/textbooks.csv

Loading…
Cancel
Save