|
#!/usr/bin/env coffee |
|
### |
|
# Crawls service.eudoxus.gr to get all book selections for a year |
|
# also ffetches book info |
|
# outputs mysql statements that be piped into mysql directly or to a file |
|
# Example: |
|
# coffee crawler.coffee 2014 | mysql -u username -p somedbname |
|
### |
|
|
|
year = parseInt process.argv[2], 10 |
|
process.exit(1) unless year |
|
|
|
fs = require "fs" |
|
path = require "path" |
|
async = require "async" |
|
{exec} = require "child_process" |
|
{format} = require "util" |
|
simplecrawler = require "simplecrawler" |
|
cheerio = require "cheerio" |
|
phantomjs = require "phantomjs" |
|
|
|
output = process.stdout |
|
output.write """ |
|
CREATE TABLE IF NOT EXISTS `selections` ( |
|
`id` int(11) NOT NULL AUTO_INCREMENT, |
|
`course_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL, |
|
`course_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL, |
|
`book_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL, |
|
`dept_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL, |
|
`dept_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL, |
|
`book_desc` text COLLATE utf8_unicode_ci NOT NULL, |
|
`position` int(11) NOT NULL, |
|
`year` int(11) NOT NULL, |
|
`course_season` varchar(50) COLLATE utf8_unicode_ci NOT NULL, |
|
`course_semester` int(11) NOT NULL, |
|
PRIMARY KEY (`id`), |
|
KEY `book_code` (`book_code`), |
|
KEY `dept_code` (`dept_code`), |
|
KEY `position` (`position`), |
|
KEY `year` (`year`), |
|
KEY `course_code` (`course_code`) |
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1; |
|
|
|
CREATE TABLE IF NOT EXISTS `books` ( |
|
`id` int(11) NOT NULL AUTO_INCREMENT, |
|
`code` varchar(45) DEFAULT NULL, |
|
`isbn` varchar(45) DEFAULT NULL, |
|
`title` varchar(255) DEFAULT NULL, |
|
`subtitle` varchar(255) DEFAULT NULL, |
|
`authors` varchar(255) DEFAULT NULL, |
|
`description` text, |
|
`publisher` varchar(255) DEFAULT NULL, |
|
`distributor` varchar(255) DEFAULT NULL, |
|
`url` varchar(255) DEFAULT NULL, |
|
`sample` varchar(255) DEFAULT NULL, |
|
`toc` varchar(255) DEFAULT NULL, |
|
`cover` varchar(255) DEFAULT NULL, |
|
`backcover` varchar(255) DEFAULT NULL, |
|
`dimensions` varchar(45) DEFAULT NULL, |
|
`topics` text, |
|
`keywords` text, |
|
`type` varchar(45) DEFAULT NULL, |
|
`pages` varchar(45) DEFAULT NULL, |
|
`edition` varchar(45) DEFAULT NULL, |
|
`year` varchar(45) DEFAULT NULL, |
|
`covertype` varchar(45) DEFAULT NULL, |
|
PRIMARY KEY (`id`), |
|
KEY `book_code_idx` (`code`), |
|
KEY `book_isbn_idx` (`isbn`) |
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1; |
|
|
|
|
|
DELETE FROM selections WHERE year = #{year}; |
|
|
|
""" |
|
|
|
insert = (table, obj) -> |
|
quote = (value) -> "\"#{value.replace /"/g, '\\"'}\"" |
|
keys = Object.keys obj |
|
values = (quote("#{obj[key]}") for key in keys) |
|
""" |
|
INSERT INTO #{table} (#{keys.join ','}) VALUES (#{values.join ','}); |
|
|
|
""" |
|
rx = |
|
url: new RegExp "^/public/departments/courses/(\\d+)/#{year}$" |
|
course: /^Μάθημα \[([^\]]+)\]: (.*)/ |
|
book: /^Βιβλίο \[([^\]]+)\]: (.*)Λεπτομέρειες$/ |
|
semester: /^Εξάμηνο (\d+) - (Χειμερινό|Εαρινό|Ετήσιο)$/ |
|
|
|
crawler = new simplecrawler "service.eudoxus.gr" |
|
crawler.initialPath = "/public/departments" |
|
crawler.initialProtocol = "https" |
|
crawler.maxConcurrency = 4 |
|
crawler.addFetchCondition (url) -> rx.url.test url.path |
|
|
|
book_queue = do -> |
|
script = path.join __dirname, "bookinfo.coffee" |
|
processed = {} |
|
worker = (book_id, done) -> |
|
if processed[book_id]? |
|
done() |
|
else |
|
options = |
|
timeout: 30000 |
|
exec "#{phantomjs.path} #{script} #{book_id}", options, (err, stdout) -> |
|
if err? |
|
book_queue.push book_id |
|
else |
|
book = null |
|
try |
|
book = JSON.parse stdout |
|
if book? |
|
output.write "DELETE FROM books WHERE code = '#{book_id}';\n" |
|
output.write insert "books", book |
|
done() |
|
async.queue worker, 8 |
|
|
|
crawler.on "fetchcomplete", (item, html, response) -> |
|
dept_code = item.path.replace rx.url, "$1" |
|
$ = cheerio.load html, |
|
decodeEntities: yes |
|
dept_name = "#{$("#header > h2").first().text()} | #{$("#header > h2").last().text()}" |
|
$("ol > li > ul > li").each -> |
|
$li = $ @ |
|
$ol = $li.closest "ol" |
|
course_text = $ol.prevAll("h2").first().text() |
|
semester_text = $ol.prevAll("h3").first().text() |
|
book_text = $li.text().replace("\n", " ") |
|
position = $li.parent().parent().prevAll().length + 1 |
|
|
|
book = |
|
year: year |
|
dept_code: dept_code |
|
course_code: course_text.replace(rx.course, "$1") |
|
book_code: book_text.replace(rx.book, "$1") |
|
course_name: course_text.replace(rx.course, "$2") |
|
dept_name: dept_name |
|
course_semester: semester_text.replace(rx.semester, "$1") |
|
course_season: semester_text.replace(rx.semester, "$2") |
|
book_desc: book_text.replace(rx.book, "$2") |
|
position: position |
|
|
|
book_queue.push book.book_code |
|
output.write insert "selections", book |
|
|
|
crawler.on "complete", -> |
|
book_queue.drain = -> process.exit() |
|
|
|
crawler.start() |