Skip to content

Instantly share code, notes, and snippets.

@alxarch
Last active August 29, 2015 14:05
Show Gist options
  • Save alxarch/ce38f1cea4694ed92463 to your computer and use it in GitHub Desktop.
Save alxarch/ce38f1cea4694ed92463 to your computer and use it in GitHub Desktop.
PhantomJS scripts to interact with service.eudoxus.gr
node_modules/

Usage:

% coffee crawl.coffee 2014 | mysql -u root -D eudoxus -p
###
PhantomJS script to collect book info from service.eudoxus.gr
###
usage = (code) ->
console.log """
Usage: phantomjs bookinfo.coffee <book_id>
"""
phantom.exit code
id = require("system").args[1]
usage(1) unless id
pg = require("webpage").create()
pg.onCallback = (data) ->
console.log JSON.stringify data
phantom.exit()
pg.open "https://service.eudoxus.gr/search/", (status) ->
phantom.exit(1) unless status is "success"
pg.evaluate (id) ->
$ = (sel, el = document.body) -> el.querySelector sel
$$ = (sel, el = document.body) -> [].slice.apply el.querySelectorAll sel
waitForElement = (sel, ms, callback) ->
fn = ->
result = $ sel
if result?
clearInterval interval
callback result
interval = setInterval fn, ms
click = (el) ->
event = document.createEvent "MouseEvents"
event.initEvent "click", yes, yes
el.dispatchEvent event, yes
event
parseBookDetailsPopup = (popup) ->
covers = $$(".search-popup-left .gwt-Image", popup).map (img) -> img.getAttribute "src"
pdfs = $$(".search-popup-left .search-popup-link", popup).map (link) -> "#{link.href}"
fields = {}
$$(".search-popup-details-table td:nth-child(1)").forEach (td) ->
fields[td.textContent] = td.nextElementSibling?.textContent
["description", "authors", "title", "subtitle"].forEach (key) ->
fields[key] = $(".search-popup-#{key}", popup)?.textContent
code: id
description: fields.description
title: fields.title
subtitle: fields.subtitle
authors: fields.authors.replace "Συγγραφείς: ", ""
edition: fields["Αριθμός Έκδοσης"]?.replace " εκδ.", ""
year: fields["Έτος Έκδοσης"]
keywords: fields["Λέξεις κλειδιά"]
topics: fields["Θεματικές Ενότητες"]
isbn: fields["ISBN"]
publisher: fields["Εκδόσεις"]
distributor: fields["Διαθέτης (Εκδότης)"]
type: fields["Τύπος"]
covertype: fields["Δέσιμο"]
pages: fields["Αριθμός Σελίδων"]
dimensions: fields["Διαστάσεις"]
url: $(".search-popup-details-table td .gwt-Anchor", popup)?.href
cover: covers[0]
backcover: covers[1]
toc: pdfs[0]
sample: pdfs[1]
waitForElement ".search-resultsPanel .search-hyperlink", 50, (link) ->
click link
waitForElement ".search-popup", 10, (popup) ->
data = parseBookDetailsPopup popup
window.callPhantom data
window.location.hash = "a/id:#{id}/0"
, id
#!/usr/bin/env coffee
###
# Crawls service.eudoxus.gr to get all book selections for a year
# also ffetches book info
# outputs mysql statements that be piped into mysql directly or to a file
# Example:
# coffee crawler.coffee 2014 | mysql -u username -p somedbname
###
year = parseInt process.argv[2], 10
process.exit(1) unless year
fs = require "fs"
path = require "path"
async = require "async"
{exec} = require "child_process"
{format} = require "util"
simplecrawler = require "simplecrawler"
cheerio = require "cheerio"
phantomjs = require "phantomjs"
output = process.stdout
output.write """
CREATE TABLE IF NOT EXISTS `selections` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`course_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
`course_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`book_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`dept_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
`dept_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
`book_desc` text COLLATE utf8_unicode_ci NOT NULL,
`position` int(11) NOT NULL,
`year` int(11) NOT NULL,
`course_season` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
`course_semester` int(11) NOT NULL,
PRIMARY KEY (`id`),
KEY `book_code` (`book_code`),
KEY `dept_code` (`dept_code`),
KEY `position` (`position`),
KEY `year` (`year`),
KEY `course_code` (`course_code`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;
CREATE TABLE IF NOT EXISTS `books` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`code` varchar(45) DEFAULT NULL,
`isbn` varchar(45) DEFAULT NULL,
`title` varchar(255) DEFAULT NULL,
`subtitle` varchar(255) DEFAULT NULL,
`authors` varchar(255) DEFAULT NULL,
`description` text,
`publisher` varchar(255) DEFAULT NULL,
`distributor` varchar(255) DEFAULT NULL,
`url` varchar(255) DEFAULT NULL,
`sample` varchar(255) DEFAULT NULL,
`toc` varchar(255) DEFAULT NULL,
`cover` varchar(255) DEFAULT NULL,
`backcover` varchar(255) DEFAULT NULL,
`dimensions` varchar(45) DEFAULT NULL,
`topics` text,
`keywords` text,
`type` varchar(45) DEFAULT NULL,
`pages` varchar(45) DEFAULT NULL,
`edition` varchar(45) DEFAULT NULL,
`year` varchar(45) DEFAULT NULL,
`covertype` varchar(45) DEFAULT NULL,
PRIMARY KEY (`id`),
KEY `book_code_idx` (`code`),
KEY `book_isbn_idx` (`isbn`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;
DELETE FROM selections WHERE year = #{year};
"""
insert = (table, obj) ->
quote = (value) -> "\"#{value.replace /"/g, '\\"'}\""
keys = Object.keys obj
values = (quote("#{obj[key]}") for key in keys)
"""
INSERT INTO #{table} (#{keys.join ','}) VALUES (#{values.join ','});
"""
rx =
url: new RegExp "^/public/departments/courses/(\\d+)/#{year}$"
course: /^Μάθημα \[([^\]]+)\]: (.*)/
book: /^Βιβλίο \[([^\]]+)\]: (.*)Λεπτομέρειες$/
semester: /^Εξάμηνο (\d+) - (Χειμερινό|Εαρινό|Ετήσιο)$/
crawler = new simplecrawler "service.eudoxus.gr"
crawler.initialPath = "/public/departments"
crawler.initialProtocol = "https"
crawler.maxConcurrency = 4
crawler.addFetchCondition (url) -> rx.url.test url.path
book_queue = do ->
script = path.join __dirname, "bookinfo.coffee"
processed = {}
worker = (book_id, done) ->
if processed[book_id]?
done()
else
options =
timeout: 30000
exec "#{phantomjs.path} #{script} #{book_id}", options, (err, stdout) ->
if err?
book_queue.push book_id
else
book = null
try
book = JSON.parse stdout
if book?
output.write "DELETE FROM books WHERE code = '#{book_id}';\n"
output.write insert "books", book
done()
async.queue worker, 8
crawler.on "fetchcomplete", (item, html, response) ->
dept_code = item.path.replace rx.url, "$1"
$ = cheerio.load html,
decodeEntities: yes
dept_name = "#{$("#header > h2").first().text()} | #{$("#header > h2").last().text()}"
$("ol > li > ul > li").each ->
$li = $ @
$ol = $li.closest "ol"
course_text = $ol.prevAll("h2").first().text()
semester_text = $ol.prevAll("h3").first().text()
book_text = $li.text().replace("\n", " ")
position = $li.parent().parent().prevAll().length + 1
book =
year: year
dept_code: dept_code
course_code: course_text.replace(rx.course, "$1")
book_code: book_text.replace(rx.book, "$1")
course_name: course_text.replace(rx.course, "$2")
dept_name: dept_name
course_semester: semester_text.replace(rx.semester, "$1")
course_season: semester_text.replace(rx.semester, "$2")
book_desc: book_text.replace(rx.book, "$2")
position: position
book_queue.push book.book_code
output.write insert "selections", book
crawler.on "complete", ->
book_queue.drain = -> process.exit()
crawler.start()
{
"name": "eudoxus",
"version": "0.0.0",
"description": "",
"main": "bookinfo.js",
"dependencies": {
"async": "^0.9.0",
"cheerio": "^0.17.0",
"phantomjs": "^1.9.7-15",
"simplecrawler": "^0.3.9"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"repository": {
"type": "git",
"url": "[email protected]:/ce38f1cea4694ed92463.git"
},
"author": "",
"license": "ISC"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment