alxarch · August 29, 2015 14:05
diff --git a/.gitignore b/.gitignore
 node_modules/
diff --git a/README.md b/README.md
diff --git a/bookinfo.coffee b/bookinfo.coffee
 ###
 PhantomJS script to collect book info from service.eudoxus.gr
 ###

 # Print the command-line synopsis, then ask PhantomJS to exit with status `code`.
 usage = (code) ->
 	synopsis = """
 	Usage: phantomjs bookinfo.coffee <book_id>
 	"""
 	console.log synopsis
 	phantom.exit code

 # Book id to look up, read from index 1 of the PhantomJS argument list.
 id = require("system").args[1]
 usage(1) unless id

 pg = require("webpage").create()
 # Receives the payload passed to window.callPhantom by the page script:
 # print it as JSON on stdout and end the PhantomJS process.
 pg.onCallback = (data) ->
 	console.log JSON.stringify data
 	phantom.exit()

 # Open the Eudoxus search page, point it at the requested book and scrape the
 # details popup. The scraped record is handed back through window.callPhantom.
 pg.open "https://service.eudoxus.gr/search/", (status) ->
 	# phantom.exit may not halt the script immediately, so return explicitly
 	# instead of evaluating against a page that failed to load.
 	unless status is "success"
 		phantom.exit 1
 		return

 	# Everything inside this function runs in the page context; `id` is passed
 	# in as an argument because outer variables are not visible there.
 	pg.evaluate (id) ->
 		# Query helpers scoped to `el` (the document body by default).
 		$ = (sel, el = document.body) -> el.querySelector sel
 		$$ = (sel, el = document.body) -> [].slice.apply el.querySelectorAll sel

 		# Poll every `ms` milliseconds until `sel` matches an element, then stop
 		# polling and pass that element to `callback`.
 		waitForElement = (sel, ms, callback) ->
 			fn = ->
 				result = $ sel
 				if result?
 					clearInterval interval
 					callback result

 			interval = setInterval fn, ms

 		# Dispatch a synthetic bubbling, cancelable click on `el`; returns the event.
 		click = (el) ->
 			event = document.createEvent "MouseEvents"
 			event.initEvent "click", yes, yes
 			el.dispatchEvent event
 			event

 		# Build the book record from the details popup element.
 		parseBookDetailsPopup = (popup) ->
 			covers = $$(".search-popup-left .gwt-Image", popup).map (img) -> img.getAttribute "src"
 			pdfs = $$(".search-popup-left .search-popup-link", popup).map (link) -> "#{link.href}"
 			fields = {}

 			# Label/value pairs: the first cell of each row is the label and its
 			# next sibling holds the value. Scoped to `popup` like every other query.
 			$$(".search-popup-details-table td:nth-child(1)", popup).forEach (td) ->
 				fields[td.textContent] = td.nextElementSibling?.textContent

 			["description", "authors", "title", "subtitle"].forEach (key) ->
 				fields[key] = $(".search-popup-#{key}", popup)?.textContent

 			code:         id
 			description:  fields.description
 			title:        fields.title
 			subtitle:     fields.subtitle
 			# Guarded: a popup without an authors element must not throw, or the
 			# callback to PhantomJS would never be made.
 			authors:      fields.authors?.replace "Συγγραφείς: ", ""
 			edition:      fields["Αριθμός Έκδοσης"]?.replace " εκδ.", ""
 			year:         fields["Έτος Έκδοσης"]
 			keywords:     fields["Λέξεις κλειδιά"]
 			topics:       fields["Θεματικές Ενότητες"]
 			isbn:         fields["ISBN"]
 			publisher:    fields["Εκδόσεις"]
 			distributor:  fields["Διαθέτης (Εκδότης)"]
 			type:         fields["Τύπος"]
 			covertype:    fields["Δέσιμο"]
 			pages:        fields["Αριθμός Σελίδων"]
 			dimensions:   fields["Διαστάσεις"]
 			url:          $(".search-popup-details-table td .gwt-Anchor", popup)?.href
 			cover:        covers[0]
 			backcover:    covers[1]
 			toc:          pdfs[0]
 			sample:       pdfs[1]

 		# Wait for the first search result, click it, then wait for the popup.
 		waitForElement ".search-resultsPanel .search-hyperlink", 50, (link) ->
 			click link
 			waitForElement ".search-popup", 10, (popup) ->
 				data = parseBookDetailsPopup popup
 				window.callPhantom data

 		# Setting the hash starts the search for this book id.
 		window.location.hash = "a/id:#{id}/0"
 	, id
diff --git a/crawl.coffee b/crawl.coffee
 #!/usr/bin/env coffee
 ###
 # Crawls service.eudoxus.gr to get all book selections for a year
 # also fetches book info
 # outputs mysql statements that can be piped into mysql directly or to a file
 # Example:
 # coffee crawl.coffee 2014 | mysql -u username -p somedbname
 ###

 # Year to crawl, parsed as a base-10 integer from the first script argument.
 # Exit with status 1 when it is missing or does not parse to a non-zero number.
 year = parseInt process.argv[2], 10
 process.exit(1) unless year

 fs = require "fs"
 path = require "path"
 async = require "async"
 {exec} = require "child_process"
 {format} = require "util"
 simplecrawler = require "simplecrawler"
 cheerio = require "cheerio"
 phantomjs = require "phantomjs"

 # All generated SQL goes to stdout. Start with the schema (created only when
 # absent) and remove the selections already stored for the requested year.
 output = process.stdout
 output.write """
 CREATE TABLE IF NOT EXISTS `selections` (
 	`id` int(11) NOT NULL AUTO_INCREMENT,
 	`course_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
 	`course_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
 	`book_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
 	`dept_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
 	`dept_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
 	`book_desc` text COLLATE utf8_unicode_ci NOT NULL,
 	`position` int(11) NOT NULL,
 	`year` int(11) NOT NULL,
 	`course_season` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
 	`course_semester` int(11) NOT NULL,
 	PRIMARY KEY (`id`),
 	KEY `book_code` (`book_code`),
 	KEY `dept_code` (`dept_code`),
 	KEY `position` (`position`),
 	KEY `year` (`year`),
 	KEY `course_code` (`course_code`)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;

 CREATE TABLE IF NOT	EXISTS `books` (
 	`id` int(11) NOT NULL AUTO_INCREMENT,
 	`code` varchar(45) DEFAULT NULL,
 	`isbn` varchar(45) DEFAULT NULL,
 	`title` varchar(255) DEFAULT NULL,
 	`subtitle` varchar(255) DEFAULT NULL,
 	`authors` varchar(255) DEFAULT NULL,
 	`description` text,
 	`publisher` varchar(255) DEFAULT NULL,
 	`distributor` varchar(255) DEFAULT NULL,
 	`url` varchar(255) DEFAULT NULL,
 	`sample` varchar(255) DEFAULT NULL,
 	`toc` varchar(255) DEFAULT NULL,
 	`cover` varchar(255) DEFAULT NULL,
 	`backcover` varchar(255) DEFAULT NULL,
 	`dimensions` varchar(45) DEFAULT NULL,
 	`topics` text,
 	`keywords` text,
 	`type` varchar(45) DEFAULT NULL,
 	`pages` varchar(45) DEFAULT NULL,
 	`edition` varchar(45) DEFAULT NULL,
 	`year` varchar(45) DEFAULT NULL,
 	`covertype` varchar(45) DEFAULT NULL,
 	PRIMARY KEY (`id`),
 	KEY `book_code_idx` (`code`),
 	KEY `book_isbn_idx` (`isbn`)
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;


 DELETE FROM selections WHERE year = #{year};

 """

 # Build a single-row INSERT statement (terminated by a newline) for `table`.
 #   table - table name, used verbatim
 #   obj   - column name -> value map; key order defines the column order
 # Values are written as double-quoted string literals; null and undefined
 # become SQL NULL instead of the strings "null"/"undefined".
 insert = (table, obj) ->
 	quote = (value) ->
 		return "NULL" unless value?
 		# Escape backslashes first so the escapes added for quotes survive.
 		escaped = String(value).replace(/\\/g, "\\\\").replace(/"/g, '\\"')
 		"\"#{escaped}\""
 	keys = Object.keys obj
 	values = (quote(obj[key]) for key in keys)
 	"""
 	INSERT INTO #{table} (#{keys.join ','}) VALUES (#{values.join ','});

 	"""
 # Patterns that select department course pages for `year` and split the
 # scraped course, book and semester labels into their parts.
 rx = {
 	url: new RegExp("^/public/departments/courses/(\\d+)/#{year}$")
 	course: /^Μάθημα \[([^\]]+)\]: (.*)/
 	book: /^Βιβλίο \[([^\]]+)\]: (.*)Λεπτομέρειες$/
 	semester: /^Εξάμηνο (\d+) - (Χειμερινό|Εαρινό|Ετήσιο)$/
 }

 # Crawl service.eudoxus.gr over https starting at the departments index,
 # with at most 4 concurrent requests, fetching only URLs whose path matches rx.url.
 crawler = new simplecrawler("service.eudoxus.gr")
 crawler.initialProtocol = "https"
 crawler.initialPath = "/public/departments"
 crawler.maxConcurrency = 4
 crawler.addFetchCondition (parsedUrl) ->
 	rx.url.test parsedUrl.path

 # Queue (8 workers) that looks up each referenced book by running the
 # PhantomJS scraper and writes the resulting `books` row to the output.
 book_queue = do ->
 	# execFile passes arguments without going through a shell, so a scraped
 	# book id can never be interpreted as shell syntax.
 	{execFile} = require "child_process"
 	script = path.join __dirname, "bookinfo.coffee"
 	max_attempts = 3
 	# Prototype-less maps, so ids such as "constructor" are not seen as present.
 	processed = Object.create null
 	attempts = Object.create null
 	# Escape a value for use inside a single-quoted SQL string literal.
 	sql_escape = (value) ->
 		"#{value}".replace(/\\/g, "\\\\").replace(/'/g, "\\'")
 	worker = (book_id, done) ->
 		# Look each book up once, however many courses list it.
 		return done() if processed[book_id]?
 		processed[book_id] = yes
 		attempts[book_id] = (attempts[book_id] or 0) + 1
 		options =
 			timeout: 30000
 		execFile phantomjs.path, [script, "#{book_id}"], options, (err, stdout) ->
 			if err?
 				# Retry failed lookups a bounded number of times, so a book that
 				# always fails cannot keep the queue (and the process) alive forever.
 				if attempts[book_id] < max_attempts
 					delete processed[book_id]
 					book_queue.push book_id
 			else
 				book = null
 				# Output that is not valid JSON is ignored on purpose.
 				try
 					book = JSON.parse stdout
 				if book?
 					output.write "DELETE FROM books WHERE code = '#{sql_escape book_id}';\n"
 					output.write insert "books", book
 			done()
 	async.queue worker, 8

 # For every fetched page, emit one `selections` row per listed book and queue
 # that book for a details lookup.
 crawler.on "fetchcomplete", (item, html, response) ->
 	dept_code = item.path.replace rx.url, "$1"
 	$ = cheerio.load html,
 		decodeEntities: yes
 	$headings = $("#header > h2")
 	dept_name = "#{$headings.first().text()} | #{$headings.last().text()}"
 	$("ol > li > ul > li").each ->
 		$li = $ @
 		$ol = $li.closest "ol"
 		course_text = $ol.prevAll("h2").first().text()
 		semester_text = $ol.prevAll("h3").first().text()
 		# Replace every line break, not only the first: `.` in rx.book does not
 		# match newlines, so a leftover "\n" would make the pattern miss.
 		book_text = $li.text().replace(/\n/g, " ")
 		position = $li.parent().parent().prevAll().length + 1

 		book =
 			year:             year
 			dept_code:        dept_code
 			course_code:      course_text.replace(rx.course, "$1")
 			book_code:        book_text.replace(rx.book, "$1")
 			course_name:      course_text.replace(rx.course, "$2")
 			dept_name:        dept_name
 			course_semester:  semester_text.replace(rx.semester, "$1")
 			course_season:    semester_text.replace(rx.semester, "$2")
 			book_desc:        book_text.replace(rx.book, "$2")
 			position:         position

 		book_queue.push book.book_code
 		output.write insert "selections", book
 		# Explicit empty return: a `false` from output.write would otherwise be
 		# returned to cheerio's `each` and stop the iteration early.
 		return

 # When crawling is finished, exit as soon as the book queue has nothing left.
 crawler.on "complete", ->
 	# `drain` is only called when the last queued task finishes, so exit right
 	# away if the queue is already empty and idle; otherwise wait for it.
 	if book_queue.length() is 0 and book_queue.running() is 0
 		process.exit()
 	else
 		book_queue.drain = -> process.exit()

 crawler.start()
diff --git a/package.json b/package.json
 {
  "name": "eudoxus",
  "version": "0.0.0",
  "description": "",
  "main": "bookinfo.js",
  "dependencies": {
    "async": "^0.9.0",
    "cheerio": "^0.17.0",
    "phantomjs": "^1.9.7-15",
    "simplecrawler": "^0.3.9"
  },
  "devDependencies": {},
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "repository": {
    "type": "git",
    "url": "[email protected]:/ce38f1cea4694ed92463.git"
  },
  "author": "",
  "license": "ISC"
 }
	###
	PhantomJS script to collect book info from service.eudoxus.gr
	###

	# Print the command-line synopsis, then ask PhantomJS to exit with status `code`.
	# (Body indentation restored; CoffeeScript needs it to parse the function.)
	usage = (code) ->
		console.log """
		Usage: phantomjs bookinfo.coffee <book_id>
		"""
		phantom.exit code

	# Book id to look up, read from index 1 of the PhantomJS argument list.
	id = require("system").args[1]
	usage(1) unless id

	pg = require("webpage").create()
	# Receives the payload passed to window.callPhantom by the page script:
	# print it as JSON on stdout and end the PhantomJS process.
	# (Body indentation restored so both statements belong to the callback.)
	pg.onCallback = (data) ->
		console.log JSON.stringify data
		phantom.exit()

	# Open the Eudoxus search page, point it at the requested book and scrape the
	# details popup. The scraped record is handed back through window.callPhantom.
	# (Nesting indentation restored; it is significant in CoffeeScript.)
	pg.open "https://service.eudoxus.gr/search/", (status) ->
		# phantom.exit may not halt the script immediately, so return explicitly
		# instead of evaluating against a page that failed to load.
		unless status is "success"
			phantom.exit 1
			return

		# Everything inside this function runs in the page context; `id` is passed
		# in as an argument because outer variables are not visible there.
		pg.evaluate (id) ->
			# Query helpers scoped to `el` (the document body by default).
			$ = (sel, el = document.body) -> el.querySelector sel
			$$ = (sel, el = document.body) -> [].slice.apply el.querySelectorAll sel

			# Poll every `ms` milliseconds until `sel` matches an element, then stop
			# polling and pass that element to `callback`.
			waitForElement = (sel, ms, callback) ->
				fn = ->
					result = $ sel
					if result?
						clearInterval interval
						callback result

				interval = setInterval fn, ms

			# Dispatch a synthetic bubbling, cancelable click on `el`; returns the event.
			click = (el) ->
				event = document.createEvent "MouseEvents"
				event.initEvent "click", yes, yes
				el.dispatchEvent event
				event

			# Build the book record from the details popup element.
			parseBookDetailsPopup = (popup) ->
				covers = $$(".search-popup-left .gwt-Image", popup).map (img) -> img.getAttribute "src"
				pdfs = $$(".search-popup-left .search-popup-link", popup).map (link) -> "#{link.href}"
				fields = {}

				# Label/value pairs: the first cell of each row is the label and its
				# next sibling holds the value. Scoped to `popup` like every other query.
				$$(".search-popup-details-table td:nth-child(1)", popup).forEach (td) ->
					fields[td.textContent] = td.nextElementSibling?.textContent

				["description", "authors", "title", "subtitle"].forEach (key) ->
					fields[key] = $(".search-popup-#{key}", popup)?.textContent

				code: id
				description: fields.description
				title: fields.title
				subtitle: fields.subtitle
				# Guarded: a popup without an authors element must not throw, or the
				# callback to PhantomJS would never be made.
				authors: fields.authors?.replace "Συγγραφείς: ", ""
				edition: fields["Αριθμός Έκδοσης"]?.replace " εκδ.", ""
				year: fields["Έτος Έκδοσης"]
				keywords: fields["Λέξεις κλειδιά"]
				topics: fields["Θεματικές Ενότητες"]
				isbn: fields["ISBN"]
				publisher: fields["Εκδόσεις"]
				distributor: fields["Διαθέτης (Εκδότης)"]
				type: fields["Τύπος"]
				covertype: fields["Δέσιμο"]
				pages: fields["Αριθμός Σελίδων"]
				dimensions: fields["Διαστάσεις"]
				url: $(".search-popup-details-table td .gwt-Anchor", popup)?.href
				cover: covers[0]
				backcover: covers[1]
				toc: pdfs[0]
				sample: pdfs[1]

			# Wait for the first search result, click it, then wait for the popup.
			waitForElement ".search-resultsPanel .search-hyperlink", 50, (link) ->
				click link
				waitForElement ".search-popup", 10, (popup) ->
					data = parseBookDetailsPopup popup
					window.callPhantom data

			# Setting the hash starts the search for this book id.
			window.location.hash = "a/id:#{id}/0"
		, id
	#!/usr/bin/env coffee
	###
	# Crawls service.eudoxus.gr to get all book selections for a year
	# also fetches book info
	# outputs mysql statements that can be piped into mysql directly or to a file
	# Example:
	# coffee crawl.coffee 2014 | mysql -u username -p somedbname
	###

	# Year to crawl, parsed as a base-10 integer from the first script argument.
	# Exit with status 1 when it is missing or does not parse to a non-zero number.
	year = parseInt process.argv[2], 10
	process.exit(1) unless year

	fs = require "fs"
	path = require "path"
	async = require "async"
	{exec} = require "child_process"
	{format} = require "util"
	simplecrawler = require "simplecrawler"
	cheerio = require "cheerio"
	phantomjs = require "phantomjs"

	# All generated SQL goes to stdout. Start with the schema (created only when
	# absent) and remove the selections already stored for the requested year.
	output = process.stdout
	output.write """
	CREATE TABLE IF NOT EXISTS `selections` (
	`id` int(11) NOT NULL AUTO_INCREMENT,
	`course_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
	`course_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
	`book_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
	`dept_code` varchar(100) COLLATE utf8_unicode_ci NOT NULL,
	`dept_name` varchar(255) COLLATE utf8_unicode_ci NOT NULL,
	`book_desc` text COLLATE utf8_unicode_ci NOT NULL,
	`position` int(11) NOT NULL,
	`year` int(11) NOT NULL,
	`course_season` varchar(50) COLLATE utf8_unicode_ci NOT NULL,
	`course_semester` int(11) NOT NULL,
	PRIMARY KEY (`id`),
	KEY `book_code` (`book_code`),
	KEY `dept_code` (`dept_code`),
	KEY `position` (`position`),
	KEY `year` (`year`),
	KEY `course_code` (`course_code`)
	) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;

	CREATE TABLE IF NOT EXISTS `books` (
	`id` int(11) NOT NULL AUTO_INCREMENT,
	`code` varchar(45) DEFAULT NULL,
	`isbn` varchar(45) DEFAULT NULL,
	`title` varchar(255) DEFAULT NULL,
	`subtitle` varchar(255) DEFAULT NULL,
	`authors` varchar(255) DEFAULT NULL,
	`description` text,
	`publisher` varchar(255) DEFAULT NULL,
	`distributor` varchar(255) DEFAULT NULL,
	`url` varchar(255) DEFAULT NULL,
	`sample` varchar(255) DEFAULT NULL,
	`toc` varchar(255) DEFAULT NULL,
	`cover` varchar(255) DEFAULT NULL,
	`backcover` varchar(255) DEFAULT NULL,
	`dimensions` varchar(45) DEFAULT NULL,
	`topics` text,
	`keywords` text,
	`type` varchar(45) DEFAULT NULL,
	`pages` varchar(45) DEFAULT NULL,
	`edition` varchar(45) DEFAULT NULL,
	`year` varchar(45) DEFAULT NULL,
	`covertype` varchar(45) DEFAULT NULL,
	PRIMARY KEY (`id`),
	KEY `book_code_idx` (`code`),
	KEY `book_isbn_idx` (`isbn`)
	) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci AUTO_INCREMENT=1;


	DELETE FROM selections WHERE year = #{year};

	"""

	# Build a single-row INSERT statement (terminated by a newline) for `table`.
	#   table - table name, used verbatim
	#   obj   - column name -> value map; key order defines the column order
	# Values are written as double-quoted string literals; null and undefined
	# become SQL NULL instead of the strings "null"/"undefined".
	# (Body indentation restored; CoffeeScript needs it to parse the function.)
	insert = (table, obj) ->
		quote = (value) ->
			return "NULL" unless value?
			# Escape backslashes first so the escapes added for quotes survive.
			escaped = String(value).replace(/\\/g, "\\\\").replace(/"/g, '\\"')
			"\"#{escaped}\""
		keys = Object.keys obj
		values = (quote(obj[key]) for key in keys)
		"""
		INSERT INTO #{table} (#{keys.join ','}) VALUES (#{values.join ','});

		"""
	# Patterns that select department course pages for `year` and split the
	# scraped course, book and semester labels into their parts.
	# (Property indentation restored; the semester alternation uses a plain `|`,
	# since an escaped `\|` would only match a literal pipe character.)
	rx =
		url: new RegExp "^/public/departments/courses/(\\d+)/#{year}$"
		course: /^Μάθημα \[([^\]]+)\]: (.*)/
		book: /^Βιβλίο \[([^\]]+)\]: (.*)Λεπτομέρειες$/
		semester: /^Εξάμηνο (\d+) - (Χειμερινό|Εαρινό|Ετήσιο)$/

	# Crawl service.eudoxus.gr over https starting at the departments index,
	# with at most 4 concurrent requests, fetching only URLs whose path matches rx.url.
	crawler = new simplecrawler("service.eudoxus.gr")
	crawler.initialProtocol = "https"
	crawler.initialPath = "/public/departments"
	crawler.maxConcurrency = 4
	crawler.addFetchCondition((parsedUrl) -> rx.url.test(parsedUrl.path))

	# Queue (8 workers) that looks up each referenced book by running the
	# PhantomJS scraper and writes the resulting `books` row to the output.
	# (Nesting indentation restored; it is significant in CoffeeScript.)
	book_queue = do ->
		# execFile passes arguments without going through a shell, so a scraped
		# book id can never be interpreted as shell syntax.
		{execFile} = require "child_process"
		script = path.join __dirname, "bookinfo.coffee"
		max_attempts = 3
		# Prototype-less maps, so ids such as "constructor" are not seen as present.
		processed = Object.create null
		attempts = Object.create null
		# Escape a value for use inside a single-quoted SQL string literal.
		sql_escape = (value) ->
			"#{value}".replace(/\\/g, "\\\\").replace(/'/g, "\\'")
		worker = (book_id, done) ->
			# Look each book up once, however many courses list it.
			return done() if processed[book_id]?
			processed[book_id] = yes
			attempts[book_id] = (attempts[book_id] or 0) + 1
			options =
				timeout: 30000
			execFile phantomjs.path, [script, "#{book_id}"], options, (err, stdout) ->
				if err?
					# Retry failed lookups a bounded number of times, so a book that
					# always fails cannot keep the queue (and the process) alive forever.
					if attempts[book_id] < max_attempts
						delete processed[book_id]
						book_queue.push book_id
				else
					book = null
					# Output that is not valid JSON is ignored on purpose.
					try
						book = JSON.parse stdout
					if book?
						output.write "DELETE FROM books WHERE code = '#{sql_escape book_id}';\n"
						output.write insert "books", book
				done()
		async.queue worker, 8

	# For every fetched page, emit one `selections` row per listed book and queue
	# that book for a details lookup.
	# (Nesting indentation restored; the department-name separator is a plain `|`.)
	crawler.on "fetchcomplete", (item, html, response) ->
		dept_code = item.path.replace rx.url, "$1"
		$ = cheerio.load html,
			decodeEntities: yes
		$headings = $("#header > h2")
		dept_name = "#{$headings.first().text()} | #{$headings.last().text()}"
		$("ol > li > ul > li").each ->
			$li = $ @
			$ol = $li.closest "ol"
			course_text = $ol.prevAll("h2").first().text()
			semester_text = $ol.prevAll("h3").first().text()
			# Replace every line break, not only the first: `.` in rx.book does not
			# match newlines, so a leftover "\n" would make the pattern miss.
			book_text = $li.text().replace(/\n/g, " ")
			position = $li.parent().parent().prevAll().length + 1

			book =
				year: year
				dept_code: dept_code
				course_code: course_text.replace(rx.course, "$1")
				book_code: book_text.replace(rx.book, "$1")
				course_name: course_text.replace(rx.course, "$2")
				dept_name: dept_name
				course_semester: semester_text.replace(rx.semester, "$1")
				course_season: semester_text.replace(rx.semester, "$2")
				book_desc: book_text.replace(rx.book, "$2")
				position: position

			book_queue.push book.book_code
			output.write insert "selections", book
			# Explicit empty return: a `false` from output.write would otherwise be
			# returned to cheerio's `each` and stop the iteration early.
			return

	# When crawling is finished, exit as soon as the book queue has nothing left.
	# (Handler body indentation restored.)
	crawler.on "complete", ->
		# `drain` is only called when the last queued task finishes, so exit right
		# away if the queue is already empty and idle; otherwise wait for it.
		if book_queue.length() is 0 and book_queue.running() is 0
			process.exit()
		else
			book_queue.drain = -> process.exit()

	crawler.start()
	{
	"name": "eudoxus",
	"version": "0.0.0",
	"description": "",
	"main": "bookinfo.js",
	"dependencies": {
	"async": "^0.9.0",
	"cheerio": "^0.17.0",
	"phantomjs": "^1.9.7-15",
	"simplecrawler": "^0.3.9"
	},
	"devDependencies": {},
	"scripts": {
	"test": "echo \"Error: no test specified\" && exit 1"
	},
	"repository": {
	"type": "git",
	"url": "[email protected]:/ce38f1cea4694ed92463.git"
	},
	"author": "",
	"license": "ISC"
	}