Created
April 20, 2011 19:01
-
-
Save adam3smith/932338 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
	"translatorID":"aa6ba023-4fbe-407c-b0c3-5997887db1eb",
	"label":"VoxEU",
	"creator":"Sebastian Karcher",
	"target":"^https?://www\\.voxeu\\.org",
	"minVersion":"1.0",
	"maxVersion":"",
	"priority":100,
	"inRepository":"1",
	"translatorType":4,
	"lastUpdated":"2011-04-21 03:49:25"
}
/**
    Copyright (c) 2010, Erik Hetzner
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.
    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Recursively flattens nested arrays into a single one-level array.
 *
 * @param {Array} a - Array whose elements may themselves be arrays
 *     (null/undefined is treated as an empty array).
 * @returns {Array} A new array containing every non-array element of `a`,
 *     depth-first and in order. The input array is not modified.
 */
function flatten(a) {
    var retval = [];
    if (a === undefined || a === null) {
        return retval;
    }
    /* Walk numeric indices instead of for...in: for...in would also visit
       any extra enumerable property attached to the array or to
       Array.prototype and emit it as if it were an element. */
    for (var i = 0; i < a.length; i++) {
        /* skip holes in sparse arrays, as for...in did */
        if (!(i in a)) { continue; }
        var entry = a[i];
        if (entry instanceof Array) {
            retval = retval.concat(flatten(entry));
        } else {
            retval.push(entry);
        }
    }
    return retval;
}
/* Generic code */

/**
 * Namespace of the scraper framework. `_scrapers` collects the scraper
 * objects in the order they are pushed onto it.
 */
var FW = {
    _scrapers : []
};

/**
 * Base constructor shared by the scraper types. It provides hook
 * dispatch and the generic "evaluate a configured value" machinery.
 */
FW._Base = function () {
    /**
     * Calls this.hooks[hookName](item, doc, url) when `hooks` is an object
     * and the named entry is a function; otherwise does nothing.
     */
    this.callHook = function (hookName, item, doc, url) {
        var hooks = this['hooks'];
        /* typeof null is 'object', so rule null out explicitly */
        if (typeof hooks === 'object' && hooks !== null) {
            var hook = hooks[hookName];
            if (typeof hook === 'function') {
                hook(item, doc, url);
            }
        }
    };

    /**
     * Resolves a configured value:
     *  - string: returned unchanged
     *  - array: each element is resolved recursively, result is flattened
     *  - other object: its evaluate(doc, url) is called
     *  - function: called as val(doc, url)
     *  - anything else (including null): undefined
     */
    this.evaluateThing = function (val, doc, url) {
        var valtype = typeof val;
        if (valtype === 'string') {
            return val;
        } else if (valtype === 'object') {
            if (val === null) {
                return undefined;
            }
            if (val instanceof Array) {
                /* Keep a reference to `this`: calling a detached copy of
                   evaluateThing loses the receiver, which made every
                   nested array throw on the recursive call. */
                var self = this;
                var retval = val.map(function (i) {
                    return self.evaluateThing(i, doc, url);
                });
                return flatten(retval);
            } else {
                return val.evaluate(doc, url);
            }
        } else if (valtype === 'function') {
            return val(doc, url);
        } else {
            return undefined;
        }
    };

    /**
     * Resolves the value stored under this[key] via evaluateThing.
     */
    this.evaluate = function (key, doc, url) {
        return this.evaluateThing(this[key], doc, url);
    };
};
/**
 * Registers a single-item scraper: builds an FW._Scraper from `init`
 * and appends it to FW._scrapers. Returns nothing.
 */
FW.Scraper = function (init) {
    var scraper = new FW._Scraper(init);
    FW._scrapers.push(scraper);
};
/**
 * Scraper that produces one Zotero item from a page.
 *
 * Every property of `init` is copied onto the instance. Properties named
 * after an entry of _singleFieldNames, plus "creators", "attachments" and
 * "tags", are resolved through this.evaluate() when makeItems() runs.
 *
 * @param {Object} init - scraper configuration (may be omitted).
 */
FW._Scraper = function (init) {
    /* `var` keeps the loop variable local; the undeclared `x` leaked a
       global and throws a ReferenceError in strict-mode code. */
    for (var x in init) {
        this[x] = init[x];
    }

    /* Item fields that receive a single value (first element when the
       evaluated result is an array). */
    this._singleFieldNames = [
        "abstractNote", "applicationNumber", "archive", "archiveLocation",
        "artworkMedium", "artworkSize", "assignee", "audioFileType",
        "audioRecordingType", "billNumber", "blogTitle", "bookTitle",
        "callNumber", "caseName", "code", "codeNumber",
        "codePages", "codeVolume", "committee", "company",
        "conferenceName", "country", "court", "date",
        "dateDecided", "dateEnacted", "dictionaryTitle", "distributor",
        "docketNumber", "documentNumber", "DOI", "edition",
        "encyclopediaTitle", "episodeNumber", "extra", "filingDate",
        "firstPage", "forumTitle", "genre", "history",
        "institution", "interviewMedium", "ISBN", "ISSN",
        "issue", "issueDate", "issuingAuthority", "journalAbbreviation",
        "label", "language", "legalStatus", "legislativeBody",
        "letterType", "libraryCatalog", "manuscriptType", "mapType",
        "medium", "meetingName", "nameOfAct", "network",
        "number", "numberOfVolumes", "numPages", "pages",
        "patentNumber", "place", "postType", "presentationType",
        "priorityNumbers", "proceedingsTitle", "programTitle", "programmingLanguage",
        "publicLawNumber", "publicationTitle", "publisher", "references",
        "reportNumber", "reportType", "reporter", "reporterVolume",
        "rights", "runningTime", "scale", "section",
        "series", "seriesNumber", "seriesText", "seriesTitle",
        "session", "shortTitle", "studio", "subject",
        "system", "thesisType", "title", "type",
        "university", "url", "version", "videoRecordingType",
        "volume", "websiteTitle", "websiteType" ];

    /**
     * Builds the item for `doc` / `url`.
     *
     * @returns {Array} one-element array holding the new Zotero.Item.
     */
    this.makeItems = function (doc, url) {
        var item = new Zotero.Item(this.itemType);
        /* default; overwritten below when the scraper configures "url" */
        item.url = url;

        var fieldNames = this._singleFieldNames;
        for (var i = 0; i < fieldNames.length; i++) {
            var field = fieldNames[i];
            if (this[field]) {
                var fieldVal = this.evaluate(field, doc, url);
                if (fieldVal instanceof Array) {
                    item[field] = fieldVal[0];
                } else {
                    item[field] = fieldVal;
                }
            }
        }

        var multiFields = ["creators", "attachments", "tags"];
        for (var j = 0; j < multiFields.length; j++) {
            var key = multiFields[j];
            var val = this.evaluate(key, doc, url);
            if (val) {
                for (var k = 0; k < val.length; k++) {
                    item[key].push(val[k]);
                }
            }
        }
        return [item];
    };
};
/* Scrapers inherit callHook / evaluateThing / evaluate from FW._Base. */
FW._Scraper.prototype = new FW._Base();
/**
 * Registers a multiple-item scraper: builds an FW._MultiScraper from
 * `init` and appends it to FW._scrapers. Returns nothing.
 */
FW.MultiScraper = function (init) {
    var multi = new FW._MultiScraper(init);
    FW._scrapers.push(multi);
};
/**
 * Scraper for pages listing several items (e.g. search results).
 *
 * Every property of `init` is copied onto the instance. makeItems() reads
 * the configured `titles`, `urls` and optional `attachments`,
 * `beforeFilter` and `itemTrans`.
 *
 * @param {Object} init - scraper configuration (may be omitted).
 */
FW._MultiScraper = function (init) {
    /* `var` keeps the loop variable local; the undeclared `x` leaked a
       global and throws a ReferenceError in strict-mode code. */
    for (var x in init) {
        this[x] = init[x];
    }

    /**
     * Pairs titles with urls by position.
     * @returns {Object} map of url -> title (empty when `titles` is falsy).
     */
    this._mkSelectItems = function (titles, urls) {
        var items = {};
        var count = titles ? titles.length : 0;
        for (var i = 0; i < count; i++) {
            items[urls[i]] = titles[i];
        }
        return items;
    };

    /**
     * Passes the url -> title map to Zotero.selectItems and collects the
     * keys of whatever it returns.
     * @returns {Array} the selected keys (urls).
     */
    this._selectItems = function (titles, urls) {
        var items = [];
        var chosen = Zotero.selectItems(this._mkSelectItems(titles, urls));
        /* `chosen` is a dictionary, so for...in over its keys is intended */
        for (var j in chosen) {
            items.push(j);
        }
        return items;
    };

    /**
     * Pairs the evaluated `attachments` with urls by position.
     * @returns {Object} map of url -> attachment (empty when no attachments
     *     are configured).
     */
    this._mkAttachments = function (doc, url, urls) {
        var attachmentsArray = this.evaluate('attachments', doc, url);
        var attachmentsDict = {};
        if (attachmentsArray) {
            var count = urls ? urls.length : 0;
            for (var i = 0; i < count; i++) {
                attachmentsDict[urls[i]] = attachmentsArray[i];
            }
        }
        return attachmentsDict;
    };

    /**
     * Lets the user pick entries, retrieves each picked url and runs an
     * item scraper on it.
     *
     * @returns {Array} the first item produced for every selected url.
     */
    this.makeItems = function (doc, url) {
        Zotero.debug("Entering MultiScraper.makeItems");
        if (this.beforeFilter) {
            var newurl = this.beforeFilter(doc, url);
            if (newurl != url) {
                /* Fetch the rewritten url: the original re-fetched `url`,
                   pairing the old page's document with the new url. */
                return this.makeItems(Zotero.Utilities.retrieveDocument(newurl), newurl);
            }
        }
        var titles = this.evaluate('titles', doc, url);
        var urls = this.evaluate('urls', doc, url);
        var itemsToUse = this._selectItems(titles, urls);
        var attachments = this._mkAttachments(doc, url, urls);
        /* NOTE(review): _selectItems always returns an array, which is
           truthy even when empty, so this branch is not reachable as
           written. Kept unchanged; confirm the intended cancel handling. */
        if (!itemsToUse) {
            Zotero.done(true);
            return [];
        }
        var madeItems = [];
        for (var i = 0; i < itemsToUse.length; i++) {
            var url1 = itemsToUse[i];
            var doc1 = Zotero.Utilities.retrieveDocument(url1);
            var itemTrans;
            if (this.itemTrans) {
                itemTrans = this.itemTrans;
            } else {
                itemTrans = FW.getScraper(doc1, url1);
            }
            var items = itemTrans.makeItems(doc1, url1, attachments[url1]);
            madeItems.push(items[0]);
        }
        return madeItems;
    };
};
/* Multi-scrapers inherit callHook / evaluateThing / evaluate from FW._Base. */
FW._MultiScraper.prototype = new FW._Base();
/**
 * Factory: returns a new FW._DelegateTranslator built from `init`.
 * Unlike FW.Scraper / FW.MultiScraper, it does not register the object
 * in FW._scrapers.
 */
FW.DelegateTranslator = function (init) {
    var delegate = new FW._DelegateTranslator(init);
    return delegate;
};
/**
 * Item producer that hands the source of a url to another Zotero
 * translator.
 *
 * Every property of `init` is copied onto the instance; `translatorType`
 * is passed to Zotero.loadTranslator and `translatorId` to setTranslator
 * on the loaded translator.
 *
 * @param {Object} init - configuration with translatorType / translatorId.
 */
FW._DelegateTranslator = function (init) {
    /* `var` keeps the loop variable local; the undeclared `x` leaked a
       global and throws a ReferenceError in strict-mode code. */
    for (var x in init) {
        this[x] = init[x];
    }
    this._translator = Zotero.loadTranslator(this.translatorType);
    this._translator.setTranslator(this.translatorId);

    /**
     * Retrieves the source of `url`, feeds it to the delegate translator
     * and returns the item delivered to its "itemDone" handler.
     *
     * @param doc - unused here.
     * @param {string} url - address whose source is translated.
     * @param [attachments] - when truthy, stored on the item as
     *     item.attachments.
     * @returns {Array} one-element array with the last item seen by the
     *     handler (undefined if the handler was never called).
     */
    this.makeItems = function (doc, url, attachments) {
        Zotero.debug("Entering DelegateTranslator.makeItems");
        var tmpItem;
        var text = Zotero.Utilities.retrieveSource(url);
        this._translator.setHandler("itemDone", function (obj, item) {
            tmpItem = item;
            /* this does not seem to be working */
            if (attachments) { item.attachments = attachments; }
        });
        this._translator.setString(text);
        this._translator.translate();
        Zotero.debug("Leaving DelegateTranslator.makeItems");
        return [tmpItem];
    };
};
/* Attach the prototype to the constructor, FW._DelegateTranslator. The
   original assigned it to the factory FW.DelegateTranslator, which always
   returns a `new FW._DelegateTranslator(...)`, so delegate instances never
   inherited anything from FW._Scraper / FW._Base. */
FW._DelegateTranslator.prototype = new FW._Scraper();
/**
 * Chainable pipeline of value filters. Each builder method appends one
 * filter and returns `this`; _applyFilters() later runs the filters, in
 * order, over an array of values.
 */
FW._StringMagic = function () {
    this._filters = [];

    /** Appends `filter` (called as filter(value, doc)) and returns this. */
    this.addFilter = function (filter) {
        this._filters.push(filter);
        return this;
    };

    /** Splits each string on `re`, dropping empty pieces. */
    this.split = function (re) {
        return this.addFilter(function (s) {
            return s.split(re).filter(function (e) { return (e != ""); });
        });
    };

    /**
     * Replaces `s1` with `s2` in each string that matches `s1`.
     * NOTE(review): `flags` is forwarded as a third argument to
     * String.prototype.replace, which is not part of the standard
     * signature; confirm the target engine still honours it.
     */
    this.replace = function (s1, s2, flags) {
        return this.addFilter(function (s) {
            if (s.match(s1)) {
                return s.replace(s1, s2, flags);
            } else {
                return s;
            }
        });
    };

    /** Puts `prefix` in front of each string. */
    this.prepend = function (prefix) {
        return this.replace(/^/, prefix);
    };

    /** Puts `postfix` at the end of each string. */
    this.append = function (postfix) {
        return this.replace(/$/, postfix);
    };

    /** Replaces `toStrip` with the empty string. */
    this.remove = function (toStrip, flags) {
        return this.replace(toStrip, '', flags);
    };

    /** Applies Zotero.Utilities.trim to each value. */
    this.trim = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.trim(s); });
    };

    /** Applies Zotero.Utilities.trimInternal to each value. */
    this.trimInternal = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.trimInternal(s); });
    };

    /**
     * Keeps capture group `group` (default 0, the whole match) of `re`;
     * values that do not match are dropped.
     */
    this.match = function (re, group) {
        if (!group) group = 0;
        return this.addFilter(function (s) {
            var m = s.match(re);
            /* String.prototype.match returns null, not undefined, when
               nothing matches; the original check let m[group] throw. */
            if (m === undefined || m === null) { return undefined; }
            return m[group];
        });
    };

    /** Applies Zotero.Utilities.cleanAuthor(s, type, useComma). */
    this.cleanAuthor = function (type, useComma) {
        return this.addFilter(function (s) { return Zotero.Utilities.cleanAuthor(s, type, useComma); });
    };

    /** Replaces each value by its property `field`. */
    this.key = function (field) {
        return this.addFilter(function (n) { return n[field]; });
    };

    /** Applies Zotero.Utilities.capitalizeTitle to each value. */
    this.capitalizeTitle = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.capitalizeTitle(s); });
    };

    /** Applies Zotero.Utilities.unescapeHTML to each value. */
    this.unescapeHTML = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.unescapeHTML(s); });
    };

    /** Applies the global unescape() to each value. */
    this.unescape = function () {
        return this.addFilter(function (s) { return unescape(s); });
    };

    /**
     * Turns each truthy value into { url, type, title }; falsy values are
     * dropped.
     */
    this.makeAttachment = function (type, title) {
        var filter = function (url) {
            if (url) {
                return { url : url,
                         type : type,
                         title : title };
            } else {
                return undefined;
            }
        };
        return this.addFilter(filter);
    };

    /**
     * Runs every filter over `a`. Before each filter the array is
     * flattened; undefined/null entries are removed before and after each
     * filter. A filter that throws drops that value and is logged through
     * Zotero.debug.
     *
     * @param {Array} a - input values.
     * @param doc1 - passed to every filter as second argument.
     * @returns {Array} the surviving values.
     */
    this._applyFilters = function (a, doc1) {
        Zotero.debug("Entering StringMagic._applyFilters");
        var isPresent = function (x) { return ((x !== undefined) && (x !== null)); };
        /* Declared index loop: the original `for (i in ...)` leaked a
           global `i` and throws a ReferenceError in strict-mode code. */
        for (var i = 0; i < this._filters.length; i++) {
            a = flatten(a);
            a = a.filter(isPresent);
            for (var j = 0 ; j < a.length ; j++) {
                try {
                    a[j] = this._filters[i](a[j], doc1);
                } catch (x) {
                    a[j] = undefined;
                    Zotero.debug("Caught exception " + x + "on filter: " + this._filters[i]);
                }
            }
            /* filter again: values may have become undefined or null */
            a = a.filter(isPresent);
        }
        return a;
    };
};
/** Factory: returns a new FW._PageText. */
FW.PageText = function () {
    var pageText = new FW._PageText();
    return pageText;
};

/**
 * Value source whose input is the page markup, i.e.
 * doc.documentElement.innerHTML, run through the configured filters.
 */
FW._PageText = function () {
    this._filters = [];

    /**
     * @param doc - document whose documentElement.innerHTML is read.
     * @returns {Array|boolean} the filtered values, or false when none
     *     are left.
     */
    this.evaluate = function (doc) {
        var results = this._applyFilters([doc.documentElement.innerHTML], doc);
        return (results.length == 0) ? false : results;
    };
};

/* Inherit the filter builders and _applyFilters. */
FW._PageText.prototype = new FW._StringMagic();
/** Factory: returns a new FW._Url. */
FW.Url = function () {
    var urlSource = new FW._Url();
    return urlSource;
};

/**
 * Value source whose input is the url handed to evaluate(), run through
 * the configured filters.
 */
FW._Url = function () {
    this._filters = [];

    /**
     * @param doc - passed on to the filters.
     * @param url - the single input value.
     * @returns {Array|boolean} the filtered values, or false when none
     *     are left.
     */
    this.evaluate = function (doc, url) {
        var results = this._applyFilters([url], doc);
        return (results.length == 0) ? false : results;
    };
};

/* Inherit the filter builders and _applyFilters. */
FW._Url.prototype = new FW._StringMagic();
/** Factory: returns a new FW._Xpath for `xpathExpr`. */
FW.Xpath = function (xpathExpr) {
    var source = new FW._Xpath(xpathExpr);
    return source;
};

/**
 * Value source whose input is every result of an XPath expression
 * evaluated against the document, run through the configured filters.
 *
 * @param _xpath - the expression handed to doc.evaluate().
 */
FW._Xpath = function (_xpath) {
    this._xpath = _xpath;
    this._filters = [];

    /**
     * Adds a filter that replaces an object having a truthy textContent
     * by that textContent; every other value passes through unchanged.
     */
    this.text = function () {
        this.addFilter(function (n) {
            return (typeof n === 'object' && n.textContent) ? n.textContent : n;
        });
        return this;
    };

    /**
     * Adds a filter that evaluates `xpath` relative to each value and
     * keeps the first result of that evaluation.
     */
    this.sub = function (xpath) {
        this.addFilter(function (n, doc) {
            var result = doc.evaluate(xpath, n, null, XPathResult.ANY_TYPE, null);
            return result ? result.iterateNext() : undefined;
        });
        return this;
    };

    /**
     * @param doc - document the expression is evaluated against.
     * @returns {Array|boolean} the filtered results, or false when none
     *     are left.
     */
    this.evaluate = function (doc) {
        var it = doc.evaluate(this._xpath, doc, null, XPathResult.ANY_TYPE, null);
        var nodes = [];
        /* collect results until iterateNext() yields a falsy value */
        for (var node = it.iterateNext(); node; node = it.iterateNext()) {
            nodes.push(node);
        }
        var results = this._applyFilters(nodes, doc);
        return (results.length == 0) ? false : results;
    };
};

/* Inherit the filter builders and _applyFilters. */
FW._Xpath.prototype = new FW._StringMagic();
/**
 * Finds the first registered scraper that applies to the page and
 * returns its evaluated `itemType`.
 *
 * A scraper applies when it has no `detect` property, or when its
 * evaluated `detect` is a non-empty array-like whose first entry is
 * truthy.
 *
 * @returns the item type of the first applicable scraper, or undefined
 *     when none applies.
 */
FW.detectWeb = function (doc, url) {
    /* index loop: for...in would also visit non-index properties */
    for (var i = 0; i < FW._scrapers.length; i++) {
        var scraper = FW._scrapers[i];
        var itemType = scraper.evaluate('itemType', doc, url);
        if (!scraper.detect) {
            return itemType;
        }
        var v = scraper.evaluate('detect', doc, url);
        /* `v &&` guards against detect evaluating to undefined/null,
           which made `v.length` throw and aborted detection. */
        if (v && v.length > 0 && v[0]) {
            return itemType;
        }
    }
    return undefined;
};
/**
 * Returns the first registered scraper that applies to the page, using
 * the same rule as FW.detectWeb: a scraper applies when it has no
 * `detect` property, or when its evaluated `detect` is a non-empty
 * array-like whose first entry is truthy.
 *
 * The original required a truthy `detect` result from every candidate,
 * so a scraper without `detect` was announced by FW.detectWeb but could
 * never be returned here.
 *
 * @returns the matching scraper object, or undefined when none applies.
 */
FW.getScraper = function (doc, url) {
    for (var i = 0; i < FW._scrapers.length; i++) {
        var scraper = FW._scrapers[i];
        if (!scraper.detect) {
            return scraper;
        }
        var v = scraper.evaluate('detect', doc, url);
        if (v && v.length > 0 && v[0]) {
            return scraper;
        }
    }
    return undefined;
};
/**
 * Scrapes the page: picks the scraper via FW.getScraper, lets it build
 * the items, then for each item runs the scraper's 'scraperDone' hook
 * and calls item.complete().
 *
 * @throws {Error} when no scraper matches the page.
 */
FW.doWeb = function (doc, url) {
    Zotero.debug("Entering FW.doWeb");
    var scraper = FW.getScraper(doc, url);
    if (!scraper) {
        /* fail with a readable message instead of a TypeError on
           `scraper.makeItems` */
        throw new Error("FW.doWeb: no scraper matches " + url);
    }
    var items = scraper.makeItems(doc, url);
    /* index loop: for...in would also visit non-index properties */
    for (var i = 0; i < items.length; i++) {
        var item = items[i];
        /* A producer may yield no item for an entry; skip it so the
           remaining items are still completed. */
        if (item === undefined || item === null) { continue; }
        scraper.callHook('scraperDone', item, doc, url);
        item.complete();
    }
    Zotero.debug("Leaving FW.doWeb");
};
/* End generic code */
/* Examples:
   Individual item
   http://www.voxeu.org/index.php?q=node/6258
   Search results
   http://www.voxeu.org/index.php?q=search/node/eichengreen */

/**
 * Translator entry point for detection; forwards to FW.detectWeb and
 * returns its result.
 */
function detectWeb(doc, url) {
    var detected = FW.detectWeb(doc, url);
    return detected;
}

/**
 * Translator entry point for scraping; forwards to FW.doWeb and returns
 * its result.
 */
function doWeb(doc, url) {
    var outcome = FW.doWeb(doc, url);
    return outcome;
}
/** Articles */
FW.Scraper({
    itemType: 'blogPost',
    /* a page counts as an article when it has a "terms" block */
    detect: FW.Xpath('//div[@class="terms"]'),

    title: FW.Xpath('//div[@id="main"]/div[@id="squeeze"]/h1')
        .text()
        .trim(),
    attachments: FW.Url()
        .makeAttachment("text/html", "voxEU snapshot"),
    creators: FW.Xpath('//table[@class="layouttable"]/tbody/*/td/p/a ')
        .text()
        .cleanAuthor("author"),
    abstractNote: FW.Xpath('//table[@class="layouttable"]/tbody/tr/td/div/em')
        .text(),
    date: FW.Xpath('//table[@class="layouttable"]/tbody/*/td/p/text()[last()] ')
        .text(),
    publicationTitle: "VoxEU.org",
    tags: FW.Xpath('//div[@class="terms"]//li')
        .text()
});
/** Search results */
FW.MultiScraper({
    itemType: "multiple",
    /* a page counts as a result list when it has a search-results <dl> */
    detect: FW.Xpath('//div[@class="content"]/dl[contains(@class, "search-results")]'),

    /* link text of every result entry */
    titles: FW.Xpath('//div[@class="content"]/dl/dt[@class="title"]/a')
        .text(),
    /* href of the same links, in the same order as the titles */
    urls: FW.Xpath('//div[@class="content"]/dl/dt[@class="title"]/a')
        .key('href')
        .text()
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment