Created
May 12, 2011 21:49
-
-
Save adam3smith/969535 to your computer and use it in GitHub Desktop.
Washington Monthly.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
"translatorID":"e623eec7-ad54-4201-b709-654bf3fd7f70",
"label":"Washington Monthly",
"creator":"Sebastian Karcher",
"target":"^https?://www\\.washingtonmonthly\\.com",
"minVersion":"1.0",
"maxVersion":"",
"priority":100,
"inRepository":"1",
"translatorType":4,
"lastUpdated":"2011-05-13 01:21:01"
}
/**
Copyright (c) 2010, Erik Hetzner
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Recursively flattens nested arrays into one flat array.
 *
 * Array inputs are walked by numeric index, so enumerable properties that
 * are not elements (additions to Array.prototype, or the `index`/`input`
 * properties of a regex match result) never leak into the output. Holes in
 * sparse arrays are skipped. Non-array inputs have their own enumerable
 * properties collected; null and undefined yield an empty array.
 *
 * @param {Array|Object} a - Collection whose entries may themselves be arrays.
 * @returns {Array} A new array holding every non-array entry, in order.
 */
function flatten(a) {
    var retval = [];
    /* Adds one entry to the result, descending into it when it is an array. */
    var append = function (entry) {
        if (entry instanceof Array) {
            retval = retval.concat(flatten(entry));
        } else {
            retval.push(entry);
        }
    };
    if (a instanceof Array) {
        for (var i = 0; i < a.length; i++) {
            /* `i in a` is false for holes in a sparse array */
            if (i in a) { append(a[i]); }
        }
    } else {
        for (var key in a) {
            if (Object.prototype.hasOwnProperty.call(a, key)) { append(a[key]); }
        }
    }
    return retval;
}
/* Generic code */

/**
 * Namespace for the generic scraping framework.
 * `_scrapers` collects every scraper registered through FW.Scraper and
 * FW.MultiScraper, in registration order.
 */
var FW = {
    _scrapers : []
};

/**
 * Shared behaviour for scraper objects: invoking optional hooks and turning
 * a configured value (string, function, evaluator object, or array of those)
 * into its concrete result for a given document.
 */
FW._Base = function () {
    /**
     * Calls this.hooks[hookName](item, doc, url) when such a function exists.
     * Does nothing if there is no hooks object or no hook of that name.
     */
    this.callHook = function (hookName, item, doc, url) {
        var hooks = this['hooks'];
        /* typeof null is 'object', so null has to be excluded explicitly */
        if (typeof hooks === 'object' && hooks !== null) {
            var hook = hooks[hookName];
            if (typeof hook === 'function') {
                hook(item, doc, url);
            }
        }
    };

    /**
     * Resolves a configured value.
     *
     * - string: returned unchanged
     * - function: called as val(doc, url)
     * - array: every entry is resolved recursively and the results flattened
     * - other non-null object: val.evaluate(doc, url) is returned
     * - anything else (including null): undefined
     */
    this.evaluateThing = function (val, doc, url) {
        /* keep the receiver so that nested arrays can recurse through it */
        var self = this;
        var valtype = typeof val;
        if (valtype === 'string') {
            return val;
        }
        if (valtype === 'function') {
            return val(doc, url);
        }
        if (valtype === 'object' && val !== null) {
            if (val instanceof Array) {
                var resolved = val.map(function (entry) {
                    return self.evaluateThing(entry, doc, url);
                });
                return flatten(resolved);
            }
            return val.evaluate(doc, url);
        }
        return undefined;
    };

    /** Resolves the value stored on this object under `key`. */
    this.evaluate = function (key, doc, url) {
        return this.evaluateThing(this[key], doc, url);
    };

    /** Resolves the value stored under this[key][key2]. */
    this.evaluateSub = function (key, key2, doc, url) {
        return this.evaluateThing(this[key][key2], doc, url);
    };
};
/**
 * Registers a single-item scraper built from `init` in FW._scrapers.
 */
FW.Scraper = function (init) {
    FW._scrapers.push(new FW._Scraper(init));
};

/**
 * Scraper that produces one Zotero item per page.
 *
 * Every property of `init` is copied onto the instance. Properties named
 * after an entry of `_singleFieldNames`, plus `creators`, `tags` and
 * `attachments`, are resolved through the inherited evaluate helpers when
 * makeItems runs.
 */
FW._Scraper = function (init) {
    /* `var` keeps the loop variable from becoming an implicit global */
    for (var x in init) {
        this[x] = init[x];
    }

    /* Item fields that take a single value. */
    this._singleFieldNames = [
        "abstractNote", "applicationNumber", "archive", "archiveLocation",
        "artworkMedium", "artworkSize", "assignee", "audioFileType",
        "audioRecordingType", "billNumber", "blogTitle", "bookTitle",
        "callNumber", "caseName", "code", "codeNumber", "codePages",
        "codeVolume", "committee", "company", "conferenceName", "country",
        "court", "date", "dateDecided", "dateEnacted", "dictionaryTitle",
        "distributor", "docketNumber", "documentNumber", "DOI", "edition",
        "encyclopediaTitle", "episodeNumber", "extra", "filingDate",
        "firstPage", "forumTitle", "genre", "history", "institution",
        "interviewMedium", "ISBN", "ISSN", "issue", "issueDate",
        "issuingAuthority", "journalAbbreviation", "label", "language",
        "legalStatus", "legislativeBody", "letterType", "libraryCatalog",
        "manuscriptType", "mapType", "medium", "meetingName", "nameOfAct",
        "network", "number", "numberOfVolumes", "numPages", "pages",
        "patentNumber", "place", "postType", "presentationType",
        "priorityNumbers", "proceedingsTitle", "programTitle",
        "programmingLanguage", "publicLawNumber", "publicationTitle",
        "publisher", "references", "reportNumber", "reportType", "reporter",
        "reporterVolume", "rights", "runningTime", "scale", "section",
        "series", "seriesNumber", "seriesText", "seriesTitle", "session",
        "shortTitle", "studio", "subject", "system", "thesisType", "title",
        "type", "university", "url", "version", "videoRecordingType",
        "volume", "websiteTitle", "websiteType" ];

    /*
     * Normalises an evaluated value to an array: falsy -> [], array -> itself,
     * anything else -> one-element array. Without this a plain string would be
     * enumerated character by character by the loops below.
     */
    var toList = function (v) {
        if (!v) { return []; }
        return (v instanceof Array) ? v : [v];
    };

    /**
     * Builds one Zotero.Item of type this.itemType for `doc` and hands it to
     * `ret` as a one-element array.
     *
     * @param doc    document being scraped
     * @param url    URL of the document; used as the item's url unless a
     *               `url` field is configured
     * @param ignore unused by this scraper
     * @param ret    callback receiving [item]
     */
    this.makeItems = function (doc, url, ignore, ret) {
        var item = new Zotero.Item(this.itemType);
        item.url = url;

        for (var i = 0; i < this._singleFieldNames.length; i++) {
            var field = this._singleFieldNames[i];
            if (this[field]) {
                var fieldVal = this.evaluate(field, doc, url);
                /* a single-valued field takes the first result only */
                item[field] = (fieldVal instanceof Array) ? fieldVal[0] : fieldVal;
            }
        }

        var multiFields = ["creators", "tags"];
        for (var j = 0; j < multiFields.length; j++) {
            var key = multiFields[j];
            var vals = toList(this.evaluate(key, doc, url));
            for (var k = 0; k < vals.length; k++) {
                item[key].push(vals[k]);
            }
        }

        /* attachments */
        if (typeof this["attachments"] === 'object' && this["attachments"] !== null) {
            var attachUrls = toList(this.evaluateSub("attachments", "urls", doc, url));
            var attachTitles = this.evaluateSub("attachments", "titles", doc, url);
            var attachTypes = this.evaluateSub("attachments", "types", doc, url);
            var typesIsArray = (attachTypes instanceof Array);
            var titlesIsArray = (attachTitles instanceof Array);
            for (var n = 0; n < attachUrls.length; n++) {
                /* a scalar title/type applies to every url; an array is matched by position */
                item["attachments"].push({
                    'url': attachUrls[n],
                    'title': titlesIsArray ? attachTitles[n] : attachTitles,
                    'type': typesIsArray ? attachTypes[n] : attachTypes
                });
            }
        }
        ret([item]);
    };
};
FW._Scraper.prototype = new FW._Base;
/**
 * Registers a multiple-item scraper built from `init` in FW._scrapers.
 */
FW.MultiScraper = function (init) {
    FW._scrapers.push(new FW._MultiScraper(init));
};

/**
 * Scraper for pages that list several items (e.g. search results).
 *
 * Every property of `init` is copied onto the instance. `titles` and `urls`
 * are resolved into parallel lists, the user picks entries through
 * Zotero.selectItems, and each chosen URL is loaded and passed to another
 * scraper (`itemTrans` if configured, otherwise whatever FW.getScraper finds
 * for the loaded page).
 */
FW._MultiScraper = function (init) {
    /* `var` keeps the loop variable from becoming an implicit global */
    for (var x in init) {
        this[x] = init[x];
    }

    /** Builds the { url: title } map that is handed to Zotero.selectItems. */
    this._mkSelectItems = function (titles, urls) {
        var items = {};
        if (!titles || !urls) { return items; }
        for (var i = 0; i < titles.length; i++) {
            items[urls[i]] = titles[i];
        }
        return items;
    };

    /**
     * Shows the selection dialog and returns the keys (URLs) of the result.
     * Returns an empty array when Zotero.selectItems yields nothing enumerable.
     */
    this._selectItems = function (titles, urls) {
        var selected = Zotero.selectItems(this._mkSelectItems(titles, urls));
        var items = [];
        for (var j in selected) {
            items.push(j);
        }
        return items;
    };

    /**
     * Maps each entry of `urls` to the entry at the same position of the
     * evaluated `attachments` value; returns an empty object when no
     * attachments are configured.
     */
    this._mkAttachments = function (doc, url, urls) {
        var attachmentsArray = this.evaluate('attachments', doc, url);
        var attachmentsDict = {};
        if (attachmentsArray && urls) {
            for (var i = 0; i < urls.length; i++) {
                attachmentsDict[urls[i]] = attachmentsArray[i];
            }
        }
        return attachmentsDict;
    };

    /**
     * Collects one item per selected entry and passes the list to `ret`.
     *
     * @param doc    document listing the entries
     * @param url    URL of that document
     * @param ignore not used here; forwarded unchanged when the page is re-loaded
     * @param ret    callback receiving the array of items
     */
    this.makeItems = function (doc, url, ignore, ret) {
        Zotero.debug("Entering MultiScraper.makeItems");
        /* callbacks below run without this object as receiver */
        var self = this;

        if (this.beforeFilter) {
            var newurl = this.beforeFilter(doc, url);
            if (newurl != url) {
                /*
                 * Load the replacement page and restart on it. The nested
                 * call delivers its result through `ret` itself, because
                 * makeItems reports through the callback rather than a
                 * return value.
                 * NOTE(review): uses processDocuments, the loader already
                 * used further down in this function - confirm against the
                 * Zotero version in use.
                 */
                Zotero.Utilities.processDocuments([newurl],
                    function (newDoc) {
                        self.makeItems(newDoc, newurl, ignore, ret);
                    },
                    function () { /* result is delivered by the nested call */ });
                return;
            }
        }

        var titles = this.evaluate('titles', doc, url);
        var urls = this.evaluate('urls', doc, url);
        var itemsToUse = this._selectItems(titles, urls);
        var attachments = this._mkAttachments(doc, url, urls);

        /* _selectItems always returns an array, so test its length as well */
        if (!itemsToUse || itemsToUse.length === 0) {
            ret([]);
            return;
        }

        var items = [];
        var parentItemTrans = this.itemTrans;
        var handleDoc = function (doc1) {
            var url1 = doc1.documentURI;
            var itemTrans = parentItemTrans;
            if (itemTrans === undefined) {
                itemTrans = FW.getScraper(doc1, url1);
            }
            if (itemTrans !== undefined) {
                itemTrans.makeItems(doc1, url1, attachments[url1],
                    function (items1) {
                        items.push(items1[0]);
                    });
            }
            /* nothing to do when no scraper matches the loaded page */
        };
        Zotero.Utilities.processDocuments(itemsToUse,
            handleDoc,
            function () {
                ret(items);
            });
    };
};
FW._MultiScraper.prototype = new FW._Base;
/**
 * Creates a delegate that hands page text to another Zotero translator.
 * Unlike FW.Scraper, the result is returned and not added to FW._scrapers.
 */
FW.DelegateTranslator = function (init) {
    return new FW._DelegateTranslator(init);
};

/**
 * Wraps a translator obtained from Zotero.loadTranslator.
 *
 * Every property of `init` is copied onto the instance; `translatorType` is
 * passed to Zotero.loadTranslator and `translatorId` to setTranslator.
 */
FW._DelegateTranslator = function (init) {
    /* `var` keeps the loop variable from becoming an implicit global */
    for (var x in init) {
        this[x] = init[x];
    }
    this._translator = Zotero.loadTranslator(this.translatorType);
    this._translator.setTranslator(this.translatorId);

    /**
     * Fetches `url`, feeds the response text to the wrapped translator and
     * passes the last item it reported to `ret` as a one-element array.
     *
     * @param doc         not used by this implementation
     * @param url         address to fetch
     * @param attachments if truthy, replaces the item's attachments
     * @param ret         callback receiving [item]
     */
    this.makeItems = function (doc, url, attachments, ret) {
        Zotero.debug("Entering DelegateTranslator.makeItems");
        /* captured here because `this` inside the doGet callback is not this object */
        var translator = this._translator;
        var tmpItem;
        Zotero.Utilities.HTTP.doGet(url,
            function (text) {
                translator.setHandler("itemDone", function (obj, item) {
                    tmpItem = item;
                    /* this does not seem to be working */
                    if (attachments) { item.attachments = attachments; }
                });
                translator.setString(text);
                translator.translate();
            },
            function () {
                ret([tmpItem]);
            });
    };
};
/*
 * NOTE(review): this sets the prototype of the factory function, not of
 * FW._DelegateTranslator, so objects returned by FW.DelegateTranslator() do
 * not inherit from FW._Scraper. Kept as found - confirm the intent.
 */
FW.DelegateTranslator.prototype = new FW._Scraper;
/**
 * Chainable filter pipeline shared by the evaluators (FW.PageText, FW.Url,
 * FW.Xpath). Each chain method appends one filter and returns the object, so
 * calls can be strung together; _applyFilters then runs the filters in order
 * over a list of values.
 */
FW._StringMagic = function () {
    this._filters = [];

    /** Appends `filter` (called as filter(value, doc)) and returns this. */
    this.addFilter = function (filter) {
        this._filters.push(filter);
        return this;
    };

    /** Splits each value on `re`, dropping empty pieces. */
    this.split = function (re) {
        return this.addFilter(function (s) {
            return s.split(re).filter(function (e) { return (e != ""); });
        });
    };

    /**
     * Replaces `s1` (string or RegExp) with `s2` in each value.
     *
     * `flags` only has an effect when `s1` is a string: the string is then
     * matched literally with those regular-expression flags (e.g. "g").
     * This reproduces the non-standard third argument of String.replace
     * that only some engines supported.
     */
    this.replace = function (s1, s2, flags) {
        return this.addFilter(function (s) {
            var pattern = s1;
            if (flags && typeof s1 === 'string') {
                /* escape regex metacharacters so the string still matches literally */
                pattern = new RegExp(s1.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), flags);
            }
            /* replace() already returns the input unchanged when nothing matches */
            return s.replace(pattern, s2);
        });
    };

    /** Puts `prefix` in front of each value. */
    this.prepend = function (prefix) {
        return this.replace(/^/, prefix);
    };

    /** Adds `postfix` to the end of each value. */
    this.append = function (postfix) {
        return this.replace(/$/, postfix);
    };

    /** Deletes what `toStrip` matches; see replace() for `flags`. */
    this.remove = function (toStrip, flags) {
        return this.replace(toStrip, '', flags);
    };

    /** Applies Zotero.Utilities.trim to each value. */
    this.trim = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.trim(s); });
    };

    /** Applies Zotero.Utilities.trimInternal to each value. */
    this.trimInternal = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.trimInternal(s); });
    };

    /**
     * Keeps capture group `group` (default 0, the whole match) of `re`.
     * Values that do not match are dropped from the result list.
     */
    this.match = function (re, group) {
        if (!group) { group = 0; }
        return this.addFilter(function (s) {
            var m = s.match(re);
            /* String.match reports "no match" as null */
            if (m === null || m === undefined) { return undefined; }
            return m[group];
        });
    };

    /** Applies Zotero.Utilities.cleanAuthor(value, type, useComma). */
    this.cleanAuthor = function (type, useComma) {
        return this.addFilter(function (s) { return Zotero.Utilities.cleanAuthor(s, type, useComma); });
    };

    /** Replaces each value by its property `field`. */
    this.key = function (field) {
        return this.addFilter(function (n) { return n[field]; });
    };

    /** Applies Zotero.Utilities.capitalizeTitle to each value. */
    this.capitalizeTitle = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.capitalizeTitle(s); });
    };

    /** Applies Zotero.Utilities.unescapeHTML to each value. */
    this.unescapeHTML = function () {
        return this.addFilter(function (s) { return Zotero.Utilities.unescapeHTML(s); });
    };

    /** Applies the global unescape() to each value. */
    this.unescape = function () {
        return this.addFilter(function (s) { return unescape(s); });
    };

    /**
     * Turns each truthy value (a URL) into { url, type, title }; falsy values
     * are dropped.
     */
    this.makeAttachment = function (type, title) {
        var filter = function (url) {
            if (url) {
                return { url : url,
                         type : type,
                         title : title };
            } else {
                return undefined;
            }
        };
        return this.addFilter(filter);
    };

    /**
     * Runs every registered filter, in order, over the list `a`.
     *
     * Before each filter the list is flattened and cleared of undefined/null
     * entries. A filter that throws has its value dropped and the exception
     * logged via Zotero.debug. The output of the last filter is not
     * flattened again.
     *
     * @param a    array of starting values
     * @param doc1 document passed to each filter as second argument
     * @returns the filtered array
     */
    this._applyFilters = function (a, doc1) {
        Zotero.debug("Entering StringMagic._applyFilters");
        var isPresent = function (x) { return ((x !== undefined) && (x !== null)); };
        for (var i = 0; i < this._filters.length; i++) {
            a = flatten(a);
            /* remove undefined or null array entries */
            a = a.filter(isPresent);
            for (var j = 0 ; j < a.length ; j++) {
                try {
                    a[j] = this._filters[i](a[j], doc1);
                } catch (x) {
                    a[j] = undefined;
                    Zotero.debug("Caught exception " + x + "on filter: " + this._filters[i]);
                }
            }
            /* filter again: entries may have become undefined or null along the way */
            a = a.filter(isPresent);
        }
        return a;
    };
};
/**
 * Factory for an evaluator whose starting value is the page's HTML source.
 */
FW.PageText = function () {
    return new FW._PageText();
};

/**
 * Evaluator that feeds doc.documentElement.innerHTML through the filter
 * chain inherited from FW._StringMagic.
 */
FW._PageText = function () {
    this._filters = new Array();

    /**
     * @param doc document to read
     * @returns the filtered values, or false when none are left
     */
    this.evaluate = function (doc) {
        var filtered = this._applyFilters([doc.documentElement.innerHTML], doc);
        return (filtered.length == 0) ? false : filtered;
    };
};
FW._PageText.prototype = new FW._StringMagic();
/**
 * Factory for an evaluator whose starting value is the page URL.
 */
FW.Url = function () {
    return new FW._Url();
};

/**
 * Evaluator that feeds the URL through the filter chain inherited from
 * FW._StringMagic.
 */
FW._Url = function () {
    this._filters = new Array();

    /**
     * @param doc document, passed on to the filters
     * @param url starting value
     * @returns the filtered values, or false when none are left
     */
    this.evaluate = function (doc, url) {
        var filtered = this._applyFilters([url], doc);
        return (filtered.length == 0) ? false : filtered;
    };
};
FW._Url.prototype = new FW._StringMagic();
/**
 * Factory for an evaluator whose starting values are the nodes matched by
 * `xpathExpr`.
 */
FW.Xpath = function (xpathExpr) { return new FW._Xpath(xpathExpr); };

/**
 * Evaluator that runs an XPath expression against the document and feeds the
 * matches through the filter chain inherited from FW._StringMagic.
 */
FW._Xpath = function (_xpath) {
    this._xpath = _xpath;
    this._filters = [];

    /**
     * Adds a filter replacing each node by its textContent. Values that have
     * no string textContent (e.g. plain strings produced by key()) pass
     * through unchanged. A node whose text is empty yields "" rather than
     * the node object.
     */
    this.text = function () {
        var filter = function (n) {
            if (typeof n === 'object' && n !== null && typeof n.textContent === 'string') {
                return n.textContent;
            }
            return n;
        };
        this.addFilter(filter);
        return this;
    };

    /**
     * Adds a filter replacing each node by the first result of evaluating
     * `xpath` relative to that node.
     */
    this.sub = function (xpath) {
        var filter = function (n, doc) {
            var result = doc.evaluate(xpath, n, null, XPathResult.ANY_TYPE, null);
            if (result) {
                return result.iterateNext();
            } else {
                return undefined;
            }
        };
        this.addFilter(filter);
        return this;
    };

    /**
     * @param doc document to query
     * @returns the filtered matches, or false when none are left
     */
    this.evaluate = function (doc) {
        var it = doc.evaluate(this._xpath, doc, null, XPathResult.ANY_TYPE, null);
        var a = [];
        var x;
        while ((x = it.iterateNext())) { a.push(x); }
        a = this._applyFilters(a, doc);
        if (a.length == 0) { return false; }
        else { return a; }
    };
};
FW._Xpath.prototype = new FW._StringMagic();
/**
 * Returns the item type of the first registered scraper that applies to the
 * page, or undefined when none does.
 *
 * A scraper applies when it has no `detect` property, or when its `detect`
 * evaluates to a non-empty list whose first entry is truthy.
 */
FW.detectWeb = function (doc, url) {
    for (var i = 0; i < FW._scrapers.length; i++) {
        var scraper = FW._scrapers[i];
        var itemType = scraper.evaluate('itemType', doc, url);
        if (!scraper.detect) {
            return itemType;
        }
        var v = scraper.evaluate('detect', doc, url);
        /* `v &&` guards against detectors that evaluate to undefined or null */
        if (v && v.length > 0 && v[0]) {
            return itemType;
        }
    }
    return undefined;
};
/**
 * Returns the first registered scraper whose item type equals the one
 * reported by FW.detectWeb and that applies to the page, or undefined.
 *
 * A scraper without a `detect` property counts as applying, which matches
 * how FW.detectWeb treats it; otherwise its `detect` must evaluate to a
 * truthy value.
 */
FW.getScraper = function (doc, url) {
    var itemType = FW.detectWeb(doc, url);
    return FW._scrapers.filter(function (s) {
        return (s.evaluate('itemType', doc, url) == itemType)
            && (!s.detect || s.evaluate('detect', doc, url));
    })[0];
};
/**
 * Scrapes the page with the scraper chosen by FW.getScraper.
 *
 * Each item produced gets the scraper's 'scraperDone' hook applied and is
 * then completed; Zotero.done() is called once all items are handled.
 * Zotero.wait() is called after makeItems has been started.
 */
FW.doWeb = function (doc, url) {
    Zotero.debug("Entering FW.doWeb");
    var scraper = FW.getScraper(doc, url);
    var finish = function (items) {
        for (var i in items) {
            var current = items[i];
            scraper.callHook('scraperDone', current, doc, url);
            current.complete();
        }
        Zotero.done();
    };
    scraper.makeItems(doc, url, [], finish);
    Zotero.wait();
    Zotero.debug("Leaving FW.doWeb");
};
/* End generic code */

/** Translator entry point: forwards to FW.detectWeb. */
function detectWeb(doc, url) {
    return FW.detectWeb(doc, url);
}

/** Translator entry point: forwards to FW.doWeb. */
function doWeb(doc, url) {
    return FW.doWeb(doc, url);
}
/**
 * Articles
 *
 * Applies to pages that have an "article" div inside a body of class
 * "article". One snapshot attachment is created for the current URL and one
 * for each pagination link that has no class attribute.
 */
FW.Scraper({
    itemType : 'magazineArticle',
    detect : FW.Xpath('//body[@class="article"]//div[@class="article"]'),
    title : FW.Xpath('//div[@class="article"]/a[@class="headline-article"]')
        .text()
        .trim(),
    abstractNote : FW.Xpath('//div[@class="article"]/p[@class="subhead"]')
        .text()
        .trim(),
    attachments : {
        urls : [
            FW.Url(),
            FW.Xpath('//div[@class="pagination"]/a[not(@class)]').key('href').text()
        ],
        /*
         * Titles are matched to urls by position, so the pagination links
         * are selected with the same expression as in `urls`.
         * NOTE(review): a URL or href that does not end in digits produces
         * no title, which shifts the later titles - verify against live pages.
         */
        titles : [
            FW.Url().match(/[0-9]+$/).prepend("Washington Monthly Snapshot pg. "),
            FW.Xpath('//div[@class="pagination"]/a[not(@class)]')
                .key('href')
                .text()
                .match(/[0-9]+$/)
                .prepend("Washington Monthly Snapshot pg. ")
        ],
        types : "text/html"
    },
    creators : FW.Xpath('//div[@class="article"]/p[@class="author"]')
        .text()
        .remove(/By/)
        .cleanAuthor("author"),
    date : FW.Xpath('//div[@class="article"]/span[@class="date"]').text(),
    publicationTitle : "The Washington Monthly"
});
/**
 * Blog Posts
 *
 * Applies to pages whose body has class "blog". The publication title is
 * the text of the h5 inside the "content" div, prefixed with the magazine
 * name.
 */
FW.Scraper({
    itemType : 'blogPost',
    detect : FW.Xpath('//body[@class="blog"]'),
    title : FW.Xpath('//div[@class="article"]/a[@class="headline"]')
        .text()
        .trim(),
    abstractNote : FW.Xpath('//div[@class="article"]/p[@class="subhead"]')
        .text()
        .trim(),
    attachments : {
        urls : FW.Url(),
        titles : "Washington Monthly Snapshot",
        types : "text/html"
    },
    creators : FW.Xpath('//div[@class="article"]/p[@class="author"]')
        .text()
        .remove(/By/)
        .cleanAuthor("author"),
    date : FW.Xpath('//div[@class="article"]/span[@class="date"]').text(),
    publicationTitle : FW.Xpath('//div[@id="content"]/h5')
        .text()
        .prepend('The Washington Monthly - ')
});
/**
 * Articles Old
 *
 * Applies to pages that contain a span of class "HED", which also supplies
 * the title.
 */
FW.Scraper({
    itemType : 'magazineArticle',
    detect : FW.Xpath('//span[@class="HED"]'),
    title : FW.Xpath('//span[@class="HED"]')
        .text()
        .trim(),
    abstractNote : FW.Xpath('//span[@class="SubHed"]')
        .text()
        .trim(),
    attachments : {
        urls : FW.Url(),
        titles : "Washington Monthly Snapshot",
        types : "text/html"
    },
    creators : FW.Xpath('//span[@class="Author"]/a')
        .text()
        .cleanAuthor("author"),
    date : FW.Xpath('//td[@align="right"]/p[@class="TXT9"]')
        .text()
        .remove(/Washington Monthly,/)
        .trim(),
    publicationTitle : "The Washington Monthly"
});
/**
 * Blog Posts - old
 *
 * Applies to pages that contain a div of class "blog". The publication
 * title is taken from the `alt` attribute of the matched image, prefixed
 * with the magazine name.
 */
FW.Scraper({
    itemType : 'blogPost',
    detect : FW.Xpath('//div[@class="blog"]'),
    title : FW.Xpath('//div[@class="blogbody"]/*/p/child::*')
        .text()
        .remove(/\.\.\.\./)
        .capitalizeTitle()
        .trim(),
    attachments : {
        urls : FW.Url(),
        titles : "Washington Monthly Snapshot",
        types : "text/html"
    },
    creators : FW.Xpath('//div[@class="blogbody"]/*/div[@class="header"]/span')
        .text()
        .cleanAuthor("author"),
    date : FW.Xpath('//div[@class="blog"]/div[@class="date"]').text(),
    publicationTitle : FW.Xpath('//td/div[@align="left"]/img')
        .key('alt')
        .text()
        .prepend('The Washington Monthly - ')
});
/**
 * Search results
 *
 * Applies to pages with an h4 of class "search-results" inside the
 * "content" div. Titles and URLs both come from the links inside the
 * "gs-title" divs.
 */
FW.MultiScraper({
    itemType : "multiple",
    detect : FW.Xpath('//div[@id="content"]/h4[@class="search-results"]'),
    titles : FW.Xpath('//div[@class="gs-title"]/a')
        .text(),
    urls : FW.Xpath('//div[@class="gs-title"]/a')
        .key('href')
        .text()
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment