Skip to content

Instantly share code, notes, and snippets.

@fbennett
Created May 6, 2011 02:47
Show Gist options
  • Save fbennett/958363 to your computer and use it in GitHub Desktop.
Save fbennett/958363 to your computer and use it in GitHub Desktop.
The Boston Globe (Zotero translator)
{
"translatorID": "1f245496-4c1b-406a-8641-d286b3888231",
"label": "The Boston Globe",
"creator": "Adam Crymble and Frank Bennett",
"target": "^http://(www|search|articles)\\.boston\\.com/",
"minVersion": "1.0.0b4.r5",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-05-06 12:08:09"
}
/*
* Sample URLs
*
* [Original request -- uncommon page format, no embedded metadata of any kind]
* http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant
*
* [More common page formats, marginally reliable metadata in a comment block]
* http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html
* http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/
* http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/
*/
/**
 * Tell Zotero what kind of item(s) the current page offers.
 *
 * Only the URL is examined; the document itself is not inspected.
 *
 * @param {Document} doc - The page document (unused here).
 * @param {string} url - The address of the page.
 * @returns {string|undefined} "multiple" when the URL contains the search
 *     host name, "newspaperArticle" when it contains a /YYYY/MM/ path
 *     segment or a YYYY-MM-DD date, and undefined otherwise.
 */
function detectWeb(doc, url) {
    // Use a regex literal with escaped dots. Passing the bare string
    // "search.boston.com" to match() compiles it as a regex in which each
    // "." matches ANY character, so article slugs such as
    // "research_boston_community" were misdetected as search results.
    if (/search\.boston\.com/.test(url)) {
        return "multiple";
    }
    // Article pages carry their publication date in the URL.
    if (/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/.test(url)) {
        return "newspaperArticle";
    }
}
//Boston Globe and Boston.com Translator. Original code by Adam Crymble
// Rewritten by Frank Bennett, 2011
/**
 * Look for a comment node among the direct children of an element.
 *
 * @param {Node} elem - Element whose immediate children are scanned.
 * @returns {*} The nodeValue of the first child whose nodeName is
 *     "#comment"; false when no such child exists; or `elem` itself when
 *     `elem` is falsy (e.g. null from a failed lookup).
 */
function sniffComment (elem) {
    if (elem) {
        var kids = elem.childNodes;
        var total = kids.length;
        var pos = 0;
        while (pos < total) {
            var kid = kids[pos];
            if (kid.nodeName === "#comment") {
                // First comment wins; later ones are ignored.
                return kid.nodeValue;
            }
            pos += 1;
        }
        return false;
    }
    // Nothing to inspect: hand the falsy input straight back.
    return elem;
}
/**
 * Locate the comment block that may carry article metadata.
 *
 * Elements with class "hideMe" are checked first, in document order; the
 * element with id "content" is the fallback.
 *
 * @param {Document} doc - The page document.
 * @returns {*} The text of the first comment found by sniffComment(), or
 *     whatever sniffComment() returns for the "content" element (false
 *     when it has no comment child, null when the element is missing).
 */
function findMagicComment (doc) {
    var candidates = doc.getElementsByClassName("hideMe");
    var count = candidates.length;
    var idx = 0;
    while (idx < count) {
        var found = sniffComment(candidates.item(idx));
        if (found) {
            return found;
        }
        idx += 1;
    }
    // No usable comment under any "hideMe" element: try #content.
    return sniffComment(doc.getElementById("content"));
}
/**
 * Read the raw author string from the page's byline element.
 *
 * The first element with class "byline" is preferred; the element with id
 * "byline" is the fallback. A leading "By" / "Posted by" is stripped.
 *
 * Side effect: when the byline contains "Posted by", newItem.itemType is
 * set to "blogPost".
 *
 * @param {Document} doc - The page document.
 * @param {Object} newItem - Item under construction; only itemType may be
 *     written.
 * @returns {string} The byline text with newlines turned into spaces and
 *     the leading "By"/"Posted by" removed, or "" when no byline element
 *     exists.
 */
function findAuthorString (doc, newItem) {
    var authors = "";
    var bylineElem = false;
    var bylineElems = doc.getElementsByClassName("byline");
    if (bylineElems.length) {
        bylineElem = bylineElems.item(0);
    }
    if (!bylineElem) {
        bylineElem = doc.getElementById('byline');
    }
    if (bylineElem) {
        // Flatten ALL newlines. The old three-argument
        // replace("\n", " ", "g") relied on a non-standard, Firefox-only
        // flags parameter; elsewhere it replaces only the first newline,
        // and the "(.*)" below (which stops at a newline) then truncates
        // multi-line bylines.
        authors = bylineElem.textContent.replace(/\n/g, " ");
        if (/[Pp]osted\s+by\s+/.test(authors)) {
            newItem.itemType = "blogPost";
        }
        authors = authors.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1");
    }
    return authors;
}
/**
 * Extract the text between <tag> and </tag> from the metadata comment.
 *
 * @param {string} info - Comment text with newlines already removed.
 * @param {string} tag - Tag name to look for (a fixed literal from scrape).
 * @returns {string} The enclosed text, or "" when the tag pair is absent.
 */
function extractCommentField (info, tag) {
    // Same greedy pattern as before, but via match(): the earlier
    // replace(...,"$1") returned the ENTIRE comment when the tag was missing.
    var m = info.match(new RegExp(".*<" + tag + ">(.*)</" + tag + ">.*"));
    return m ? m[1] : "";
}

/**
 * Build and save Zotero item(s) for one article page.
 *
 * The site content is pretty chaotic, so we do our best. There are two
 * independent set-and-save blocks below; each one that finds its source of
 * data creates and completes its own item:
 *
 *   1. Metadata embedded in a comment block (see findMagicComment). The
 *      date and headline there look reliable, but the byline is a
 *      disaster, to be used only if the visible byline is missing.
 *   2. The "mod-article-byline" information block.
 *
 * @param {Document} doc - The article page document.
 * @param {string} url - Page address (unused; doc.location.href is used).
 */
function scrape (doc, url) {
    var authors, author, j, jlen;

    // --- Block 1: metadata comment -------------------------------------
    var magicComment = findMagicComment(doc);
    if (magicComment) {
        // Blind acceptance: any comment that was found is treated as metadata.
        var commentItem = new Zotero.Item("newspaperArticle");
        commentItem.publicationTitle = "Boston.com";
        commentItem.url = doc.location.href;
        commentItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});

        // Join the comment into one line so "." in the patterns can span it.
        // A global regex is required: the three-argument replace(…,'g')
        // form is non-standard and replaces only the first newline in
        // engines that do not support it.
        var info = magicComment.replace(/\n/g, '');

        var headline = extractCommentField(info, "headline");
        if (headline) {
            commentItem.title = headline;
        }
        var date = extractCommentField(info, "date");
        if (date) {
            commentItem.date = date;
        }

        // Prefer the visible byline; fall back to the comment's fields.
        authors = findAuthorString(doc, commentItem);
        if (!authors) {
            authors = extractCommentField(info, "byline");
            // A byline with no capitals (or none at all) is not trusted;
            // take the first word of the tease text after any "By".
            if (authors.toLowerCase() === authors) {
                authors = extractCommentField(info, "teasetext");
                var m = authors.match(/^(?:[Bb]y\s+)*([^ ,]+).*/);
                authors = m ? m[1] : "";
            }
        }

        // Normalise "A, B and C, Globe Staff" into a comma-separated list:
        // split on "and", drop whatever follows a comma after the last
        // name, then re-split on commas.
        authors = authors.split(/,*\s+and\s+/);
        authors[authors.length - 1] = authors[authors.length - 1].split(/,\s+/)[0];
        authors = authors.join(", ");
        authors = authors.split(/,\s+/);
        for (j = 0, jlen = authors.length; j < jlen; j += 1) {
            author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
            if (author.lastName) {
                commentItem.creators.push(author);
            }
        }
        commentItem.complete();
    }

    // --- Block 2: visible information block -----------------------------
    var infoElem = doc.getElementById("mod-article-byline");
    if (infoElem) {
        var bylineItem = new Zotero.Item("newspaperArticle");
        bylineItem.publicationTitle = "Boston.com";
        bylineItem.url = doc.location.href;
        bylineItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});

        // Date: getElementsByClassName returns a collection, so the text
        // must be read from its first element (reading textContent off the
        // collection itself always yielded undefined).
        var dateElems = infoElem.getElementsByClassName('pubdate');
        if (dateElems.length) {
            bylineItem.date = dateElems.item(0).textContent;
        }

        // Authors: direct SPAN children whose text starts with "By".
        for (var i = 0, ilen = infoElem.childNodes.length; i < ilen; i += 1) {
            var node = infoElem.childNodes.item(i);
            if (node.nodeName === 'SPAN' && 'By' === node.textContent.slice(0,2)) {
                // slice(3) skips "By" plus the separator character after it.
                authors = node.textContent.slice(3).split(/(?:, |,*\s+and\s+)/);
                for (j = 0, jlen = authors.length; j < jlen; j += 1) {
                    author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
                    // Same guard as block 1: skip empty fragments.
                    if (author.lastName) {
                        bylineItem.creators.push(author);
                    }
                }
            }
        }

        // Title: first <h1> inside the article header.
        var headerElem = doc.getElementById('mod-article-header');
        if (headerElem) {
            var h = headerElem.getElementsByTagName('h1');
            if (h.length) {
                bylineItem.title = h[0].textContent;
            }
        }
        bylineItem.complete();
    }
}
/**
 * Translator entry point: save the current article, or let the user pick
 * articles from a search-results page and save each of them.
 *
 * @param {Document} doc - The page document.
 * @param {string} url - The address of the page.
 * @returns {boolean|undefined} true when the user selects nothing in the
 *     item picker; otherwise undefined.
 */
function doWeb (doc, url) {
    var ns = doc.documentElement.namespaceURI;
    // The resolver body is intentionally left empty, as in the original:
    // the XPath below uses no prefixes.
    var nsResolver = null;
    if (ns) {
        nsResolver = function(prefix) {
        };
    }

    // Single article: scrape the page we already have.
    if (detectWeb(doc, url) !== "multiple") {
        scrape(doc, url);
        return;
    }

    // Search results: collect link -> title pairs for the picker.
    var choices = {};
    var hits = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
    for (var link = hits.iterateNext(); link; link = hits.iterateNext()) {
        choices[link.href] = link.textContent;
    }

    var picked = Zotero.selectItems(choices);
    if (!picked) {
        return true;
    }

    var uris = [];
    for (var uri in picked) {
        uris.push(uri);
    }
    Zotero.Utilities.processDocuments(uris, scrape, Zotero.done);
    Zotero.wait();
}
@avram
Copy link

avram commented May 6, 2011

In the final else clause, just call scrape directly to get the same behavior without an extra request:

Zotero.Utilities.processDocuments(uris, scrape, Zotero.done);
Zotero.wait();
} else {
scrape(doc, url);
}

@avram
Copy link

avram commented May 6, 2011

And please escape the full stops in the target regex-- they're not supposed to be wildcards!

@fbennett
Copy link
Author

fbennett commented May 6, 2011

Done, with a few additional tweaks to clean up authors a bit better. Re passing off multiples ... I won't hesitate to say "Yes, please!" :)

@avram
Copy link

avram commented May 6, 2011

In the JSON, you need to double-escape the full stops, actually. Scaffold confuses things by doing its own escaping. But I'll take over and finish this up.

@fbennett
Copy link
Author

fbennett commented May 6, 2011

One additional note ... it always reports pages as "newspaperArticle", but some of the content is clearly more bloggish, with postings by readers or by field agents less tightly connected to the paper than a full-time correspondent. There is code in the translator that tries to distinguish by a "Posted by" byline, but that turns up in proper business news articles as well -- there doesn't seem to be any rhyme or reason to its use. Likewise "blogTools" class elements in the pages -- these seem to have been adopted because they had functionality attractive to that particular segment of the staff, rather than having anything to do with content.

Maybe just dropping that code and committing everything as "newspaperArticle" makes most sense -- users of the translator can easily sort out the item type to suit themselves.

@fbennett
Copy link
Author

fbennett commented May 6, 2011

Double escape done too.

@avram
Copy link

avram commented May 6, 2011

I think we went for treating all NYT content as newspaperArticle-- probably worth doing the same here.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment