fbennett · May 6, 2011 02:47 · avram · May 6, 2011 · avram · May 6, 2011
diff --git a/The Boston Globe.js b/The Boston Globe.js
 {
        "translatorID": "1f245496-4c1b-406a-8641-d286b3888231",
        "label": "The Boston Globe",
        "creator": "Adam Crymble and Frank Bennett",
        "target": "^http://(www|search|articles)\\.boston\\.com/",
        "minVersion": "1.0.0b4.r5",
        "maxVersion": "",
        "priority": 100,
        "inRepository": "1",
        "translatorType": 4,
        "lastUpdated": "2011-05-06 12:08:09"
 }

 /*
 * Sample URLs
 *
 * [Original request -- uncommon page format, no embedded metadata of any kind]
 * http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant
 *
 * [More common page formats, marginally reliable metadata in a comment block]
 * http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html
 * http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/
 * http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/
 */

 function detectWeb(doc, url) {
 	if (url.match("search.boston.com")) {
 		return "multiple";
 	} else if (url.match(/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/)) {
 		return "newspaperArticle";
 	} 
 }

 //Boston Globe and Boston.com Translator. Original code by Adam Crymble
 // Rewritten by Frank Bennett, 2011

 function sniffComment (elem) {
 	if (!elem) {
 		return elem;
 	}
 	for (var i = 0, ilen = elem.childNodes.length; i < ilen; i += 1) {
 		if (elem.childNodes[i].nodeName === "#comment") {
 			return elem.childNodes[i].nodeValue;
 		}
 	}
 	return false;
 }

 function findMagicComment (doc) {
 	var hideMeElems = doc.getElementsByClassName("hideMe");
 	for (var i = 0, ilen = hideMeElems.length; i < ilen; i += 1) {
 		var elem = hideMeElems.item(i);
 		var sniff = sniffComment(elem);
 		if (sniff) {
 			return sniff;
 		}
 	}
 	var contentElem = doc.getElementById("content");
 	return sniffComment(contentElem);
 }

 function findAuthorString (doc, newItem) {
 	var authors = "";
 	var bylineElem = false;
 	var bylineElems = doc.getElementsByClassName("byline");
 	if (bylineElems.length) {
 		bylineElem = bylineElems.item(0);
 	}
 	if (!bylineElem) {
 		var bylineElem = doc.getElementById('byline');
 	}
 	if (bylineElem) {
 		authors = bylineElem.textContent;
 		authors = authors.replace("\n", " ", "g");
 		if (authors.match(/[Pp]osted\s+by\s+/)) {
 			newItem.itemType = "blogPost";
 		}
 		authors = authors.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1");
 	}
 	return authors;
 }

 function scrape (doc, url) {
 	// The site content is pretty chaotic, we do our best.

 	// There are two independent blocks set-and-save blocks
 	// below.

 	// Many pages seem to have metadata embedded in a comment
 	// The date and headline info look reliable, but
 	// the byline is a disaster, to be used only
 	// if absolutely necessary.
 	var magicComment = findMagicComment(doc);
 	if (magicComment) {
 		// Blind acceptance
 		var newItem =new Zotero.Item("newspaperArticle");
 		newItem.publicationTitle = "Boston.com";
 		// URL	
 		newItem.url = doc.location.href;
 		// Attachment
 		newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
 		// Now try to get some citation details (go ahead, try)
 		var info = magicComment.replace('\n','','g');
 		newItem.title = info.replace(/.*<headline>(.*)<\/headline>.*/,"$1");
 		newItem.date = info.replace(/.*<date>(.*)<\/date>.*/,"$1");
 		var authors = findAuthorString(doc, newItem);
 		if (!authors) {
 			var authors = info.replace(/.*<byline>(.*)<\/byline>.*/,"$1");
 			if (authors.toLowerCase() === authors) {
 				authors = info.replace(/.*<teasetext>(.*)<\/teasetext>.*/, "$1");
 				var m = authors.match(/^(?:[Bb]y\s+)*([^ ,]+).*/);
 				if (m) {
 					authors = m[1];
 				} else {
 					authors = "";
 				}
 			}
 		}
 		authors = authors.split(/,*\s+and\s+/);
 		authors[authors.length - 1] = authors[authors.length - 1].split(/,\s+/)[0];
 		authors = authors.join(", ");
 		authors = authors.split(/,\s+/);
 		for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
 			var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
 			if (author.lastName) {
 				newItem.creators.push(author);
 			}
 		}
 		newItem.complete();
 	}


 	// Information block
 	var infoElem = doc.getElementById("mod-article-byline");
 	if (infoElem) {
 		var newItem = new Zotero.Item("newspaperArticle");
 		newItem.publicationTitle = "Boston.com";
 		// URL	
 		newItem.url = doc.location.href;
 		newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});

 		// Date
 		var dateElem = infoElem.getElementsByClassName('pubdate');
 		if (dateElem.length) {
 			newItem.date = dateElem.textContent;
 		}

 		// Authors
 		for (var i = 0, ilen = infoElem.childNodes.length; i < ilen; i += 1) {
 			var node = infoElem.childNodes.item(i);
 			if (node.nodeName === 'SPAN') {
 				if ('By' === node.textContent.slice(0,2)) {
 					
 					var authors = node.textContent.slice(3);
 					authors = authors.split(/(?:, |,*\s+and\s+)/);
 					for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
 						var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
 						newItem.creators.push(author);
 					}
 				}
 			}
 		}
 		
 		// Title	
 		var headerElem = doc.getElementById('mod-article-header');
 		if (headerElem) {
 			var h = headerElem.getElementsByTagName('h1');
 			if (h.length) {
 				newItem.title = h[0].textContent;
 			}
 		}
 		newItem.complete();
 	}
 }


 function doWeb (doc, url) {
 	var namespace = doc.documentElement.namespaceURI;
 	var nsResolver = namespace ? function(prefix) {
 	}: null;
 	
 	var uris= new Array();

 	if (detectWeb(doc, url) == "multiple") {
 		var items = new Object();
 		var result =  doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
 		var elmt = result.iterateNext();
 		while (elmt) {
 			//items.push(elmt.href);
 			items[elmt.href] = elmt.textContent;
 			elmt = result.iterateNext();
 		}
 		
 		items = Zotero.selectItems(items);
 		
 		if (!items) {
 			return true;
 		}
 		
 		for (var i in items) {
 			uris.push(i);
 		}
 		Zotero.Utilities.processDocuments(uris, scrape, Zotero.done);
 		Zotero.wait();
 	} else {
 		scrape(doc, url);
 	}
 }
	{
	"translatorID": "1f245496-4c1b-406a-8641-d286b3888231",
	"label": "The Boston Globe",
	"creator": "Adam Crymble and Frank Bennett",
	"target": "^http://(www\|search\|articles)\\.boston\\.com/",
	"minVersion": "1.0.0b4.r5",
	"maxVersion": "",
	"priority": 100,
	"inRepository": "1",
	"translatorType": 4,
	"lastUpdated": "2011-05-06 12:08:09"
	}

	/*
	* Sample URLs
	*
	* [Original request -- uncommon page format, no embedded metadata of any kind]
	* http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant
	*
	* [More common page formats, marginally reliable metadata in a comment block]
	* http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html
	* http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/
	* http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/
	*/

	function detectWeb(doc, url) {
	if (url.match("search.boston.com")) {
	return "multiple";
	} else if (url.match(/(\/[0-9]{4}\/[0-9]{2}\/\|[0-9]{4}-[0-9]{2}-[0-9]{2})/)) {
	return "newspaperArticle";
	}
	}

	//Boston Globe and Boston.com Translator. Original code by Adam Crymble
	// Rewritten by Frank Bennett, 2011

	function sniffComment (elem) {
	if (!elem) {
	return elem;
	}
	for (var i = 0, ilen = elem.childNodes.length; i < ilen; i += 1) {
	if (elem.childNodes[i].nodeName === "#comment") {
	return elem.childNodes[i].nodeValue;
	}
	}
	return false;
	}

	function findMagicComment (doc) {
	var hideMeElems = doc.getElementsByClassName("hideMe");
	for (var i = 0, ilen = hideMeElems.length; i < ilen; i += 1) {
	var elem = hideMeElems.item(i);
	var sniff = sniffComment(elem);
	if (sniff) {
	return sniff;
	}
	}
	var contentElem = doc.getElementById("content");
	return sniffComment(contentElem);
	}

	function findAuthorString (doc, newItem) {
	var authors = "";
	var bylineElem = false;
	var bylineElems = doc.getElementsByClassName("byline");
	if (bylineElems.length) {
	bylineElem = bylineElems.item(0);
	}
	if (!bylineElem) {
	var bylineElem = doc.getElementById('byline');
	}
	if (bylineElem) {
	authors = bylineElem.textContent;
	authors = authors.replace("\n", " ", "g");
	if (authors.match(/[Pp]osted\s+by\s+/)) {
	newItem.itemType = "blogPost";
	}
	authors = authors.replace(/^\s(?:[Bb]y\|[Pp]osted\s+by)\s+(.)/, "$1");
	}
	return authors;
	}

	function scrape (doc, url) {
	// The site content is pretty chaotic, we do our best.

	// There are two independent blocks set-and-save blocks
	// below.

	// Many pages seem to have metadata embedded in a comment
	// The date and headline info look reliable, but
	// the byline is a disaster, to be used only
	// if absolutely necessary.
	var magicComment = findMagicComment(doc);
	if (magicComment) {
	// Blind acceptance
	var newItem =new Zotero.Item("newspaperArticle");
	newItem.publicationTitle = "Boston.com";
	// URL
	newItem.url = doc.location.href;
	// Attachment
	newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
	// Now try to get some citation details (go ahead, try)
	var info = magicComment.replace('\n','','g');
	newItem.title = info.replace(/.<headline>(.)<\/headline>.*/,"$1");
	newItem.date = info.replace(/.<date>(.)<\/date>.*/,"$1");
	var authors = findAuthorString(doc, newItem);
	if (!authors) {
	var authors = info.replace(/.<byline>(.)<\/byline>.*/,"$1");
	if (authors.toLowerCase() === authors) {
	authors = info.replace(/.<teasetext>(.)<\/teasetext>.*/, "$1");
	var m = authors.match(/^(?:[Bb]y\s+)([^ ,]+)./);
	if (m) {
	authors = m[1];
	} else {
	authors = "";
	}
	}
	}
	authors = authors.split(/,*\s+and\s+/);
	authors[authors.length - 1] = authors[authors.length - 1].split(/,\s+/)[0];
	authors = authors.join(", ");
	authors = authors.split(/,\s+/);
	for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
	var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
	if (author.lastName) {
	newItem.creators.push(author);
	}
	}
	newItem.complete();
	}


	// Information block
	var infoElem = doc.getElementById("mod-article-byline");
	if (infoElem) {
	var newItem = new Zotero.Item("newspaperArticle");
	newItem.publicationTitle = "Boston.com";
	// URL
	newItem.url = doc.location.href;
	newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});

	// Date
	var dateElem = infoElem.getElementsByClassName('pubdate');
	if (dateElem.length) {
	newItem.date = dateElem.textContent;
	}

	// Authors
	for (var i = 0, ilen = infoElem.childNodes.length; i < ilen; i += 1) {
	var node = infoElem.childNodes.item(i);
	if (node.nodeName === 'SPAN') {
	if ('By' === node.textContent.slice(0,2)) {

	var authors = node.textContent.slice(3);
	authors = authors.split(/(?:, \|,*\s+and\s+)/);
	for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
	var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
	newItem.creators.push(author);
	}
	}
	}
	}

	// Title
	var headerElem = doc.getElementById('mod-article-header');
	if (headerElem) {
	var h = headerElem.getElementsByTagName('h1');
	if (h.length) {
	newItem.title = h[0].textContent;
	}
	}
	newItem.complete();
	}
	}


	function doWeb (doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
	}: null;

	var uris= new Array();

	if (detectWeb(doc, url) == "multiple") {
	var items = new Object();
	var result = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
	var elmt = result.iterateNext();
	while (elmt) {
	//items.push(elmt.href);
	items[elmt.href] = elmt.textContent;
	elmt = result.iterateNext();
	}

	items = Zotero.selectItems(items);

	if (!items) {
	return true;
	}

	for (var i in items) {
	uris.push(i);
	}
	Zotero.Utilities.processDocuments(uris, scrape, Zotero.done);
	Zotero.wait();
	} else {
	scrape(doc, url);
	}
	}