-
-
Save avram/499358 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"translatorID":"207f4aad-b604-43ef-a7f5-3e6229aade9f",
"label":"New Zealand Herald",
"creator":"Sopheak Hean (University of Waikato, Faculty of Education)",
"target":"www.nzherald.co.nz",
"minVersion":"1.0",
"maxVersion":"",
"priority":100,
"inRepository":"1",
"translatorType":4,
"lastUpdated":"2010-07-30 09:26:09"}
/**
 * Decide what kind of page the translator is looking at.
 *
 * @param {Document} doc - The page being inspected; its `title` and
 *     `location.href` are read when present.
 * @param {String} url - Address of the page, used when `doc.location` is
 *     not available.
 * @returns {String|undefined} "multiple" when the page title contains
 *     "Search Result", "newspaperArticle" when the address contains "/news",
 *     otherwise undefined.
 */
function detectWeb(doc, url) {
    var title = doc.title || "";
    // Prefer the live document address (what the original code used) and
    // fall back to the url argument so a document without a location
    // object does not throw.
    var href = (doc.location && doc.location.href) || url || "";

    if (title.indexOf("Search Result") != -1) {
        return "multiple";
    }
    /* If the address has /news in it then it's a newspaper article */
    if (href.indexOf("/news") != -1) {
        return "newspaperArticle";
    }
    return undefined;
}
/**
 * Copy one scraped value onto a Zotero item under a different field name.
 *
 * Nothing is written when `items` is missing or when `items[field]` is
 * falsy, so existing values on `newItem` are never overwritten with blanks.
 *
 * @param {Object} newItem - Object that receives the value.
 * @param {Object} items - Map of scraped labels to values.
 * @param {String} field - Key to read from `items`.
 * @param {String} zoteroField - Property name to set on `newItem`.
 */
function associateData(newItem, items, field, zoteroField) {
    if (items && items[field]) {
        newItem[zoteroField] = items[field];
    }
}
/**
 * Build a newspaperArticle item from one New Zealand Herald article page
 * and hand it to Zotero via `newItem.complete()`.
 *
 * Everything is read from the already-loaded document with XPath, so the
 * page is not downloaded a second time. Any element that is missing is
 * skipped instead of raising an exception. This function does not call
 * `Zotero.done()`; the caller that drives the documents is expected to
 * signal completion once every page has been scraped.
 *
 * @param {Document} doc - Loaded article page.
 * @param {String} [url] - Address of the page; only used when `doc.location`
 *     is not available.
 */
function scrape(doc, url) {
    var namespace = doc.documentElement.namespaceURI;
    var nsResolver = namespace ? function (prefix) {
        if (prefix == 'x') return namespace; else return null;
    } : null;

    // Strips a leading clock time and weekday, e.g. "4:00 AM Friday ".
    var TIME_AND_WEEKDAY = /\d{1,2}:\d{1,2} (AM|PM) (\w)+ /g;

    /* Remove leading and trailing whitespace. */
    function trim(text) {
        return String(text).replace(/^\s+|\s+$/g, '');
    }

    /* First node matching the XPath, or null when nothing matches. */
    function firstNode(xpath) {
        return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
    }

    /* Trimmed text of the first node matching the XPath, or null. */
    function firstText(xpath) {
        var node = firstNode(xpath);
        return node ? trim(node.textContent || '') : null;
    }

    /* Text of the matching node with the time-of-day prefix removed, or null. */
    function dateText(xpath) {
        var text = firstText(xpath);
        return text === null ? null : trim(text.replace(TIME_AND_WEEKDAY, ''));
    }

    /*
     * Turn a byline such as "By A Smith and B Jones of NZPA" into a list of
     * names: drop "By ", split on "and" (commas are only treated as
     * separators when "and" is present, as before), and cut a trailing
     * "of <affiliation>" from every name.
     */
    function parseCredits(credits) {
        var byline = credits.replace(/\bBy\W+/g, '');
        var names = /\W\band\W+/.test(byline)
            ? byline.replace(/\W\band\W+/g, ', ').split(', ')
            : [byline];
        var authors = [];
        for (var i = 0; i < names.length; i++) {
            var name = trim(names[i].replace(/\W\bof\W(.*)/g, ''));
            if (name) {
                authors.push(name);
            }
        }
        return authors;
    }

    var newItem = new Zotero.Item('newspaperArticle');
    newItem.url = doc.location ? doc.location.href : url;
    newItem.publicationTitle = "New Zealand Herald";
    newItem.language = "English";

    // Title: first <h1>, with a placeholder when the page has none.
    newItem.title = firstText('//h1') || "No Title Found";

    // Authors: one creator per credited name; the "Unknown " placeholder is
    // kept for pages without a usable byline.
    var credits = firstText('//span[@class="credits"]');
    var authors = credits === null ? [] : parseCredits(credits);
    if (authors.length === 0) {
        authors.push("Unknown ");
    }
    for (var i = 0; i < authors.length; i++) {
        newItem.creators.push(Zotero.Utilities.cleanAuthor(authors[i], "author"));
    }

    // Date: the first span in the tools bar, unless that span is only an
    // "Updated"/"New" badge, in which case the date is in the second span.
    var date = dateText('//div[@class="tools"]/span');
    if (date == "Updated" || date == "New") {
        date = dateText('//div[@class="tools"]/span[2]');
    }
    if (date) {
        newItem.date = date;
    }

    // Section of the paper the article was filed under.
    var section = firstText('//div[@class="sectionHeader"]/span/a[1]');
    if (section) {
        newItem.section = section;
    }

    // Abstract: the page's <meta name="description"> content attribute.
    var descriptionNode = firstNode('//meta[@name="description"]');
    var description = descriptionNode ? descriptionNode.getAttribute('content') : null;
    if (description) {
        newItem.abstractNote = trim(description);
    }

    newItem.complete();
}
/**
 * Translator entry point: collect the article URLs to save and pass each
 * loaded article page to `scrape`.
 *
 * On a search-result page the links matched by `//p[@class="g"]/a` are
 * offered through `Zotero.selectItems` and only the chosen ones are
 * fetched; on an article page the page's own url is used. Nothing is
 * fetched when the selection is cancelled or no article was found.
 *
 * @param {Document} doc - The page the translator was invoked on.
 * @param {String} url - Address of that page.
 */
function doWeb(doc, url) {
    var namespace = doc.documentElement.namespaceURI;
    var nsResolver = namespace ? function (prefix) {
        if (prefix == 'x') return namespace; else return null;
    } : null;

    var articles = [];
    var pageType = detectWeb(doc, url);

    if (pageType == "multiple") {
        var items = {};
        var titles = doc.evaluate('//p[@class="g"]/a', doc, nsResolver, XPathResult.ANY_TYPE, null);
        var nextTitle;
        while ((nextTitle = titles.iterateNext())) {
            items[nextTitle.href] = nextTitle.textContent;
        }
        items = Zotero.selectItems(items);
        if (!items) {
            // Selection dialog was cancelled: nothing to save.
            return;
        }
        for (var i in items) {
            if (Object.prototype.hasOwnProperty.call(items, i)) {
                articles.push(i);
            }
        }
    } else if (pageType == "newspaperArticle") {
        articles = [url];
    }

    if (articles.length === 0) {
        return;
    }

    Zotero.debug(articles);
    // processDocuments loads every URL and calls scrape with the resulting
    // document; translation is marked done once all pages were handled.
    Zotero.Utilities.processDocuments(articles, scrape, function () { Zotero.done(); });
    Zotero.wait();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment