Created
March 18, 2011 19:02
-
-
Save mamhoff/876639 to your computer and use it in GitHub Desktop.
site translator for taz.de, Potsdamer Neueste Nachrichten, Der Freitag, Süddeutsche Zeitung, Frankfurter Rundschau, Spiegel Online, Welt Online, Tagesspiegel, Le Monde Diplomatique (de)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "1ab8b9a4-72b5-4ef4-adc8-4956a50718f7", | |
"label": "Der Freitag", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.freitag\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-26 15:55:00" | |
} | |
/* | |
Der Freitag Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
This site is good, but very, very slow. So when importing multiple Items, be patient! | |
http://www.freitag.de/search?modus=articles&SearchableText=Gaddafi* | |
http://www.freitag.de | |
http://www.freitag.de/guardian-world | |
*/ | |
/**
 * Classifies a freitag.de page for Zotero.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - the page URL (unused; detection is purely DOM-based)
 * @returns {String|undefined} "newspaperArticle" when the article headline is
 *   found, "multiple" when listing links are found, otherwise undefined
 */
function detectWeb(doc, url) {
	// The XPaths below carry no namespace prefix, so no resolver is passed.
	function exists(xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// According to the original author, this check keeps articles that were
	// taken over from Tagesspiegel from being detected.
	if (exists('//div[contains(@class, "artikel_content")]/h2')) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}

	// Pages with h3 "listing" links are offered as a multiple-item selection.
	if (exists(".//h3[contains(@class, 'listing')]/a")) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Builds a newspaperArticle item from a single freitag.de article page and
 * hands it to Zotero via item.complete().
 *
 * Every field is optional: a node that is missing from the page leaves the
 * corresponding field unset instead of aborting the whole import.
 *
 * @param {Document} doc - the article page
 * @param {String} url - the page URL (unused; doc.location.href is stored)
 */
function scrape(doc, url) {
	// First node matching `xpath`, or null. No XPath here uses a prefix, so no
	// namespace resolver is needed.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	function strip(text) {
		return text.replace(/^\s+|\s+$/g, '');
	}

	// Trimmed text content of the first match, or "" when nothing matches.
	function textOf(xpath) {
		var node = firstNode(xpath);
		return node ? strip(node.textContent) : "";
	}

	var i;
	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;
	newItem.publicationTitle = "Der Freitag";

	// The meta line is pipe-separated; the second field starts with the date
	// and the third field holds the author(s).
	var meta = textOf("//div[contains(@class, 'article-heading-meta-left')]").split("|");
	for (i = 0; i < meta.length; i++) {
		meta[i] = strip(meta[i]);
	}
	if (meta.length > 1 && meta[1]) {
		newItem.date = meta[1].split(/\s/)[0];
	}
	if (meta.length > 2) {
		var authors = meta[2].split(/\sund\s|\su\.\s|,\s|\//);
		for (i = 0; i < authors.length; i++) {
			var name = strip(authors[i]);
			if (/\s/.test(name)) { // only names that contain a space!
				newItem.creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
			}
		}
	}

	// Title: the first two segments of <title>, split at ":" or an em dash.
	// A title with a single segment is used as-is instead of "X: undefined".
	var titleParts = textOf('//title').split(/:|—/);
	var headline = [];
	for (i = 0; i < titleParts.length && i < 2; i++) {
		var part = strip(titleParts[i]);
		if (part) {
			headline.push(part);
		}
	}
	newItem.title = headline.join(": ");

	// Summary
	var summary = textOf("//div[@class='artikel_content']/h3");
	if (summary) {
		newItem.abstractNote = summary;
	}

	// no Tags, because Der Freitag doesn't supply any.

	// Section
	var section = textOf("//h1");
	if (section) {
		newItem.section = section;
	}

	// Snapshot: prefer the print version, fall back to the page itself.
	var printLink = firstNode(".//a[@id='article-drucken']");
	var snapshotUrl = printLink && printLink.href ? printLink.href : doc.location.href;
	newItem.attachments.push({url: snapshotUrl, title: doc.title, mimeType: "text/html"});

	newItem.complete();
}
/**
 * Translator entry point for freitag.de.
 *
 * On a "multiple" page the listing links are collected and offered to the
 * user through Zotero.selectItems; the chosen URLs are then loaded and passed
 * to scrape(). Any other page is scraped directly via its own URL.
 *
 * @param {Document} doc - the current page
 * @param {String} url - the current page URL
 */
function doWeb(doc, url) {
	var articles = [];

	if (detectWeb(doc, url) === "multiple") {
		var candidates = {};
		var links = doc.evaluate(".//h3[contains(@class, 'listing')]/a", doc, null, XPathResult.ANY_TYPE, null);
		var link;
		while ((link = links.iterateNext())) {
			candidates[link.href] = link.textContent;
		}

		var chosen = Zotero.selectItems(candidates);
		if (!chosen) {
			// The selection was cancelled: nothing to load, so do not start
			// an asynchronous translation at all.
			return;
		}
		for (var href in chosen) {
			articles.push(href);
		}
	} else {
		articles = [url];
	}

	Zotero.Utilities.processDocuments(articles, scrape, function () { Zotero.done(); });
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "488fe1e0-b7d2-406f-8257-5060418ce9b2", | |
"label": "fr-online.de", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.fr-online\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-26 15:45:54" | |
} | |
/* | |
fr-online.de Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
Works w/ search and overviews. I had to include an ugly hack that skips non-articles (photo streams), which would otherwise make the multiple-item import return an error. Test on:
http://www.fr-online.de/politik/spezials/wikileaks---die-enthuellungsplattform/-/4882932/4882932/-/index.html | |
http://www.fr-online.de/page/search/fr-online/home/suche/-/1473784/1473784/-/view/asSearch/-/index.html?contextsIds=1472660&docTypes=%22MauArticle,MauGallery,DMBrightcoveVideo,CMDownload,DMMovie,DMEvent,DMVenue%22&offset=5&pageNumber=2&searchMode=SIMPLEALL&sortBy=maupublicationdate&userQuery=Wikileaks | |
http://www.fr-online.de/wirtschaft/krise/-/1471908/1471908/-/index.html | |
http://www.fr-online.de/wirtschaft/krise/portugal-koennte-rettungspaket-benoetigen/-/1471908/8251842/-/index.html | |
*/ | |
/**
 * Classifies an fr-online.de page for Zotero.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - the page URL (unused; doc.location.href is inspected)
 * @returns {String|undefined} "newspaperArticle" when the article toolbox is
 *   present, "multiple" for search pages and pages with headline links,
 *   otherwise undefined
 */
function detectWeb(doc, url) {
	// The XPaths below carry no namespace prefix, so no resolver is passed.
	function exists(xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	if (exists(".//div[contains(@class, 'ArticleToolBoxIcons')]")) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}

	// Search pages are recognised by URL; overview pages by their headline links.
	var isSearchPage = /^http:\/\/www\.fr-online\.de\/.*?page\/search/.test(doc.location.href);
	if (isSearchPage || exists(".//*[@id='ContainerContent']/div/div[contains(@class, 'Headline2')]/a")) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Turns an all-uppercase author name into normally cased words, e.g.
 * "ANNA MÜLLER" -> "Anna Müller".
 *
 * Words are separated by any run of whitespace; empty pieces (leading,
 * trailing or repeated whitespace) are skipped. Each hyphen-separated part of
 * a word is capitalised on its own, so "KARL-HEINZ" becomes "Karl-Heinz".
 *
 * @param {String} author - the raw author name
 * @returns {String} the re-cased name, words joined by single spaces
 */
function authorCase(author) {
	var words = author.split(/\s+/);
	var fixed = [];
	for (var i = 0; i < words.length; i++) {
		if (!words[i]) {
			continue; // produced by leading/trailing whitespace
		}
		var parts = words[i].split("-");
		for (var j = 0; j < parts.length; j++) {
			parts[j] = parts[j].charAt(0).toUpperCase() + parts[j].slice(1).toLowerCase();
		}
		fixed.push(parts.join("-"));
	}
	return fixed.join(" ");
}
/**
 * Builds a newspaperArticle item from a single fr-online.de article page and
 * hands it to Zotero via item.complete().
 *
 * Pages without the article toolbox (e.g. galleries) are skipped silently so
 * that a multiple-item import does not fail on them. Fields whose source node
 * is missing are left unset.
 *
 * @param {Document} doc - the page to scrape
 * @param {String} url - the page URL (unused; doc.location.href is used)
 */
function scrape(doc, url) {
	// First node matching `xpath`, or null. No XPath here uses a prefix, so no
	// namespace resolver is needed.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	function strip(text) {
		return text.replace(/^\s+|\s+$/g, '');
	}

	// this protects against galleries...
	if (!firstNode(".//div[contains(@class, 'ArticleToolBoxIcons')]")) {
		return;
	}

	var i;
	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;
	newItem.publicationTitle = "fr-online.de";

	// Title: everything in <title> before the first "|".
	var titleNode = firstNode('//title');
	var pageTitle = titleNode ? titleNode.textContent : "";
	newItem.title = strip(pageTitle.split("|")[0]);

	// Section: the segment following the headline. When a "|" is present the
	// search starts after it, so a hyphen inside the headline cannot be
	// mistaken for the separator.
	var pipeAt = pageTitle.indexOf("|");
	var rawSection = pipeAt >= 0
		? pageTitle.slice(pipeAt + 1).split(/\||-/)[0]
		: pageTitle.split(/\||-/)[1];
	if (rawSection && strip(rawSection)) {
		newItem.section = strip(rawSection);
	}

	// Authors come from the author meta tag, separated by ", " or " und ".
	var authorNode = firstNode('//meta[@name="author"]');
	var names = authorNode && authorNode.content ? authorNode.content.split(/,\s|\sund\s/) : [];
	if (names.length && /Rundschau/.test(names[0])) { // Frankfurter Rundschau is no author.
		names[0] = "";
	}
	for (i = 0; i < names.length; i++) {
		var name = strip(names[i]);
		if (/\s/.test(name)) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(authorCase(name), "author"));
		}
	}

	// Summary
	var summaryNode = firstNode('//meta[@name="description"]');
	if (summaryNode && summaryNode.content) {
		newItem.abstractNote = strip(summaryNode.content);
	}

	// Date: "Datum: D | M | Y" is rearranged to "Y-M-D"; anything that does
	// not have three parts is ignored.
	var dateNode = firstNode(".//div[contains(@class, 'TB_Date')]");
	if (dateNode) {
		var parts = dateNode.textContent.replace(/^\s*Datum:\s|\s/g, '').split("|");
		if (parts.length >= 3 && parts[0] && parts[1] && parts[2]) {
			newItem.date = parts[2] + "-" + parts[1] + "-" + parts[0];
		}
	}

	// No Tags. FR does not provide consistently meaningful ones.

	// Attachment: the print version is derived from the article URL.
	var printurl = doc.location.href;
	if (printurl.indexOf("asFirstTeaser") !== -1) {
		printurl = printurl.replace("asFirstTeaser", "printVersion");
	} else {
		printurl = printurl.replace(/-\/index.html$/, "-/view/printVersion/-/index.html");
	}
	newItem.attachments.push({url: printurl, title: doc.title, mimeType: "text/html"});

	newItem.complete();
}
/**
 * Translator entry point for fr-online.de.
 *
 * On a "multiple" page the headline links are collected and offered to the
 * user through Zotero.selectItems; the chosen URLs are then loaded and passed
 * to scrape(). Any other page is scraped directly via its own URL.
 *
 * @param {Document} doc - the current page
 * @param {String} url - the current page URL
 */
function doWeb(doc, url) {
	var articles = [];

	if (detectWeb(doc, url) === "multiple") {
		var candidates = {};
		var links = doc.evaluate(".//*[@id='ContainerContentLinie']/div/h2/a|.//*[@id='ContainerContent']/div/div[contains(@class, 'Headline2')]/a|.//*[@id='ContainerContent']/div/div/div[contains(@class, 'link_article')]/a|.//*[@id='Main']/div[contains(@class, '2ColHP')]/div/div/div[contains(@class, 'Headline2')]/a", doc, null, XPathResult.ANY_TYPE, null);
		var link;
		while ((link = links.iterateNext())) {
			// This excludes the videos, whose links terminate in a hash.
			if (/html$/.test(link.href)) {
				candidates[link.href] = link.textContent;
			}
		}

		var chosen = Zotero.selectItems(candidates);
		if (!chosen) {
			// The selection was cancelled: nothing to load, so do not start
			// an asynchronous translation at all.
			return;
		}
		for (var href in chosen) {
			articles.push(href);
		}
	} else {
		articles = [url];
	}

	Zotero.Utilities.processDocuments(articles, scrape, function () { Zotero.done(); });
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "530cf18c-e80a-4e67-ae9c-9b8c08591610", | |
"label": "Le monde diplomatique", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.monde-diplomatique\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-26 16:50:57" | |
} | |
/* | |
Le Monde Diplomatique (de) Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
Works really well. Try here: | |
http://www.monde-diplomatique.de/pm/2011/02/11/a0054.text.name,askexfz1c.n,0 | |
http://www.monde-diplomatique.de/pm/.search?tx=Globalisierung | |
*/ | |
/**
 * Classifies a monde-diplomatique.de page for Zotero, based on the URL alone.
 *
 * @param {Document} doc - the loaded page (not inspected)
 * @param {String} url - the page URL
 * @returns {String|undefined} "newspaperArticle" for /pm/YYYY/MM… URLs,
 *   "multiple" for any other URL containing "search", otherwise undefined
 */
function detectWeb(doc, url) {
	if (/^http:\/\/www\.monde-diplomatique\.de\/pm\/\d\d\d\d\/\d\d/.test(url)) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}
	if (/search/.test(url)) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Builds a newspaperArticle item from a single Le Monde diplomatique (de)
 * article page and hands it to Zotero via item.complete().
 *
 * Pages without the article headline are skipped silently. Author, date and
 * summary are optional; a missing node leaves the field unset.
 *
 * @param {Document} doc - the page to scrape
 * @param {String} url - the page URL (unused; doc.location.href is used)
 */
function scrape(doc, url) {
	// First node matching `xpath`, or null. No XPath here uses a prefix, so no
	// namespace resolver is needed.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	function strip(text) {
		return text.replace(/^\s+|\s+$/g, '');
	}

	// Trimmed text content of the first match, or "" when nothing matches.
	function textOf(xpath) {
		var node = firstNode(xpath);
		return node ? strip(node.textContent) : "";
	}

	var titleNode = firstNode(".//*[@id='haupt']/div/h3");
	if (!titleNode) {
		return; // no headline: not an article page
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;
	newItem.title = strip(titleNode.textContent);
	newItem.publicationTitle = "Le Monde Diplomatique";

	// Authors: drop the leading "von", then split at "|". Empty pieces are
	// skipped so that a page without byline does not get an empty creator.
	var byline = textOf(".//*[@id='haupt']/div/h4").replace(/^von\s+/i, '');
	var names = byline.split("|");
	for (var i = 0; i < names.length; i++) {
		var name = strip(names[i]);
		if (name) {
			newItem.creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
		}
	}

	// No Tags

	// Date: whatever follows " vom " in the issue heading.
	var dateParts = textOf(".//*[@id='haupt']/h2").split(" vom ");
	if (dateParts.length > 1 && dateParts[1]) {
		newItem.date = dateParts[1];
	}

	// Summary
	var summary = textOf(".//*[@id='haupt']/div/h5");
	if (summary) {
		newItem.abstractNote = summary;
	}

	newItem.attachments.push({url: doc.location.href, title: doc.title, mimeType: "text/html"});
	newItem.complete();
}
/**
 * Translator entry point for monde-diplomatique.de.
 *
 * On a "multiple" page the result links that point at articles are collected
 * and offered to the user through Zotero.selectItems; the chosen URLs are
 * then loaded and passed to scrape(). Any other page is scraped directly via
 * its own URL.
 *
 * @param {Document} doc - the current page
 * @param {String} url - the current page URL
 */
function doWeb(doc, url) {
	var articles = [];

	if (detectWeb(doc, url) === "multiple") {
		var articleUrl = /^http:\/\/www\.monde-diplomatique\.de\/pm\/\d\d\d\d\/\d\d/;
		var candidates = {};
		var links = doc.evaluate("//*[@id='haupt']/div/p/a", doc, null, XPathResult.ANY_TYPE, null);
		var link;
		while ((link = links.iterateNext())) {
			// Only links with an article-style URL are offered.
			if (articleUrl.test(link.href)) {
				candidates[link.href] = link.textContent;
			}
		}

		var chosen = Zotero.selectItems(candidates);
		if (!chosen) {
			// The selection was cancelled: nothing to load, so do not start
			// an asynchronous translation at all.
			return;
		}
		for (var href in chosen) {
			articles.push(href);
		}
	} else {
		articles = [url];
	}

	Zotero.Utilities.processDocuments(articles, scrape, function () { Zotero.done(); });
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "9405db4b-be7f-42ab-86ca-430226be9b35", | |
"label": "Potsdamer Neueste Nachrichten", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.pnn\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-26 13:42:35" | |
} | |
/* | |
Potsdamer Neueste Nachrichten Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
The articles themselves are quite badly tagged, so that the translator sometimes doesn't capture the summary or the authors. | |
Test it with: | |
http://www.pnn.de/archiv/?type=archiv&phrase=Krise | |
http://www.pnn.de/zeitung/ | |
http://www.pnn.de/zeitung/12.01.2011/ | |
http://www.pnn.de/titelseite/364860/ | |
*/ | |
/**
 * Classifies a pnn.de page for Zotero.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - the page URL (unused; detection is purely DOM-based)
 * @returns {String|undefined} "newspaperArticle" when a print link is
 *   present, "multiple" when list headline links are present, otherwise
 *   undefined
 */
function detectWeb(doc, url) {
	// The XPaths below carry no namespace prefix, so no resolver is passed.
	function exists(xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// only articles have a print button.
	if (exists(".//a[contains(@class, 'print')]")) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}

	if (exists(".//ul/li/h2/a")) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Builds a newspaperArticle item from a single pnn.de article page and hands
 * it to Zotero via item.complete().
 *
 * The pages are badly tagged, so summary, date, authors and section are all
 * optional: a missing node leaves the field unset instead of throwing.
 *
 * @param {Document} doc - the article page
 * @param {String} url - the page URL (unused; doc.location.href is used)
 */
function scrape(doc, url) {
	// First node matching `xpath`, or null. No XPath here uses a prefix, so no
	// namespace resolver is needed.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	function strip(text) {
		return text.replace(/^\s+|\s+$/g, '');
	}

	// Untrimmed text content of the first match, or "" when nothing matches.
	function rawText(xpath) {
		var node = firstNode(xpath);
		return node ? node.textContent : "";
	}

	// Searches the article text for a byline. The text is cut at runs of two
	// or more whitespace characters (taken as paragraph breaks) and at "|";
	// the first piece matching one of the known byline patterns wins.
	function findByline(articleText) {
		var patterns = [
			/^Von\s+(.*)/,
			/^Das\sGespräch\sführte(.*)\.$/,
			/^Interview:\s(.*)Foto:.*/
		];
		var pieces = articleText.split(/\s{2,}|\|/);
		for (var p = 0; p < pieces.length; p++) {
			var piece = strip(pieces[p]);
			for (var q = 0; q < patterns.length; q++) {
				var hit = patterns[q].exec(piece);
				if (hit && strip(hit[1])) {
					return strip(hit[1]);
				}
			}
		}
		return "";
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;
	newItem.publicationTitle = "Potsdamer Neueste Nachrichten";

	// Title: everything before the em dash, with German quotation marks
	// normalised and surrounding whitespace removed.
	var title = rawText('//title').split("—")[0];
	newItem.title = strip(title.replace(/„|“/g, '"'));

	// Summary (the parenthesised date inside the teaser is dropped).
	var summary = strip(rawText(".//p[contains(@class, 'teaser')]").replace(/\(.*\)/, ''));
	if (summary) {
		newItem.abstractNote = summary;
	}

	// Date, without the surrounding parentheses.
	var date = strip(rawText("//*[contains(@class, 'teaser')]/span[contains(@class, 'date')]").replace(/[()]/g, ''));
	if (date) {
		newItem.date = date;
	}

	// Authors. Sometimes the author is in italics in the paragraph (easy
	// case); otherwise the whole article text has to be searched.
	var italic = firstNode(".//*[@id='teaser']/p/i");
	var byline = italic ? strip(italic.textContent) : findByline(rawText(".//*[@id='teaser']"));
	var names = byline.split(/\sund\s|\su\.\s|,\s/);
	for (var i = 0; i < names.length; i++) {
		if (/\s/.test(names[i])) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(names[i], "author"));
		}
	}

	newItem.attachments.push({url: doc.location.href, title: doc.title, mimeType: "text/html"});

	// Section
	var section = strip(rawText(".//*[@id='sidebar-left']/ul/li[contains(@class, 'active')]"));
	if (section) {
		newItem.section = section;
	}

	newItem.complete();
}
/**
 * Translator entry point for pnn.de.
 *
 * On a "multiple" page the list headline links are collected and offered to
 * the user through Zotero.selectItems; the chosen URLs are then loaded and
 * passed to scrape(). Any other page is scraped directly via its own URL.
 *
 * @param {Document} doc - the current page
 * @param {String} url - the current page URL
 */
function doWeb(doc, url) {
	var articles = [];

	if (detectWeb(doc, url) === "multiple") {
		var candidates = {};
		var links = doc.evaluate(".//ul/li/h2/a", doc, null, XPathResult.ANY_TYPE, null);
		var link;
		while ((link = links.iterateNext())) {
			candidates[link.href] = link.textContent;
		}

		var chosen = Zotero.selectItems(candidates);
		if (!chosen) {
			// The selection was cancelled: nothing to load, so do not start
			// an asynchronous translation at all.
			return;
		}
		for (var href in chosen) {
			articles.push(href);
		}
	} else {
		articles = [url];
	}

	Zotero.Utilities.processDocuments(articles, scrape, function () { Zotero.done(); });
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "eef50507-c756-4081-86fd-700ae4ebf22e", | |
"label": "Spiegel Online", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.spiegel\\.de/", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-04-01 11:56:06" | |
} | |
/* | |
Spiegel Online Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
Test with the following URLs: | |
http://www.spiegel.de/suche/index.html?suchbegriff=AKW | |
http://www.spiegel.de/international/search/index.html?suchbegriff=Crisis | |
http://www.spiegel.de/international/topic/german_french_relations/ | |
http://www.spiegel.de/international/europe/0,1518,700530,00.html | |
*/ | |
/**
 * Classifies a spiegel.de page for Zotero.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - the page URL (unused; doc.location.href is inspected)
 * @returns {String|undefined} "newspaperArticle" when the article function
 *   box is present, "multiple" for topic and search pages (German and
 *   international), otherwise undefined
 */
function detectWeb(doc, url) {
	// The XPath carries no namespace prefix, so no resolver is passed.
	var articleBox = doc.evaluate(".//div[@id='spArticleFunctions']", doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	if (articleBox) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}

	// /thema, /suche, /international/search and /international/topic are listings.
	var listingUrl = /^http:\/\/www\.spiegel\.de\/(?:thema|suche|international\/search|international\/topic)/;
	if (listingUrl.test(doc.location.href)) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Builds a newspaperArticle item from a single spiegel.de article page and
 * hands it to Zotero via item.complete().
 *
 * URLs under /spiegel are treated as the print archive ("Der Spiegel", PDF
 * attachment, "Last, First" bylines); everything else as Spiegel Online.
 * Every field is optional: a missing node leaves the field unset.
 *
 * @param {Document} doc - the article page
 * @param {String} url - the page URL (unused; doc.location.href is used)
 */
function scrape(doc, url) {
	// First node matching `xpath`, or null. No XPath here uses a prefix, so no
	// namespace resolver is needed.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	function strip(text) {
		return text.replace(/^\s+|\s+$/g, '');
	}

	var i, node;
	var href = doc.location.href;
	var isPrintArchive = /^http:\/\/www\.spiegel\.de\/spiegel/.test(href);

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = href;
	newItem.publicationTitle = isPrintArchive ? "Der Spiegel" : "Spiegel Online";

	// Title: the article headline, else the part of <title> before " - ".
	node = firstNode(".//*[@id='spArticleColumn']/h2|.//*[@id='spArticleColumn ']/h2");
	if (node) {
		newItem.title = node.textContent;
	} else {
		node = firstNode('//title');
		if (node) {
			newItem.title = node.textContent.split(" - ")[0];
		}
	}

	// Tags: the first five keywords are generic or section info and are dropped.
	node = firstNode('//meta[contains(@name, "keywords")]');
	if (node && node.content) {
		var tags = node.content.split(/,/).slice(5);
		for (i = 0; i < tags.length; i++) {
			var tag = strip(tags[i]);
			if (tag) {
				newItem.tags.push(tag);
			}
		}
	}

	// Author: most of the time the author has its own tag; sometimes it is in
	// italics in the teaser instead.
	node = firstNode(".//p[contains(@class, 'spAuthor')]") || firstNode(".//*[@id='spIntroTeaser']/strong/i");
	var byline = "";
	if (node) {
		byline = node.textContent;
		Zotero.debug(byline);
	}
	// remove whitespace around the author and the "By "/"Von " at the beginning
	byline = byline.replace(/^\s*By\s|^\s*Von\s|\s*$/g, '');

	// Spiegel Online and the Spiegel archive format the author line differently.
	var names;
	if (isPrintArchive) {
		names = byline.split(/\sund\s|\su\.\s|;\s|\sand\s/);
		for (i = 0; i < names.length; i++) {
			names[i] = names[i].replace(/(.*),\s(.*)/, '$2 $1'); // "Last, First" -> "First Last"
		}
	} else {
		// Remove a trailing ", location" or " in location". The "in" must be a
		// word of its own, otherwise names such as "Martin Fischer" are cut
		// off at "Mart".
		byline = byline.replace(/,\s\S+$|\sin\s\S*$/, "");
		names = byline.split(/\sund\s|\su\.\s|,\s|\sand\s/);
	}
	for (i = 0; i < names.length; i++) {
		if (/\s/.test(names[i])) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(names[i], "author"));
		}
	}

	// Section
	node = firstNode(".//ul[contains(@id, 'spChannel')]/li/ul/li/a[contains(@class, 'spActive')]");
	if (node) {
		newItem.section = node.textContent;
	}

	// Attachment
	if (isPrintArchive) {
		node = firstNode(".//div[@id='spArticleFunctions']/ul/li[1]/a");
		if (node && node.href) {
			Zotero.debug(node.href);
			newItem.attachments.push({url: node.href, title: doc.title, mimeType: "application/pdf"});
		}
	} else {
		// The print view wants "druck-" inserted in front of the article id.
		var printurl = href.replace(/(\d+,\d+\.html.*$)/, 'druck-$1');
		newItem.attachments.push({url: printurl, title: doc.title, mimeType: "text/html"});
	}

	// Summary
	node = firstNode(".//p[@id='spIntroTeaser']");
	if (node) {
		newItem.abstractNote = strip(node.textContent);
	}

	// Date: the short-date heading (mm/dd/yyyy is turned into dd.mm.yyyy),
	// else the date meta tag (yyyy-mm-dd is turned into dd.mm.yyyy).
	var date;
	node = firstNode(".//h5[contains(@id, 'ShortDate')]");
	if (node) {
		date = node.textContent.replace(/(\d\d)\/(\d\d)\/(\d\d\d\d)/, "$2.$1.$3");
	} else {
		node = firstNode("//meta[@name='date']");
		if (node && node.content) {
			date = node.content.replace(/(\d\d\d\d)-(\d\d)-(\d\d)/, '$3.$2.$1');
		}
	}
	if (date) {
		newItem.date = strip(date);
	}

	newItem.complete();
}
/**
 * Translator entry point for Spiegel pages.
 *
 * On a page that detectWeb() classifies as "multiple", collects the teaser
 * links, lets the user choose among them via Zotero.selectItems() and scrapes
 * every chosen article. On any other page the current URL is scraped directly.
 *
 * @param {Document} doc - the page being translated
 * @param {String} url - URL of that page
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Each kind of listing page keeps its teaser links in a different place.
	var listings = [
		[/^http\:\/\/www\.spiegel\.de\/thema/, ".//*[@id='spTeaserColumn']/div/h3/a"],
		[/^http\:\/\/www\.spiegel\.de\/suche/, ".//*[@id='spTeaserColumn']/div/a"],
		[/^http\:\/\/www\.spiegel\.de\/international\/search/, "//*[@id='spTeaserColumn']/div/a"],
		[/^http\:\/\/www\.spiegel\.de\/international\/topic/, ".//*[@id='spTeaserColumn']/div/h3/a"]
	];

	var articles = [];
	if (detectWeb(doc, url) == "multiple") {
		var items = {};
		var here = doc.location.href;
		var titles = null;
		for (var k = 0; k < listings.length; k++) {
			if (here.match(listings[k][0])) {
				titles = doc.evaluate(listings[k][1], doc, nsResolver, XPathResult.ANY_TYPE, null);
				break;
			}
		}
		// titles stays null when the URL matches none of the known listing
		// layouts; previously this case crashed with a TypeError.
		var next_title;
		while (titles && (next_title = titles.iterateNext())) {
			// The search also returns manager-magazin.de hits, which this translator cannot scrape.
			if (next_title.textContent != "mehr..." && next_title.href.match(/^http:\/\/www\.spiegel\.de\//)) {
				items[next_title.href] = Zotero.Utilities.trim(next_title.textContent);
			}
		}
		items = Zotero.selectItems(items);
		for (var i in items) {
			articles.push(i);
		}
	} else {
		articles = [url];
	}
	Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "2e4ebd19-83ab-4a56-8fa6-bcd52b576470", | |
"label": "Sueddeutsche.de", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.sueddeutsche\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-26 15:02:54" | |
} | |
/* | |
Sueddeutsche.de Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
This one has the search function on a different host, so I cannot scan the search results. A multiple option, though, is given for the page itself. | |
Test here: | |
http://www.sueddeutsche.de/politik | |
http://www.sueddeutsche.de/thema/Krieg_in_Libyen | |
http://www.sueddeutsche.de/muenchen | |
*/ | |
/**
 * Classifies a sueddeutsche.de page.
 *
 * @param {Document} doc - the page to inspect
 * @param {String} url - URL of that page (not used)
 * @returns {String|undefined} "newspaperArticle" when an article headline is
 *     present, "multiple" when teaser links are present, otherwise undefined
 */
function detectWeb(doc, url) {
	// Namespace boilerplate from the translator template; the XPaths below
	// carry no prefix and are evaluated without a resolver.
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		return (prefix == 'x') ? namespace : null;
	} : null;

	// Checked in order; the first XPath that yields a node decides the type.
	var candidates = [
		["newspaperArticle", ".//h1[@id='articleTitle']"],
		["multiple", ".//*[contains(@class, 'maincolumn')]/ol/li/a|.//*[contains(@class, 'maincolumn')]/ol/li/ul/li/a"]
	];
	for (var k = 0; k < candidates.length; k++) {
		var hit = doc.evaluate(candidates[k][1], doc, null, XPathResult.ANY_TYPE, null).iterateNext();
		if (hit) {
			Zotero.debug(candidates[k][0]);
			return candidates[k][0];
		}
	}
}
/**
 * Scrapes one sueddeutsche.de article page into a Zotero "newspaperArticle".
 *
 * Pages without an article headline (e.g. image galleries, which are linked
 * like articles) are skipped silently. Every other field is optional: when
 * the page lacks the corresponding node the field is simply left unset
 * instead of aborting the whole item.
 *
 * @param {Document} doc - the article page
 * @param {String} url - URL of that page (doc.location.href is used instead)
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// First node matching the XPath, or null when the page has none.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// Image galleries have no article headline; they are not articles.
	var headline = firstNode(".//h1[@id='articleTitle']");
	if (!headline) {
		return;
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title: prefer the og:title meta tag, fall back to the headline itself.
	var titleMeta = firstNode('//meta[contains(@property, "og:title")]');
	var title = (titleMeta && titleMeta.content) || headline.textContent;
	newItem.title = Zotero.Utilities.trim(title.replace(/\s?–\s?/, ": "));

	// Author. The SZ uses the author field for whatever they like; sometimes there is none.
	var authorNode = firstNode('.//span[contains(@class, "hcard fn")]');
	var author = authorNode ? authorNode.textContent.replace(/^Von\s/, '') : "";
	// One case seen: a full sentence as the "author", with no author in it.
	if (author.match(/\.$/)) {
		author = "";
	}
	// Multiple authors are separated by comma, "und" or "u.".
	var names = author.split(/\sund\s|\su\.\s|\,\s/);
	Zotero.debug(names);
	for (var i = 0; i < names.length; i++) {
		if (names[i].match(/\s/)) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(names[i], "author"));
		}
	}

	// Summary
	var summaryMeta = firstNode('//meta[contains(@property, "og:description")]');
	if (summaryMeta && summaryMeta.content) {
		newItem.abstractNote = summaryMeta.content;
	}

	// Date: keep only the first whitespace-delimited token of the timestamp.
	var dateNode = firstNode(".//*[@class='updated']/*[@class='value']");
	if (dateNode) {
		newItem.date = Zotero.Utilities.trim(dateNode.textContent).split(/\s/)[0];
	}

	// Section: the first keyword of the keywords meta tag.
	var keywordsMeta = firstNode("//meta[contains(@name, 'keywords')]");
	if (keywordsMeta && keywordsMeta.content) {
		newItem.section = keywordsMeta.content.split(",")[0];
	}

	// Tags: one per line of the topic list; blank lines are dropped.
	var tagList = firstNode(".//ul[@class='themen']");
	if (tagList) {
		var tags = tagList.textContent.split(/\n/);
		for (var t = 0; t < tags.length; t++) {
			var tag = Zotero.Utilities.trim(tags[t]);
			if (tag) {
				newItem.tags.push(tag);
			}
		}
	}

	// Publication
	newItem.publicationTitle = "sueddeutsche.de";

	// Attachment: the print view lives at ".../2.220/<last path segment>".
	// A replacer function avoids the ambiguous "$12" in a replacement string.
	var printurl = doc.location.href.replace(/(.*\/)(.*$)/, function(whole, dir, file) {
		return dir + '2.220/' + file;
	});
	Zotero.debug(printurl);
	newItem.attachments.push({url:printurl, title:doc.title, mimeType:"text/html"});

	newItem.complete();
}
/**
 * Translator entry point for sueddeutsche.de.
 *
 * On "multiple" pages the teaser links that stay on www.sueddeutsche.de are
 * offered to the user via Zotero.selectItems(); the chosen URLs (or, on a
 * single article, the page URL itself) are handed to scrape().
 *
 * @param {Document} doc - the page being translated
 * @param {String} url - URL of that page
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		return (prefix == 'x') ? namespace : null;
	} : null;

	var teaserXPath = ".//*[contains(@class, 'maincolumn')]/ol/li/a|.//*[contains(@class, 'maincolumn')]/ol/li/ul/li/a";
	var articles = [];

	if (detectWeb(doc, url) != "multiple") {
		articles = [url];
	} else {
		var items = {};
		var links = doc.evaluate(teaserXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			if (!link.href.match(/^http\:\/\/www\.sueddeutsche\.de/)) {
				continue;
			}
			// Tidy the label: drop the first line break, turn dashes into
			// ": " and collapse whitespace runs.
			items[link.href] = Zotero.Utilities.trim(link.textContent)
				.replace(/\n/, '')
				.replace(/\s–|—/g, ': ')
				.replace(/\s+/g, ' ');
		}
		items = Zotero.selectItems(items);
		Zotero.debug(items);
		for (var href in items) {
			articles.push(href);
		}
	}

	Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "374ac2a5-dd45-461e-bf1f-bf90c2eb7085", | |
"label": "Der Tagesspiegel", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.tagesspiegel\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-30 22:04:46" | |
} | |
/* | |
Tagesspiegel Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/**
 * Classifies a tagesspiegel.de page.
 *
 * @param {Document} doc - the page to inspect
 * @param {String} url - URL of that page (doc.location.href is used instead)
 * @returns {String|undefined} "newspaperArticle" for article pages,
 *     "multiple" for search-result URLs and for pages with teaser links,
 *     otherwise undefined
 */
function detectWeb(doc, url) {
	// Namespace boilerplate from the translator template; the XPaths below
	// carry no prefix and are evaluated without a resolver.
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		return (prefix == 'x') ? namespace : null;
	} : null;

	// True when the XPath yields at least one node.
	function matches(xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	}

	var teasers = "//*[@id='hcf-wrapper']/div[2]/div[contains(@class, 'hcf-main-col')]/div/ul/li/h2/a|//*[@id='hcf-wrapper']/div[@class='hcf-lower-hp']/div/ul/li/ul/li/a|//ul/li[contains(@class, 'hcf-teaser')]/h2/a";
	var kind;
	if (matches(".//div[@class='hcf-article']")) {
		kind = "newspaperArticle";
	} else if (doc.location.href.match(/http\:\/\/www\.tagesspiegel\.de\/suchergebnis\//) || matches(teasers)) {
		// Search-result URLs count as "multiple" without looking at the DOM.
		kind = "multiple";
	}
	if (kind) {
		Zotero.debug(kind);
	}
	return kind;
}
/**
 * Scrapes one tagesspiegel.de article page into a Zotero "newspaperArticle".
 *
 * Every field is read defensively: when the page lacks the corresponding
 * node the field is left unset (the title falls back to doc.title) instead
 * of throwing and losing the whole item.
 *
 * @param {Document} doc - the article page
 * @param {String} url - URL of that page (doc.location.href is used instead)
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// First node matching the XPath, or null when the page has none.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}
	// Trimmed text of the first matching node, or "" when there is none.
	function textOf(xpath) {
		var node = firstNode(xpath);
		return node ? Zotero.Utilities.trim(node.textContent) : "";
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title
	newItem.title = textOf("//div[@class='hcf-article']/h1") || doc.title;

	// Date: the first ten characters of the date line
	var date = textOf("//span[contains(@class, 'hcf-date')]");
	if (date) {
		newItem.date = date.substr(0, 10);
	}

	// Summary
	var summary = textOf(".//p[@class='hcf-teaser']");
	if (summary) {
		newItem.abstractNote = summary;
	}

	// Publication Title
	newItem.publicationTitle = "Der Tagesspiegel Online";

	// Authors: strip the "Von " / "Kommentar von " lead-in, split on ", "
	var byline = textOf("//span[contains(@class, 'hcf-author')]");
	if (byline) {
		Zotero.debug(byline);
		var names = byline.replace(/^Von\s|Kommentar\svon\s/g, '').split(/,\s/);
		for (var i = 0; i < names.length; i++) {
			var name = Zotero.Utilities.trim(names[i]);
			if (name) {
				newItem.creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
			}
		}
	}

	// Print URL (add "v_print," before the article ID and "?p=" at the end)
	var printurl = doc.location.href.replace(/^(.*\/)(\d+\.html$)/, '$1v_print,$2?p=');
	newItem.attachments.push({url:printurl, title:doc.title, mimeType:"text/html"});

	// Tags: comma-separated keywords; blanks are dropped
	var keywordsMeta = firstNode("//meta[@name='keywords']");
	if (keywordsMeta && keywordsMeta.content) {
		var tags = keywordsMeta.content.split(",");
		for (var t = 0; t < tags.length; t++) {
			var tag = Zotero.Utilities.trim(tags[t]);
			if (tag) {
				newItem.tags.push(tag);
			}
		}
	}

	newItem.complete();
}
/**
 * Translator entry point for tagesspiegel.de.
 *
 * On "multiple" pages the teaser links are offered to the user via
 * Zotero.selectItems(); links into the media center (image galleries and
 * videos) are left out. The chosen URLs, or the page URL itself on a single
 * article, are handed to scrape().
 *
 * @param {Document} doc - the page being translated
 * @param {String} url - URL of that page
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		return (prefix == 'x') ? namespace : null;
	} : null;

	var teaserXPath = "//*[@id='hcf-wrapper']/div[2]/div[contains(@class, 'hcf-main-col')]/div/ul/li/h2/a|//*[@id='hcf-wrapper']/div[@class='hcf-lower-hp']/div/ul/li/ul/li/a|//ul/li[contains(@class, 'hcf-teaser')]/h2/a";
	var articles = [];

	if (detectWeb(doc, url) == "multiple") {
		var items = {};
		var links = doc.evaluate(teaserXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
		for (;;) {
			var link = links.iterateNext();
			if (!link) {
				break;
			}
			// The negative lookahead excludes the image galleries and videos.
			var isArticleLink = link.href.match(/http\:\/\/www\.tagesspiegel\.de\/(?!mediacenter)/);
			if (isArticleLink) {
				items[link.href] = link.textContent;
			}
		}
		items = Zotero.selectItems(items);
		for (var href in items) {
			articles.push(href);
		}
	} else {
		articles = [url];
	}

	Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "d84574f1-e4d6-4337-934f-bf9d01173bf0", | |
"label": "taz.de", | |
"creator": "Martin Meyerhoff", | |
"target": "https?://www\\.taz\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-26 17:18:22" | |
} | |
/* | |
taz.de Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
This site is rather heterogeneous in where it places the author and other metadata. | |
Whenever the script doesn't find something it just returns an empty field. | |
Try on: | |
http://www.taz.de/ | |
http://www.taz.de/1/archiv/detailsuche/?tx_hptazsearch_pi1[search_term]=Krise&tx_hptazsearch_pi2[submit_button].x=0&tx_hptazsearch_pi2[submit_button].y=0 | |
http://www.taz.de/1/debatte/kolumnen/artikel/1/haengt-sie-hoeher-1/ | |
*/ | |
/**
 * Classifies a taz.de page.
 *
 * @param {Document} doc - the page to inspect
 * @param {String} url - URL of that page (not used)
 * @returns {String|undefined} "newspaperArticle" when the page has an h1,
 *     "multiple" when it has overview or search teaser headings,
 *     otherwise undefined
 */
function detectWeb(doc, url) {
	// Namespace boilerplate from the translator template; the XPaths below
	// carry no prefix and are evaluated without a resolver.
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		return (prefix == 'x') ? namespace : null;
	} : null;

	// True when the XPath yields at least one node.
	function found(xpath) {
		return Boolean(doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext());
	}

	if (found(".//h1")) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}
	// Overview pages first, then search-result pages.
	if (found(".//*[@id='hauptspalte']/div/ul/li/a/h3")) {
		Zotero.debug("multiple");
		return "multiple";
	}
	if (found(".//*[@id='hauptspalte']/div/div/ul/li/a/h3")) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Turns an all-uppercase author name into a normally cased one.
 *
 * Every run of characters between whitespace and hyphens is rewritten with
 * its first character upper-cased and the rest lower-cased. The separators
 * themselves are kept, so "HANS-PETER MÜLLER" becomes "Hans-Peter Müller".
 * Empty input and doubled separators are handled without throwing, and no
 * trailing space is appended.
 *
 * @param {String} author - author name, typically all uppercase
 * @returns {String} the re-cased name
 */
function authorCase(author) {
	return author.replace(/[^\s-]+/g, function(word) {
		return word.charAt(0).toUpperCase() + word.substr(1).toLowerCase();
	});
}
/**
 * Scrapes one taz.de article page into a Zotero "newspaperArticle".
 *
 * The site is inconsistent about where it puts the author: a dedicated span
 * is preferred, otherwise a meta description that starts with
 * "KOMMENTAR VON ..." or "KOLUMNE VON ..." is used. Missing description or
 * date nodes leave the corresponding field unset instead of throwing.
 *
 * @param {Document} doc - the article page
 * @param {String} url - URL of that page (doc.location.href is used instead)
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// First node matching the XPath, or null when the page has none.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title: the part of <title> before the first " - "
	var titleNode = firstNode('//title');
	var title = titleNode ? titleNode.textContent : (doc.title || "");
	newItem.title = title.split(" - ")[0];

	// Summary: the meta description without the trailing " VON <author>"
	// and without the KOMMENTAR/KOLUMNE marker.
	var descriptionNode = firstNode('//meta[contains(@name, "description")]');
	var description = (descriptionNode && descriptionNode.content) || "";
	var summary = description.replace(/\sVON.*$/g, '');
	newItem.abstractNote = summary.replace(/KOMMENTAR|KOLUMNE.*$/g, '');

	// Authors
	// The alternation is grouped so that "^" and "\sVON\s" apply to both
	// markers; ungrouped, a bare leading "KOMMENTAR" counted as a byline.
	var bylineLead = /^(?:KOMMENTAR|KOLUMNE)\sVON\s/;
	var authorNode = firstNode("//*[contains(@class, 'sectbody')]/*/span[contains(@class, 'author')]");
	var author;
	if (authorNode) {
		author = authorNode.textContent;
	} else if (description.match(bylineLead)) {
		Zotero.debug(description);
		author = description.replace(bylineLead, '');
	} else {
		author = "";
	}
	author = author.replace(/^\s*|\s*$/g, '');
	author = author.replace(".", ". "); // in case a space is missing.
	author = author.replace("VON ", '');
	author = author.replace(/\s+/g, ' ');
	var names = author.split(/\sund\s|\su\.\s|\,\s|\&/);
	for (var i = 0; i < names.length; i++) {
		var name = names[i].replace(/^\s*|\s*$/g, '');
		if (name.match(/\s/)) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(authorCase(name), "author"));
		}
	}

	// Section
	var sectionNode = firstNode(".//*[contains(@class, 'selected')]/ul/li[contains(@class, 'selected')]");
	if (sectionNode) {
		newItem.section = sectionNode.textContent;
	}

	// Date: the first ten characters of the section header
	var dateNode = firstNode(".//div[contains(@class, 'secthead')]");
	if (dateNode) {
		newItem.date = dateNode.textContent.replace(/^\s*|\s*$/g, '').substr(0, 10);
	}

	newItem.attachments.push({url:doc.location.href, title:doc.title, mimeType:"text/html"});
	newItem.publicationTitle = "die tageszeitung";
	newItem.complete();
}
/**
 * Translator entry point for taz.de.
 *
 * On "multiple" pages the teaser links of the overview layout (or, failing
 * that, the search-result layout) are offered to the user via
 * Zotero.selectItems(). Each label is the plain text of the link's h3
 * heading, or of the whole link when it has no h3, with whitespace collapsed.
 * The chosen URLs, or the page URL itself on a single article, are handed
 * to scrape().
 *
 * @param {Document} doc - the page being translated
 * @param {String} url - URL of that page
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Fresh XPath iterator for the given expression.
	function select(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
	}

	var articles = [];
	if (detectWeb(doc, url) == "multiple") {
		var items = {};
		var layouts = [
			".//*[@id='hauptspalte']/div/ul/li/a",     // overview pages
			".//*[@id='hauptspalte']/div/div/ul/li/a"  // search results
		];
		// Use the first layout that yields any link; null if neither does.
		var titles = null;
		for (var k = 0; k < layouts.length && !titles; k++) {
			if (select(layouts[k]).iterateNext()) {
				titles = select(layouts[k]);
			}
		}
		var next_title;
		while (titles && (next_title = titles.iterateNext())) {
			// Read the heading as text instead of cutting it out of innerHTML,
			// so markup, entities and line breaks never reach the dialog.
			var headings = next_title.getElementsByTagName('h3');
			var label = (headings.length ? headings[0] : next_title).textContent;
			items[next_title.href] = label.replace(/\s+/g, ' ').replace(/^\s|\s$/g, '');
		}
		items = Zotero.selectItems(items);
		for (var i in items) {
			articles.push(i);
		}
	} else {
		articles = [url];
	}
	Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
	Zotero.wait();
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"translatorID": "f61beec2-1431-4218-a9d3-68063ede6ecd", | |
"label": "Welt Online", | |
"creator": "Martin Meyerhoff", | |
"target": "^http://www\\.welt\\.de", | |
"minVersion": "1.0", | |
"maxVersion": "", | |
"priority": 100, | |
"inRepository": "1", | |
"translatorType": 4, | |
"lastUpdated": "2011-03-29 18:43:49" | |
} | |
/* | |
Welt Online Translator | |
Copyright (C) 2011 Martin Meyerhoff | |
This program is free software: you can redistribute it and/or modify | |
it under the terms of the GNU General Public License as published by | |
the Free Software Foundation, either version 3 of the License, or | |
(at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program. If not, see <http://www.gnu.org/licenses/>. | |
*/ | |
/* | |
"Multiple" doesn't work on the search pages, because that's another host. However, every other page does it: | |
http://www.welt.de/themen/Fukushima/ | |
http://www.welt.de/wirtschaft/ | |
http://www.welt.de/wirtschaft/article12962920/Krankenkassen-werfen-Aerzten-Gewinnstreben-vor.html | |
*/ | |
/**
 * Classifies a welt.de page.
 *
 * @param {Document} doc - the page to inspect
 * @param {String} url - URL of that page (not used)
 * @returns {String|undefined} "newspaperArticle" when an og:type meta tag is
 *     present, "multiple" when teaser links are present, otherwise undefined
 */
function detectWeb(doc, url) {
	// Namespace boilerplate from the translator template; the XPaths below
	// carry no prefix and are evaluated without a resolver.
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		return (prefix == 'x') ? namespace : null;
	} : null;

	var result;
	var articleMarker = doc.evaluate(".//meta[contains(@property, 'og:type')]", doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	if (articleMarker) {
		result = "newspaperArticle";
	} else {
		// Only look for teaser links when the page is not an article.
		var teaser = doc.evaluate(".//div[contains(@class, 'h2')]/a", doc, null, XPathResult.ANY_TYPE, null).iterateNext();
		if (teaser) {
			result = "multiple";
		}
	}
	if (result !== undefined) {
		Zotero.debug(result);
	}
	return result;
}
/**
 * Scrapes one welt.de article page into a Zotero "newspaperArticle".
 *
 * All metadata is optional: a missing meta tag or DOM node leaves the
 * corresponding field unset (the title falls back to doc.title) instead of
 * throwing and losing the whole item.
 *
 * @param {Document} doc - the article page
 * @param {String} url - URL of that page (doc.location.href is used instead)
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// First node matching the XPath, or null when the page has none.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}
	// "content" attribute of the first matching meta tag, or "".
	function metaContent(xpath) {
		var node = firstNode(xpath);
		return (node && node.content) || "";
	}
	// Whitespace-trimmed text of the first matching node, or "".
	function textOf(xpath) {
		var node = firstNode(xpath);
		return node ? node.textContent.replace(/^\s*|\s*$/g, '') : "";
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title. Welt's titles are fine without their "supertitles".
	newItem.title = metaContent(".//meta[contains(@property, 'og:title')]") || doc.title;

	// Authors
	var author = metaContent(".//meta[contains(@name, 'author')]");
	if (author == "WELT ONLINE") {
		author = "";
	}
	// "Und" must stand alone as a word; otherwise names such as "Undine" are cut apart.
	var names = author.split(/\sund\s|\su\.\s|\,\s|\&|\sUnd\s/);
	for (var i = 0; i < names.length; i++) {
		var name = names[i].replace(/^\s*|\s*$/g, '');
		if (name.match(/\s/)) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
		}
	}

	// Summary
	var summary = metaContent('//meta[contains(@name, "description")]');
	if (summary) {
		newItem.abstractNote = summary;
	}

	// Tags
	var tags = metaContent('//meta[contains(@name, "keywords")]').split(/,\s/);
	for (var t = 0; t < tags.length; t++) {
		var tag = tags[t].replace(/^\s*|\s*$/g, '');
		if (tag) {
			newItem.tags.push(tag);
		}
	}

	// Date
	var date = textOf(".//span[contains(@class, 'date')][last()]");
	if (date) {
		newItem.date = date;
	}

	// Publication (only some Welt am Sonntag articles can be told apart, by their URL)
	if (doc.location.href.match(/.*wams_print.*/)) {
		newItem.publicationTitle = "Welt am Sonntag";
	} else {
		newItem.publicationTitle = "Welt Online";
	}

	// Section
	var section = textOf(".//*[@id='mainNavi']/ul/li[contains(@class, 'menAc')]/a");
	if (section) {
		newItem.section = section;
	}

	// Attachment: the print view. Drop any fragment and respect an existing query string.
	var printurl = doc.location.href.replace(/#.*$/, '');
	printurl += (printurl.indexOf('?') == -1 ? '?' : '&') + 'print=true';
	newItem.attachments.push({url:printurl, title:doc.title, mimeType:"text/html"});

	newItem.complete();
}
/**
 * Translator entry point for welt.de.
 *
 * On "multiple" pages every teaser link is offered to the user via
 * Zotero.selectItems(); the chosen URLs, or the page URL itself on a single
 * article, are handed to scrape().
 *
 * @param {Document} doc - the page being translated
 * @param {String} url - URL of that page
 */
function doWeb(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		return (prefix == 'x') ? namespace : null;
	} : null;

	var articles = [];
	var isListing = detectWeb(doc, url) == "multiple";

	if (!isListing) {
		articles = [url];
	} else {
		// Map each teaser URL to its link text for the selection dialog.
		var choices = {};
		var links = doc.evaluate(".//div[contains(@class, 'h2')]/a", doc, nsResolver, XPathResult.ANY_TYPE, null);
		var link = links.iterateNext();
		while (link) {
			choices[link.href] = link.textContent;
			link = links.iterateNext();
		}
		var chosen = Zotero.selectItems(choices);
		for (var href in chosen) {
			articles.push(href);
		}
	}

	Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
	Zotero.wait();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment