Skip to content

Instantly share code, notes, and snippets.

@mamhoff
Created March 18, 2011 19:02
Show Gist options
  • Save mamhoff/876639 to your computer and use it in GitHub Desktop.
Save mamhoff/876639 to your computer and use it in GitHub Desktop.
site translator for taz.de, Potsdamer Neueste Nachrichten, Der Freitag, Süddeutsche Zeitung, Frankfurter Rundschau, Spiegel Online, Welt Online, Tagesspiegel, Le Monde Diplomatique (de)
{
"translatorID": "1ab8b9a4-72b5-4ef4-adc8-4956a50718f7",
"label": "Der Freitag",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.freitag\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-26 15:55:00"
}
/*
Der Freitag Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
This site is good, but very, very slow. So when importing multiple Items, be patient!
http://www.freitag.de/search?modus=articles&SearchableText=Gaddafi*
http://www.freitag.de
http://www.freitag.de/guardian-world
*/
/**
 * Classifies a freitag.de page for Zotero.
 *
 * @param {Document} doc - Page DOM; only doc.evaluate() is used.
 * @param {string} url - Page URL (unused; the decision is made from the DOM alone).
 * @returns {string|undefined} "newspaperArticle" for a single article,
 *     "multiple" for a listing page, undefined when neither XPath matches.
 */
function detectWeb(doc, url) {
	// Neither expression uses a namespace prefix, so no resolver is needed.
	var Freitag_Artikel_XPath = '//div[contains(@class, "artikel_content")]/h2';
	var Freitag_multiple_XPath = ".//h3[contains(@class, 'listing')]/a";

	// Original author's note (translated): this check keeps out the articles
	// taken over from Tagesspiegel.
	if (doc.evaluate(Freitag_Artikel_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}
	// Listing pages expose their entries as links inside h3 "listing" headings.
	if (doc.evaluate(Freitag_multiple_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Builds a newspaperArticle item from a freitag.de article page and completes it.
 *
 * Every page element is optional: a field whose node is missing is simply
 * left unset instead of aborting the whole import with a TypeError.
 *
 * @param {Document} doc - Article page DOM.
 * @param {string} url - Page URL (unused; doc.location.href is stored instead).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Returns the first node matching xpath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// Removes leading and trailing whitespace.
	function strip(text) {
		return text.replace(/^\s*|\s*$/g, '');
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// The meta line is "|"-separated; the code reads the date from field 1
	// and the authors from field 2.
	var metaNode = firstNode("//div[contains(@class, 'article-heading-meta-left')]");
	var meta = metaNode ? metaNode.textContent.split("|") : [];
	for (var m = 0; m < meta.length; m++) {
		meta[m] = strip(meta[m]);
	}
	if (meta.length > 1) {
		// Keep only the first whitespace-delimited token of the date field.
		newItem.date = meta[1].split(/\s/)[0];
	}
	if (meta.length > 2) {
		var authors = meta[2].split(/\sund\s|\su\.\s|\,\s|\//);
		for (var a = 0; a < authors.length; a++) {
			if (authors[a].match(/\s/)) { // only names that contain a space!
				newItem.creators.push(Zotero.Utilities.cleanAuthor(authors[a], "author"));
			}
		}
	}

	// Title: the first two ":" / "—"-separated parts of <title>, joined by ": ".
	// When there is no second part, use the first part alone rather than
	// producing "Headline: undefined".
	var titleNode = firstNode('//title');
	if (titleNode) {
		var titleParts = titleNode.textContent.split(/[:—]/);
		for (var t = 0; t < titleParts.length; t++) {
			titleParts[t] = strip(titleParts[t]);
		}
		if (titleParts.length > 1 && titleParts[1] !== "") {
			newItem.title = titleParts[0] + ": " + titleParts[1];
		} else {
			newItem.title = titleParts[0];
		}
	}
	newItem.publicationTitle = "Der Freitag";

	// Summary
	var summaryNode = firstNode("//div[@class='artikel_content']/h3");
	if (summaryNode) {
		newItem.abstractNote = Zotero.Utilities.trim(summaryNode.textContent);
	}

	// no Tags, because Der Freitag doesn't supply any.

	// Section
	var sectionNode = firstNode("//h1");
	if (sectionNode) {
		newItem.section = sectionNode.textContent;
	}

	// Snapshot: prefer the print version, fall back to the page itself.
	var printNode = firstNode(".//a[@id='article-drucken']");
	var printurl = printNode ? printNode.href : doc.location.href;
	newItem.attachments.push({url:printurl, title:doc.title, mimeType:"text/html"});

	newItem.complete();
}
/**
 * Zotero entry point for freitag.de: on a listing page, lets the user pick
 * from the listed links; otherwise processes the current URL. The chosen
 * pages are handed to scrape().
 *
 * @param {Document} doc - Current page DOM.
 * @param {string} url - Current page URL.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function(prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var targets = [];
	if (detectWeb(doc, url) != "multiple") {
		targets = [url];
	} else {
		// Map of link URL -> link text, offered to the user for selection.
		var candidates = {};
		var links = doc.evaluate(".//h3[contains(@class, 'listing')]/a", doc, resolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			candidates[link.href] = link.textContent;
		}
		var chosen = Zotero.selectItems(candidates);
		for (var href in chosen) {
			targets.push(href);
		}
	}

	Zotero.Utilities.processDocuments(targets, scrape, function() {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "488fe1e0-b7d2-406f-8257-5060418ce9b2",
"label": "fr-online.de",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.fr-online\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-26 15:45:54"
}
/*
fr-online.de Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
Works w/ search and overviews. I had to include the ugly hack stopping non-articles (photo-streams) to make the multiple item import return an error. Test on:
http://www.fr-online.de/politik/spezials/wikileaks---die-enthuellungsplattform/-/4882932/4882932/-/index.html
http://www.fr-online.de/page/search/fr-online/home/suche/-/1473784/1473784/-/view/asSearch/-/index.html?contextsIds=1472660&docTypes=%22MauArticle,MauGallery,DMBrightcoveVideo,CMDownload,DMMovie,DMEvent,DMVenue%22&offset=5&pageNumber=2&searchMode=SIMPLEALL&sortBy=maupublicationdate&userQuery=Wikileaks
http://www.fr-online.de/wirtschaft/krise/-/1471908/1471908/-/index.html
http://www.fr-online.de/wirtschaft/krise/portugal-koennte-rettungspaket-benoetigen/-/1471908/8251842/-/index.html
*/
/**
 * Classifies an fr-online.de page for Zotero.
 *
 * @param {Document} doc - Page DOM; doc.evaluate() and doc.location.href are used.
 * @param {string} url - Page URL (unused; doc.location.href is consulted instead).
 * @returns {string|undefined} "newspaperArticle", "multiple", or undefined.
 */
function detectWeb(doc, url) {
	// Neither expression uses a namespace prefix, so no resolver is needed.
	var FR_article_XPath = ".//div[contains(@class, 'ArticleToolBoxIcons')]";
	var FR_multiple_XPath = ".//*[@id='ContainerContent']/div/div[contains(@class, 'Headline2')]/a";

	if (doc.evaluate(FR_article_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}

	// Search result pages are recognised by URL, overview pages by their
	// headline links. The URL test runs first, exactly as before.
	var isSearchPage = doc.location.href.match(/^http\:\/\/www\.fr-online\.de\/.*?page\/search/);
	if (isSearchPage || doc.evaluate(FR_multiple_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Turns an all-uppercase author string into normally cased words.
 *
 * Each whitespace-separated word, and each hyphen-separated part of a word,
 * gets an upper-case first letter and lower-case remainder. Runs of
 * whitespace and empty input are tolerated.
 *
 * @param {string} author - Author name(s), e.g. "HANS-PETER MÜLLER".
 * @returns {string} The re-cased words, each followed by one space (so a
 *     non-empty result keeps the trailing space earlier versions produced).
 */
function authorCase(author) {
	var words = author.split(/\s+/);
	var authorFixed = '';
	for (var w = 0; w < words.length; w++) {
		// Leading/trailing whitespace leaves empty strings in the split result.
		if (words[w] === '') {
			continue;
		}
		// Capitalise both halves of double names such as "Hans-Peter".
		var parts = words[w].split('-');
		for (var p = 0; p < parts.length; p++) {
			parts[p] = parts[p].charAt(0).toUpperCase() + parts[p].slice(1).toLowerCase();
		}
		authorFixed = authorFixed + parts.join('-') + ' ';
	}
	return authorFixed;
}
/**
 * Builds a newspaperArticle item from an fr-online.de article page.
 * Pages without the article toolbox (e.g. galleries) are ignored.
 *
 * @param {Document} doc - Article page DOM.
 * @param {string} url - Page URL (unused; doc.location.href is stored instead).
 */
function scrape(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = ns ? function(prefix) {
		return prefix == 'x' ? ns : null;
	} : null;

	// First node matching the expression, or null.
	function pick(xpath, res) {
		return doc.evaluate(xpath, doc, res, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// Only real articles carry the toolbox icons; this protects against galleries.
	if (!pick(".//div[contains(@class, 'ArticleToolBoxIcons')]", null)) {
		return;
	}

	var item = new Zotero.Item("newspaperArticle");
	item.url = doc.location.href;

	// Title: everything in <title> before the first "|".
	var pageTitle = pick('//title', resolver).textContent;
	item.title = pageTitle.split("|")[0].replace(/^\s*|\s*$/g, '');

	// Authors come from the author meta tag, separated by ", " or " und ".
	var names = pick('//meta[@name="author"]', resolver).content.split(/\,\s|\sund\s/g);
	if (names[0].match(/Rundschau/)) { // Frankfurter Rundschau is no author.
		names[0] = "";
	}
	for (var n = 0; n < names.length; n++) {
		if (!names[n].match(/\s/)) { // only names that contain a space!
			continue;
		}
		names[n] = authorCase(Zotero.Utilities.trim(names[n]));
		item.creators.push(Zotero.Utilities.cleanAuthor(names[n], "author"));
	}

	// Summary (optional).
	var descriptionNode = pick('//meta[@name="description"]', resolver);
	if (descriptionNode) {
		item.abstractNote = Zotero.Utilities.trim(descriptionNode.content);
	}

	// Date: strip the "Datum: " label and all whitespace, then reverse the
	// three "|"-separated fields.
	var dateFields = pick(".//div[contains(@class, 'TB_Date')]", resolver).textContent
		.replace(/^\s*Datum\:\s|\s/g, '')
		.split("|");
	item.date = "" + dateFields[2] + "-" + dateFields[1] + "-" + dateFields[0];

	// No Tags. FR does not provide consistently meaningful ones.

	// Publication
	item.publicationTitle = "fr-online.de";

	// Section: second "|"/"-"-separated chunk of <title>.
	var titleChunks = pick('//title', resolver).textContent.split(/\||-/);
	item.section = titleChunks[1].replace(/^\s*|\s*$/g, '');

	// Attachment: rewrite the page URL into its print-version form.
	var snapshotUrl = doc.location.href;
	snapshotUrl = snapshotUrl.match("asFirstTeaser")
		? snapshotUrl.replace("asFirstTeaser", "printVersion")
		: snapshotUrl.replace(/\-\/index.html$/, "-/view/printVersion/-/index.html");
	item.attachments.push({url:snapshotUrl, title:doc.title, mimeType:"text/html"});

	item.complete();
}
/**
 * Zotero entry point for fr-online.de: on search/overview pages, offers the
 * linked articles for selection; otherwise processes the current URL.
 *
 * @param {Document} doc - Current page DOM.
 * @param {string} url - Current page URL.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function(prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var targets = [];
	if (detectWeb(doc, url) != "multiple") {
		targets = [url];
	} else {
		// Map of link URL -> link text, offered to the user for selection.
		var candidates = {};
		var links = doc.evaluate(".//*[@id='ContainerContentLinie']/div/h2/a|.//*[@id='ContainerContent']/div/div[contains(@class, 'Headline2')]/a|.//*[@id='ContainerContent']/div/div/div[contains(@class, 'link_article')]/a|.//*[@id='Main']/div[contains(@class, '2ColHP')]/div/div/div[contains(@class, 'Headline2')]/a", doc, resolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			// This excludes the videos, whose links terminate in a hash.
			if (link.href.match(/.*html$/)) {
				candidates[link.href] = link.textContent;
			}
		}
		var chosen = Zotero.selectItems(candidates);
		for (var href in chosen) {
			targets.push(href);
		}
	}

	Zotero.Utilities.processDocuments(targets, scrape, function() {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "530cf18c-e80a-4e67-ae9c-9b8c08591610",
"label": "Le monde diplomatique",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.monde-diplomatique\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-26 16:50:57"
}
/*
Le Monde Diplomatique (de) Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
Works really well. Try here:
http://www.monde-diplomatique.de/pm/2011/02/11/a0054.text.name,askexfz1c.n,0
http://www.monde-diplomatique.de/pm/.search?tx=Globalisierung
*/
/**
 * Classifies a monde-diplomatique.de page for Zotero from its URL alone.
 *
 * @param {Document} doc - Page DOM (unused; no DOM lookup is needed here).
 * @param {string} url - Page URL.
 * @returns {string|undefined} "newspaperArticle" for /pm/YYYY/MM... URLs,
 *     "multiple" for URLs containing "search", undefined otherwise.
 */
function detectWeb(doc, url) {
	if (url.match(/^http:\/\/www\.monde-diplomatique\.de\/pm\/\d\d\d\d\/\d\d/)) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}
	if (url.match(/search/)) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Builds a newspaperArticle item from a monde-diplomatique.de article page.
 * Pages without an article headline are ignored.
 *
 * @param {Document} doc - Article page DOM.
 * @param {string} url - Page URL (unused; doc.location.href is stored instead).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Returns the first node matching xpath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	var titleNode = firstNode(".//*[@id='haupt']/div/h3");
	if (!titleNode) {
		return;
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title
	newItem.title = Zotero.Utilities.trim(titleNode.textContent);

	// Authors: strip the leading "von " and surrounding whitespace, then
	// split co-authors on " | ". Articles without a byline get no creator
	// at all (previously an empty name was pushed).
	var authorNode = firstNode(".//*[@id='haupt']/div/h4");
	var byline = authorNode ? authorNode.textContent.replace(/^\s*von\s|\s*$/g, '') : "";
	var names = byline.split(" | ");
	for (var n = 0; n < names.length; n++) {
		if (names[n].replace(/\s/g, '') === "") {
			continue;
		}
		newItem.creators.push(Zotero.Utilities.cleanAuthor(names[n], "author"));
	}

	// No Tags

	// Date: the code takes whatever follows " vom " in the h2 heading.
	var dateNode = firstNode(".//*[@id='haupt']/h2");
	if (dateNode) {
		var dateParts = dateNode.textContent.split(" vom ");
		if (dateParts.length > 1) {
			newItem.date = dateParts[1];
		}
	}

	// Summary (optional)
	var summaryNode = firstNode(".//*[@id='haupt']/div/h5");
	if (summaryNode) {
		newItem.abstractNote = Zotero.Utilities.trim(summaryNode.textContent);
	}

	newItem.publicationTitle = "Le Monde Diplomatique";
	newItem.attachments.push({url:doc.location.href, title:doc.title, mimeType:"text/html"});
	newItem.complete();
}
/**
 * Zotero entry point for monde-diplomatique.de: on search pages, offers the
 * linked articles for selection; otherwise processes the current URL.
 *
 * @param {Document} doc - Current page DOM.
 * @param {string} url - Current page URL.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function(prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var targets = [];
	if (detectWeb(doc, url) != "multiple") {
		targets = [url];
	} else {
		// Map of link URL -> link text, offered to the user for selection.
		var candidates = {};
		var links = doc.evaluate("//*[@id='haupt']/div/p/a", doc, resolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			// Keep only links whose URL has the /pm/YYYY/MM article shape.
			if (link.href.match(/^http:\/\/www\.monde-diplomatique\.de\/pm\/\d\d\d\d\/\d\d/)) {
				candidates[link.href] = link.textContent;
			}
		}
		var chosen = Zotero.selectItems(candidates);
		for (var href in chosen) {
			targets.push(href);
		}
	}

	Zotero.Utilities.processDocuments(targets, scrape, function() {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "9405db4b-be7f-42ab-86ca-430226be9b35",
"label": "Potsdamer Neueste Nachrichten",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.pnn\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-26 13:42:35"
}
/*
Potsdamer Neueste Nachrichten Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
The articles themselves are quite badly tagged, so that the translator sometimes doesn't capture the summary or the authors.
Test it with:
http://www.pnn.de/archiv/?type=archiv&phrase=Krise
http://www.pnn.de/zeitung/
http://www.pnn.de/zeitung/12.01.2011/
http://www.pnn.de/titelseite/364860/
*/
/**
 * Classifies a pnn.de page for Zotero.
 *
 * @param {Document} doc - Page DOM; only doc.evaluate() is used.
 * @param {string} url - Page URL (unused; the decision is made from the DOM alone).
 * @returns {string|undefined} "newspaperArticle", "multiple", or undefined.
 */
function detectWeb(doc, url) {
	// Neither expression uses a namespace prefix, so no resolver is needed.
	var PNN_Article_XPath = ".//a[contains(@class, 'print')]"; // only articles have a print button.
	var PNN_Multiple_XPath = ".//ul/li/h2/a";

	if (doc.evaluate(PNN_Article_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}
	if (doc.evaluate(PNN_Multiple_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		Zotero.debug("multiple");
		return "multiple";
	}
}
/**
 * Builds a newspaperArticle item from a pnn.de article page and completes it.
 *
 * The pages are inconsistently tagged, so every element is treated as
 * optional: a missing node leaves its field unset instead of aborting.
 *
 * @param {Document} doc - Article page DOM.
 * @param {string} url - Page URL (unused; doc.location.href is stored instead).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Returns the first node matching xpath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// Removes leading and trailing whitespace.
	function strip(text) {
		return text.replace(/^\s*|\s*$/g, '');
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title: the part of <title> before the em dash, with German quotation
	// marks normalised and surrounding whitespace removed. (The previous
	// trim pattern began with an empty alternative and lacked the g flag,
	// so it never removed anything.)
	var titleNode = firstNode('//title');
	if (titleNode) {
		var title = titleNode.textContent.split("—")[0];
		title = title.replace(/[„“]/g, '"');
		newItem.title = strip(title);
	}

	// Summary, without the parenthesised date.
	var summaryNode = firstNode(".//p[contains(@class, 'teaser')]");
	if (summaryNode) {
		newItem.abstractNote = strip(summaryNode.textContent.replace(/\(.*\)/, ''));
	}

	// Date: remove parentheses and surrounding whitespace.
	var dateNode = firstNode("//*[contains(@class, 'teaser')]/span[contains(@class, 'date')]");
	if (dateNode) {
		newItem.date = dateNode.textContent.replace(/\(|\)|^\s*|\s*$/g, '');
	}

	// Authors. Easy case: the byline is set in italics inside the teaser.
	var byline = "";
	var italicNode = firstNode(".//*[@id='teaser']/p/i");
	if (italicNode) {
		byline = italicNode.textContent;
	} else {
		// Otherwise take the whole teaser text, split it wherever two or more
		// whitespace characters occur, and use the first chunk that matches
		// one of the byline patterns.
		var teaserNode = firstNode(".//*[@id='teaser']");
		var chunks = teaserNode ? teaserNode.textContent.replace(/\s\s\s*/g, "|").split("|") : [];
		var bylinePatterns = [
			/^Von\s(.*)/, // whitespace required so words like "Vonovia" do not match
			/^Das\sGespräch\sführte(.*)\.$/,
			/^Interview\:\s(.*)Foto:.*/
		];
		for (var c = 0; c < chunks.length && byline == ""; c++) {
			for (var p = 0; p < bylinePatterns.length; p++) {
				var hit = chunks[c].match(bylinePatterns[p]);
				if (hit) {
					byline = hit[1];
				}
			}
		}
	}
	var names = strip(byline).split(/\sund\s|\su\.\s|\,\s/);
	for (var n = 0; n < names.length; n++) {
		if (names[n].match(/\s/)) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(names[n], "author"));
		}
	}

	newItem.attachments.push({url:doc.location.href, title:doc.title, mimeType:"text/html"});
	newItem.publicationTitle = "Potsdamer Neueste Nachrichten";

	// Section: the active entry of the left sidebar.
	var sectionNode = firstNode(".//*[@id='sidebar-left']/ul/li[contains(@class, 'active')]");
	if (sectionNode) {
		newItem.section = strip(sectionNode.textContent);
	}

	newItem.complete();
}
/**
 * Zotero entry point for pnn.de: on listing pages, offers the linked
 * articles for selection; otherwise processes the current URL.
 *
 * @param {Document} doc - Current page DOM.
 * @param {string} url - Current page URL.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function(prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var targets = [];
	if (detectWeb(doc, url) != "multiple") {
		targets = [url];
	} else {
		// Map of link URL -> link text, offered to the user for selection.
		var candidates = {};
		var links = doc.evaluate(".//ul/li/h2/a", doc, resolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			candidates[link.href] = link.textContent;
		}
		var chosen = Zotero.selectItems(candidates);
		for (var href in chosen) {
			targets.push(href);
		}
	}

	Zotero.Utilities.processDocuments(targets, scrape, function() {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "eef50507-c756-4081-86fd-700ae4ebf22e",
"label": "Spiegel Online",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.spiegel\\.de/",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-04-01 11:56:06"
}
/*
Spiegel Online Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
Test with the following URLs:
http://www.spiegel.de/suche/index.html?suchbegriff=AKW
http://www.spiegel.de/international/search/index.html?suchbegriff=Crisis
http://www.spiegel.de/international/topic/german_french_relations/
http://www.spiegel.de/international/europe/0,1518,700530,00.html
*/
/**
 * Classifies a spiegel.de page for Zotero.
 *
 * @param {Document} doc - Page DOM; doc.evaluate() and doc.location.href are used.
 * @param {string} url - Page URL (unused; doc.location.href is consulted instead).
 * @returns {string|undefined} "newspaperArticle" when the article function
 *     bar is present, "multiple" for topic and search pages, else undefined.
 */
function detectWeb(doc, url) {
	// The expression uses no namespace prefix, so no resolver is needed.
	var spiegel_article_XPath = ".//div[@id='spArticleFunctions']";
	if (doc.evaluate(spiegel_article_XPath, doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
		Zotero.debug("newspaperArticle");
		return "newspaperArticle";
	}

	// URL prefixes of pages that list several articles: German topic and
	// search pages plus their counterparts under /international/.
	var listingPatterns = [
		/^http\:\/\/www\.spiegel\.de\/thema/,
		/^http\:\/\/www\.spiegel\.de\/suche/,
		/^http\:\/\/www\.spiegel\.de\/international\/search/,
		/^http\:\/\/www\.spiegel\.de\/international\/topic/
	];
	for (var k = 0; k < listingPatterns.length; k++) {
		if (doc.location.href.match(listingPatterns[k])) {
			Zotero.debug("multiple");
			return "multiple";
		}
	}
}
/**
 * Builds a newspaperArticle item from a spiegel.de article page and
 * completes it. URLs under /spiegel are treated as the print archive
 * ("Der Spiegel"), everything else as "Spiegel Online".
 *
 * @param {Document} doc - Article page DOM.
 * @param {string} url - Page URL (unused; doc.location.href is used instead).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == 'x') return namespace; else return null;
	} : null;

	// Returns the first node matching xpath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	var pageUrl = doc.location.href;
	// Spiegel Online and the Spiegel archive format bylines and print links differently.
	var isArchive = !!pageUrl.match(/^http\:\/\/www\.spiegel\.de\/spiegel/);

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = pageUrl;

	// Title: the article headline, falling back to the first part of <title>.
	var headlineNode = firstNode(".//*[@id='spArticleColumn']/h2|.//*[@id='spArticleColumn ']/h2");
	if (headlineNode) {
		newItem.title = headlineNode.textContent;
	} else {
		var pageTitleNode = firstNode('//title');
		if (pageTitleNode) {
			newItem.title = pageTitleNode.textContent.split(" - ")[0];
		}
	}

	// Tags
	var keywordsNode = firstNode('//meta[contains(@name, "keywords")]');
	if (keywordsNode) {
		var tags = keywordsNode.content.split(/,/).slice(5); // The first 5 tags are generic or section info.
		if (tags[0] !== "") {
			for (var t = 0; t < tags.length; t++) {
				newItem.tags.push(tags[t].replace(/^\s*|\s*$/g, ''));
			}
		}
	}

	// Author: usually in its own paragraph, sometimes in italics in the teaser.
	var authorNode = firstNode(".//p[contains(@class, 'spAuthor')]") || firstNode(".//*[@id='spIntroTeaser']/strong/i");
	var byline = "";
	if (authorNode) {
		byline = authorNode.textContent;
		Zotero.debug(byline);
	}
	byline = byline.replace(/^\s*By\s|^\s*Von\s|\s*$/g, ''); // strip "By "/"Von " and trailing whitespace
	var names;
	if (isArchive) {
		names = byline.split(/\sund\s|\su\.\s|\;\s|\sand\s/);
		for (var n = 0; n < names.length; n++) {
			// Archive bylines are "Last, First"; flip them.
			names[n] = names[n].replace(/(.*),\s(.*)/, '$2 $1');
		}
	} else {
		// Remove a trailing ", Location" or " in Location" (single word).
		// The alternation is grouped and " in " must be a separate word, so
		// "Katrin Schmidt" stays intact and "A B, C D" keeps its separator.
		byline = byline.replace(/(,\s|\sin\s)\S*$/, "");
		names = byline.split(/\sund\s|\su\.\s|\,\s|\sand\s/);
	}
	for (var a = 0; a < names.length; a++) {
		if (names[a].match(/\s/)) { // only names that contain a space!
			newItem.creators.push(Zotero.Utilities.cleanAuthor(names[a], "author"));
		}
	}

	// Section
	var sectionNode = firstNode(".//ul[contains(@id, 'spChannel')]/li/ul/li/a[contains(@class, 'spActive')]");
	if (sectionNode) {
		newItem.section = sectionNode.textContent;
	}

	// Attachment
	if (isArchive) {
		// The first link in the function bar is attached as a PDF.
		var pdfLink = firstNode(".//div[@id='spArticleFunctions']/ul/li[1]/a");
		if (pdfLink) {
			Zotero.debug(pdfLink.href);
			newItem.attachments.push({url:pdfLink.href, title:doc.title, mimeType:"application/pdf"});
		}
	} else {
		// The print version needs "druck-" inserted before the article id.
		var printurl = pageUrl.replace(/(\d+\,\d+\.html.*$)/, 'druck-$1');
		newItem.attachments.push({url:printurl, title:doc.title, mimeType:"text/html"});
	}

	// Summary
	var summaryNode = firstNode(".//p[@id='spIntroTeaser']");
	if (summaryNode) {
		newItem.abstractNote = Zotero.Utilities.trim(summaryNode.textContent);
	}

	// Date: prefer the visible short date, fall back to the date meta tag.
	// When neither exists the field stays unset.
	var date;
	var shortDateNode = firstNode(".//h5[contains(@id, 'ShortDate')]");
	if (shortDateNode) {
		date = shortDateNode.textContent;
		if (date.match('/')) {
			date = date.replace(/(\d\d)\/(\d\d)\/(\d\d\d\d)/, "$2.$1.$3");
		}
	} else {
		var metaDateNode = firstNode("//meta[@name='date']");
		if (metaDateNode) {
			date = metaDateNode.content.replace(/(\d\d\d\d)-(\d\d)-(\d\d)/, '$3.$2.$1');
		}
	}
	if (date !== undefined) {
		newItem.date = Zotero.Utilities.trim(date);
	}

	newItem.publicationTitle = isArchive ? "Der Spiegel" : "Spiegel Online";
	newItem.complete();
}
/**
 * Zotero entry point for spiegel.de: on topic/search pages, offers the
 * linked spiegel.de articles for selection; otherwise processes the
 * current URL.
 *
 * @param {Document} doc - Current page DOM.
 * @param {string} url - Current page URL.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function(prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var targets = [];
	if (detectWeb(doc, url) != "multiple") {
		targets = [url];
	} else {
		// Each listing page type keeps its article links in a different place.
		var layouts = [
			[/^http\:\/\/www\.spiegel\.de\/thema/, ".//*[@id='spTeaserColumn']/div/h3/a"],
			[/^http\:\/\/www\.spiegel\.de\/suche/, ".//*[@id='spTeaserColumn']/div/a"],
			[/^http\:\/\/www\.spiegel\.de\/international\/search/, "//*[@id='spTeaserColumn']/div/a"],
			[/^http\:\/\/www\.spiegel\.de\/international\/topic/, ".//*[@id='spTeaserColumn']/div/h3/a"]
		];
		var links;
		for (var k = 0; k < layouts.length; k++) {
			if (doc.location.href.match(layouts[k][0])) {
				links = doc.evaluate(layouts[k][1], doc, resolver, XPathResult.ANY_TYPE, null);
				break;
			}
		}

		// Map of link URL -> link text, offered to the user for selection.
		var candidates = {};
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			// The search also covers manager-magazin.de, which won't work,
			// and "mehr..." links are not articles.
			if (link.textContent != "mehr..." && link.href.match(/^http:\/\/www\.spiegel\.de\//)) {
				candidates[link.href] = Zotero.Utilities.trim(link.textContent);
			}
		}
		var chosen = Zotero.selectItems(candidates);
		for (var href in chosen) {
			targets.push(href);
		}
	}

	Zotero.Utilities.processDocuments(targets, scrape, function() {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "2e4ebd19-83ab-4a56-8fa6-bcd52b576470",
"label": "Sueddeutsche.de",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.sueddeutsche\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-26 15:02:54"
}
/*
Sueddeutsche.de Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
This one has the search function on a different host, so I cannot scan the search results. A multiple option, though, is given for the page itself.
Test here:
http://www.sueddeutsche.de/politik
http://www.sueddeutsche.de/thema/Krieg_in_Libyen
http://www.sueddeutsche.de/muenchen
*/
/**
 * Decides what kind of sueddeutsche.de page is loaded.
 *
 * @param {Document} doc - The page DOM.
 * @param {String} url - The page URL (not consulted here).
 * @returns {String|undefined} "newspaperArticle" when an h1 with the id
 *   "articleTitle" exists, "multiple" when the main column holds a list of
 *   links, otherwise undefined.
 */
function detectWeb(doc, url) {
	// A namespace resolver is prepared here, but the look-ups below are
	// evaluated with a null resolver.
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	// True when the XPath selects at least one node.
	var matches = function (xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	};

	var articleXPath = ".//h1[@id='articleTitle']";
	var listXPath = ".//*[contains(@class, 'maincolumn')]/ol/li/a|.//*[contains(@class, 'maincolumn')]/ol/li/ul/li/a";

	var kind;
	if (matches(articleXPath)) {
		kind = "newspaperArticle";
	} else if (matches(listXPath)) {
		kind = "multiple";
	} else {
		return;
	}
	Zotero.debug(kind);
	return kind;
}
/**
 * Turns one sueddeutsche.de article page into a Zotero "newspaperArticle" item.
 *
 * Pages without an h1 with the id "articleTitle" (image galleries link like
 * articles but are not) are silently skipped. The headline is the only required
 * element: every other field is filled in only when its source node exists, so
 * a missing byline, date, keyword list or topic list no longer aborts the import.
 *
 * @param {Document} doc - DOM of the article page.
 * @param {String} url - URL of the page (the item uses doc.location.href).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		return prefix == 'x' ? namespace : null;
	} : null;

	// First node selected by an XPath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// Removes leading and trailing whitespace.
	function strip(text) {
		return text.replace(/^\s+|\s+$/g, '');
	}

	var headline = firstNode(".//h1[@id='articleTitle']");
	if (!headline) {
		return;
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title: prefer the og:title meta tag, fall back to the visible headline.
	var titleMeta = firstNode('//meta[contains(@property, "og:title")]');
	var title = titleMeta ? titleMeta.content : headline.textContent;
	newItem.title = Zotero.Utilities.trim(title.replace(/\s?–\s?/, ": "));

	// Author. The site uses this field freely; sometimes there is none, and
	// sometimes it holds a full sentence (ending in a period) instead of names.
	var authorNode = firstNode('.//span[contains(@class, "hcard fn")]');
	var byline = authorNode ? authorNode.textContent.replace(/^Von\s/, '') : "";
	if (/\.$/.test(byline)) {
		byline = "";
	}
	// Several authors are separated by ", ", " und " or " u. ".
	var names = byline.split(/\sund\s|\su\.\s|\,\s/);
	Zotero.debug(names);
	for (var i = 0; i < names.length; i++) {
		if (/\s/.test(names[i])) { // only names that contain a space
			newItem.creators.push(Zotero.Utilities.cleanAuthor(names[i], "author"));
		}
	}

	// Summary
	var summaryMeta = firstNode('//meta[contains(@property, "og:description")]');
	if (summaryMeta) {
		newItem.abstractNote = summaryMeta.content;
	}

	// Date: first whitespace-separated token of the "updated" value.
	// Stripping first keeps leading whitespace from yielding an empty date.
	var dateNode = firstNode(".//*[@class='updated']/*[@class='value']");
	if (dateNode) {
		newItem.date = strip(dateNode.textContent).split(/\s/)[0];
	}

	// Section: the first entry of the keywords meta tag.
	var keywordsMeta = firstNode("//meta[contains(@name, 'keywords')]");
	if (keywordsMeta) {
		newItem.section = keywordsMeta.content.split(",")[0];
	}

	// Tags: one per non-empty line of the topic list.
	var topicList = firstNode(".//ul[@class='themen']");
	if (topicList) {
		var topics = topicList.textContent.split(/\n/);
		for (var j = 0; j < topics.length; j++) {
			var topic = strip(topics[j]);
			if (topic) {
				newItem.tags.push(topic);
			}
		}
	}

	// Publication
	newItem.publicationTitle = "sueddeutsche.de";

	// Attachment: the print view inserts "2.220/" before the last path segment.
	// A replacer function avoids the ambiguous "$12" replacement pattern.
	var printurl = doc.location.href.replace(/(.*\/)(.*$)/, function (whole, directory, leaf) {
		return directory + "2.220/" + leaf;
	});
	Zotero.debug(printurl);
	newItem.attachments.push({url: printurl, title: doc.title, mimeType: "text/html"});
	newItem.complete();
}
/**
 * Entry point of the Sueddeutsche.de translator.
 *
 * On an overview page ("multiple") the links in the main column that point to
 * www.sueddeutsche.de are offered through Zotero.selectItems and the chosen
 * ones are scraped; on any other page the page itself is scraped.
 *
 * @param {Document} doc - DOM of the current page.
 * @param {String} url - URL of the current page.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var queue;
	if (detectWeb(doc, url) != "multiple") {
		queue = [url];
	} else {
		queue = [];
		var choices = {};
		var links = doc.evaluate(".//*[contains(@class, 'maincolumn')]/ol/li/a|.//*[contains(@class, 'maincolumn')]/ol/li/ul/li/a", doc, resolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			if (!link.href.match(/^http\:\/\/www\.sueddeutsche\.de/)) {
				continue;
			}
			// Tidy the label: drop the first line break, turn dashes into
			// ": " and collapse runs of whitespace.
			choices[link.href] = Zotero.Utilities.trim(link.textContent)
				.replace(/\n/, '')
				.replace(/\s–|—/g, ': ')
				.replace(/\s+/g, ' ');
		}
		choices = Zotero.selectItems(choices);
		Zotero.debug(choices);
		for (var href in choices) {
			queue.push(href);
		}
	}

	Zotero.Utilities.processDocuments(queue, scrape, function () {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "374ac2a5-dd45-461e-bf1f-bf90c2eb7085",
"label": "Der Tagesspiegel",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.tagesspiegel\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-30 22:04:46"
}
/*
Tagesspiegel Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/**
 * Decides what kind of tagesspiegel.de page is loaded.
 *
 * @param {Document} doc - The page DOM.
 * @param {String} url - The page URL (doc.location.href is what gets checked).
 * @returns {String|undefined} "newspaperArticle" when a div with the class
 *   "hcf-article" exists; "multiple" for search-result URLs and for pages
 *   carrying teaser links; otherwise undefined.
 */
function detectWeb(doc, url) {
	// A namespace resolver is prepared here, but the look-ups below are
	// evaluated with a null resolver.
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	// True when the XPath selects at least one node.
	var matches = function (xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	};

	var articleXPath = ".//div[@class='hcf-article']";
	var teaserXPath = "//*[@id='hcf-wrapper']/div[2]/div[contains(@class, 'hcf-main-col')]/div/ul/li/h2/a|//*[@id='hcf-wrapper']/div[@class='hcf-lower-hp']/div/ul/li/ul/li/a|//ul/li[contains(@class, 'hcf-teaser')]/h2/a";

	var kind = null;
	if (matches(articleXPath)) {
		kind = "newspaperArticle";
	} else if (doc.location.href.match(/http\:\/\/www\.tagesspiegel\.de\/suchergebnis\//) || matches(teaserXPath)) {
		// The teaser XPath is only tried when the URL is not a search page.
		kind = "multiple";
	}
	if (kind === null) {
		return;
	}
	Zotero.debug(kind);
	return kind;
}
/**
 * Turns one tagesspiegel.de article page into a Zotero "newspaperArticle" item.
 *
 * Every field is filled in only when its source node exists, so a missing
 * date, teaser, byline or keywords tag no longer aborts the import. When the
 * article headline is missing, the document title is used instead.
 *
 * @param {Document} doc - DOM of the article page.
 * @param {String} url - URL of the page (the item uses doc.location.href).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		return prefix == 'x' ? namespace : null;
	} : null;

	// First node selected by an XPath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title
	var titleNode = firstNode("//div[@class='hcf-article']/h1");
	newItem.title = titleNode ? titleNode.textContent : doc.title;

	// Date: only the first ten characters of the date line are kept.
	var dateNode = firstNode("//span[contains(@class, 'hcf-date')]");
	if (dateNode) {
		newItem.date = dateNode.textContent.replace(/(.{10,10}).*/, '$1');
	}

	// Summary
	var summaryNode = firstNode(".//p[@class='hcf-teaser']");
	if (summaryNode) {
		newItem.abstractNote = Zotero.Utilities.trim(summaryNode.textContent);
	}

	// Publication title
	newItem.publicationTitle = "Der Tagesspiegel Online";

	// Authors: strip the "Von " / "Kommentar von " lead-in, then split on ", ".
	var authorNode = firstNode("//span[contains(@class, 'hcf-author')]");
	if (authorNode) {
		var byline = authorNode.textContent;
		Zotero.debug(byline);
		var names = byline.replace(/^Von\s|Kommentar\svon\s/g, '').split(/,\s/);
		for (var i = 0; i < names.length; i++) {
			if (names[i]) { // an empty byline must not create an empty creator
				newItem.creators.push(Zotero.Utilities.cleanAuthor(names[i], "author"));
			}
		}
	}

	// Print URL: add "v_print," before the article ID and "?p=" at the end.
	var printurl = doc.location.href.replace(/^(.*\/)(\d+\.html$)/, '$1v_print,$2?p=');
	newItem.attachments.push({url: printurl, title: doc.title, mimeType: "text/html"});

	// Tags: comma-separated keywords, whitespace removed, empty entries skipped.
	var keywordsMeta = firstNode("//meta[@name='keywords']");
	if (keywordsMeta) {
		var keywords = keywordsMeta.content.split(",");
		for (var j = 0; j < keywords.length; j++) {
			var keyword = keywords[j].replace(/^\s*|\s*$/g, '');
			if (keyword) {
				newItem.tags.push(keyword);
			}
		}
	}

	newItem.complete();
}
/**
 * Entry point of the Tagesspiegel translator.
 *
 * On a "multiple" page the teaser links are offered through
 * Zotero.selectItems (links into /mediacenter, i.e. image galleries and
 * videos, are left out) and the chosen ones are scraped; on any other page
 * the page itself is scraped.
 *
 * @param {Document} doc - DOM of the current page.
 * @param {String} url - URL of the current page.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var queue;
	if (detectWeb(doc, url) != "multiple") {
		queue = [url];
	} else {
		queue = [];
		var choices = {};
		var links = doc.evaluate("//*[@id='hcf-wrapper']/div[2]/div[contains(@class, 'hcf-main-col')]/div/ul/li/h2/a|//*[@id='hcf-wrapper']/div[@class='hcf-lower-hp']/div/ul/li/ul/li/a|//ul/li[contains(@class, 'hcf-teaser')]/h2/a", doc, resolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			// The negative look-ahead excludes the media center.
			if (!link.href.match(/http\:\/\/www\.tagesspiegel\.de\/(?!mediacenter)/)) {
				continue;
			}
			choices[link.href] = link.textContent;
		}
		choices = Zotero.selectItems(choices);
		for (var href in choices) {
			queue.push(href);
		}
	}

	Zotero.Utilities.processDocuments(queue, scrape, function () {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "d84574f1-e4d6-4337-934f-bf9d01173bf0",
"label": "taz.de",
"creator": "Martin Meyerhoff",
"target": "https?://www\\.taz\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-26 17:18:22"
}
/*
taz.de Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
This site is rather heterogeneous in where it puts the author and other metadata.
Whenever the script doesn't find something it just returns an empty field.
Try on:
http://www.taz.de/
http://www.taz.de/1/archiv/detailsuche/?tx_hptazsearch_pi1[search_term]=Krise&tx_hptazsearch_pi2[submit_button].x=0&tx_hptazsearch_pi2[submit_button].y=0
http://www.taz.de/1/debatte/kolumnen/artikel/1/haengt-sie-hoeher-1/
*/
/**
 * Decides what kind of taz.de page is loaded.
 *
 * @param {Document} doc - The page DOM.
 * @param {String} url - The page URL (not consulted here).
 * @returns {String|undefined} "newspaperArticle" when the page contains an
 *   h1; "multiple" when the main column ("hauptspalte") holds linked h3
 *   headlines, either directly in a list or one div deeper as on search
 *   pages; otherwise undefined.
 */
function detectWeb(doc, url) {
	// A namespace resolver is prepared here, but the look-ups below are
	// evaluated with a null resolver.
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	// True when the XPath selects at least one node.
	var matches = function (xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	};

	var articleXPath = ".//h1";
	var listXPath = ".//*[@id='hauptspalte']/div/ul/li/a/h3";
	var searchXPath = ".//*[@id='hauptspalte']/div/div/ul/li/a/h3";

	var kind = null;
	if (matches(articleXPath)) {
		kind = "newspaperArticle";
	} else if (matches(listXPath) || matches(searchXPath)) {
		kind = "multiple";
	}
	if (kind === null) {
		return;
	}
	Zotero.debug(kind);
	return kind;
}
/**
 * Converts an author name (typically all upper case on this site) to normal
 * casing: the first letter of every word is upper-cased, the rest lower-cased.
 *
 * Words are runs of characters delimited by whitespace or hyphens. The
 * delimiters themselves are kept, so "HANS-PETER MÜLLER" becomes
 * "Hans-Peter Müller" (the hyphen is no longer replaced by a blank), no
 * trailing blank is appended, and empty segments such as a double blank no
 * longer raise a TypeError.
 *
 * @param {String} author - The name to re-case.
 * @returns {String} The re-cased name.
 */
function authorCase(author) {
	return author.replace(/[^\s-]+/g, function (word) {
		return word.charAt(0).toUpperCase() + word.substr(1).toLowerCase();
	});
}
/**
 * Turns one taz.de article page into a Zotero "newspaperArticle" item.
 *
 * The site is inconsistent about where it puts metadata, so every look-up is
 * optional: a field whose source node is missing is simply left empty instead
 * of aborting the import.
 *
 * The author is taken from the byline span when there is one; otherwise from
 * a description that starts with "KOMMENTAR VON " or "KOLUMNE VON ".
 *
 * @param {Document} doc - DOM of the article page.
 * @param {String} url - URL of the page (the item uses doc.location.href).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		return prefix == 'x' ? namespace : null;
	} : null;

	// First node selected by an XPath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title: the part of the page title before the first " - ".
	var titleNode = firstNode('//title');
	var title = titleNode ? titleNode.textContent : doc.title;
	newItem.title = title.split(" - ")[0];

	// Summary: the description without its " VON ..." tail and without the
	// KOMMENTAR / KOLUMNE label. (Declared locally; it used to leak as a global.)
	var descriptionNode = firstNode('//meta[contains(@name, "description")]');
	var description = descriptionNode ? descriptionNode.content : "";
	var summary = description.replace(/\sVON.*$/g, '');
	newItem.abstractNote = summary.replace(/KOMMENTAR|KOLUMNE.*$/g, '');

	// Authors. The label alternatives are grouped so that only descriptions
	// that really start with "<label> VON " are read as a byline.
	var labelledByline = /^(?:KOMMENTAR|KOLUMNE)\sVON\s/;
	var authorNode = firstNode("//*[contains(@class, 'sectbody')]/*/span[contains(@class, 'author')]");
	var author = "";
	if (authorNode) {
		author = authorNode.textContent;
	} else if (labelledByline.test(description)) {
		Zotero.debug(description);
		author = description.replace(labelledByline, '');
	}
	author = author.replace(/^\s*|\s*$/g, '');
	author = author.replace(".", ". "); // in case a space is missing
	author = author.replace("VON ", '');
	author = author.replace(/\s+/g, ' ');
	var names = author.split(/\sund\s|\su\.\s|\,\s|\&/);
	for (var i = 0; i < names.length; i++) {
		if (/\s/.test(names[i])) { // only names that contain a space
			var name = authorCase(names[i].replace(/^\s*|\s*$/g, ''));
			newItem.creators.push(Zotero.Utilities.cleanAuthor(name, "author"));
		}
	}

	// Section
	var sectionNode = firstNode(".//*[contains(@class, 'selected')]/ul/li[contains(@class, 'selected')]");
	if (sectionNode) {
		newItem.section = sectionNode.textContent;
	}

	// Date: the first ten characters of the trimmed section header.
	var dateNode = firstNode(".//div[contains(@class, 'secthead')]");
	if (dateNode) {
		newItem.date = dateNode.textContent.replace(/^\s*|\s*$/g, '').substr(0, 10);
	}

	newItem.attachments.push({url: doc.location.href, title: doc.title, mimeType: "text/html"});
	newItem.publicationTitle = "die tageszeitung";
	newItem.complete();
}
/**
 * Entry point of the taz.de translator.
 *
 * On a "multiple" page the links of the main column are collected (the plain
 * list first, the one-div-deeper search-result list as fallback), labelled
 * with the text of their h3 headline, offered through Zotero.selectItems, and
 * the chosen ones are scraped. On any other page the page itself is scraped.
 *
 * @param {Document} doc - DOM of the current page.
 * @param {String} url - URL of the current page.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var queue;
	if (detectWeb(doc, url) != "multiple") {
		queue = [url];
	} else {
		queue = [];
		var choices = {};
		var listXPath = ".//*[@id='hauptspalte']/div/ul/li/a";
		var searchXPath = ".//*[@id='hauptspalte']/div/div/ul/li/a";

		// Probing uses a throw-away iterator; the winning XPath is evaluated
		// again to get a fresh one for the loop.
		var hasMatch = function (xpath) {
			return doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
		};
		var links;
		if (hasMatch(listXPath)) {
			links = doc.evaluate(listXPath, doc, resolver, XPathResult.ANY_TYPE, null);
		} else if (hasMatch(searchXPath)) {
			links = doc.evaluate(searchXPath, doc, resolver, XPathResult.ANY_TYPE, null);
		}

		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			// Keep only what sits inside the <h3> that follows the <h4> kicker.
			choices[link.href] = link.innerHTML.replace(/(\<h4.*?\>.*?\<\/h4\>\<h3.*?\>)(.*)\<\/h3\>.*/, '$2');
		}
		choices = Zotero.selectItems(choices);
		for (var href in choices) {
			queue.push(href);
		}
	}

	Zotero.Utilities.processDocuments(queue, scrape, function () {
		Zotero.done();
	});
	Zotero.wait();
}
{
"translatorID": "f61beec2-1431-4218-a9d3-68063ede6ecd",
"label": "Welt Online",
"creator": "Martin Meyerhoff",
"target": "^http://www\\.welt\\.de",
"minVersion": "1.0",
"maxVersion": "",
"priority": 100,
"inRepository": "1",
"translatorType": 4,
"lastUpdated": "2011-03-29 18:43:49"
}
/*
Welt Online Translator
Copyright (C) 2011 Martin Meyerhoff
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
"Multiple" doesn't work on the search pages, because that's another host. However, every other page does it:
http://www.welt.de/themen/Fukushima/
http://www.welt.de/wirtschaft/
http://www.welt.de/wirtschaft/article12962920/Krankenkassen-werfen-Aerzten-Gewinnstreben-vor.html
*/
/**
 * Decides what kind of welt.de page is loaded.
 *
 * @param {Document} doc - The page DOM.
 * @param {String} url - The page URL (not consulted here).
 * @returns {String|undefined} "newspaperArticle" when the page carries an
 *   og:type meta tag; "multiple" when it contains links inside divs whose
 *   class contains "h2"; otherwise undefined.
 */
function detectWeb(doc, url) {
	// A namespace resolver is prepared here, but the look-ups below are
	// evaluated with a null resolver.
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	// True when the XPath selects at least one node.
	var matches = function (xpath) {
		return !!doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
	};

	var articleXPath = ".//meta[contains(@property, 'og:type')]";
	var listXPath = ".//div[contains(@class, 'h2')]/a";

	var kind;
	if (matches(articleXPath)) {
		kind = "newspaperArticle";
	} else if (matches(listXPath)) {
		kind = "multiple";
	} else {
		return;
	}
	Zotero.debug(kind);
	return kind;
}
/**
 * Turns one welt.de article page into a Zotero "newspaperArticle" item.
 *
 * Every field is filled in only when its source node exists, so a missing
 * author tag, date or navigation entry no longer aborts the import. When the
 * og:title meta tag is missing, the document title is used instead.
 *
 * @param {Document} doc - DOM of the article page.
 * @param {String} url - URL of the page (the item uses doc.location.href).
 */
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function (prefix) {
		return prefix == 'x' ? namespace : null;
	} : null;

	// First node selected by an XPath, or null when nothing matches.
	function firstNode(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
	}

	// Removes leading and trailing whitespace.
	function strip(text) {
		return text.replace(/^\s*|\s*$/g, '');
	}

	var newItem = new Zotero.Item("newspaperArticle");
	newItem.url = doc.location.href;

	// Title: the og:title is used without the site's "supertitle".
	var titleMeta = firstNode(".//meta[contains(@property, 'og:title')]");
	newItem.title = titleMeta ? titleMeta.content : doc.title;

	// Authors. "WELT ONLINE" is the site itself, not a person.
	// NOTE(review): the bare "Und" alternative also splits names that merely
	// contain "Und" (e.g. "Undine"); kept as is, confirm against real bylines.
	var authorMeta = firstNode(".//meta[contains(@name, 'author')]");
	var byline = authorMeta ? authorMeta.content : "";
	if (byline == "WELT ONLINE") {
		byline = "";
	}
	var names = byline.split(/\sund\s|\su\.\s|\,\s|\&|Und/);
	for (var i = 0; i < names.length; i++) {
		if (/\s/.test(names[i])) { // only names that contain a space
			newItem.creators.push(Zotero.Utilities.cleanAuthor(strip(names[i]), "author"));
		}
	}

	// Summary
	var summaryMeta = firstNode('//meta[contains(@name, "description")]');
	if (summaryMeta) {
		newItem.abstractNote = summaryMeta.content;
	}

	// Tags: comma-separated keywords, empty entries skipped.
	var keywordsMeta = firstNode('//meta[contains(@name, "keywords")]');
	if (keywordsMeta) {
		var keywords = keywordsMeta.content.split(/,\s/);
		for (var j = 0; j < keywords.length; j++) {
			var keyword = strip(keywords[j]);
			if (keyword) {
				newItem.tags.push(keyword);
			}
		}
	}

	// Date
	var dateNode = firstNode(".//span[contains(@class, 'date')][last()]");
	if (dateNode) {
		newItem.date = dateNode.textContent;
	}

	// Publication: only some Welt am Sonntag articles can be told apart, by URL.
	if (doc.location.href.match(/.*wams_print.*/)) {
		newItem.publicationTitle = "Welt am Sonntag";
	} else {
		newItem.publicationTitle = "Welt Online";
	}

	// Section
	var sectionNode = firstNode(".//*[@id='mainNavi']/ul/li[contains(@class, 'menAc')]/a");
	if (sectionNode) {
		newItem.section = sectionNode.textContent;
	}

	// Attachment: the print view is selected with a "print=true" parameter.
	// It is appended with "&" when the URL already has a query string, and
	// any "#fragment" is dropped so the parameter does not end up inside it.
	var pageUrl = doc.location.href;
	var hashAt = pageUrl.indexOf('#');
	if (hashAt != -1) {
		pageUrl = pageUrl.substr(0, hashAt);
	}
	var printUrl = pageUrl + (pageUrl.indexOf('?') == -1 ? '?' : '&') + 'print=true';
	newItem.attachments.push({url: printUrl, title: doc.title, mimeType: "text/html"});
	newItem.complete();
}
/**
 * Entry point of the Welt Online translator.
 *
 * On a "multiple" page every link inside a div whose class contains "h2" is
 * offered through Zotero.selectItems and the chosen ones are scraped; on any
 * other page the page itself is scraped.
 *
 * @param {Document} doc - DOM of the current page.
 * @param {String} url - URL of the current page.
 */
function doWeb(doc, url) {
	var ns = doc.documentElement.namespaceURI;
	var resolver = null;
	if (ns) {
		resolver = function (prefix) {
			return prefix == 'x' ? ns : null;
		};
	}

	var queue;
	if (detectWeb(doc, url) != "multiple") {
		queue = [url];
	} else {
		queue = [];
		var choices = {};
		var links = doc.evaluate(".//div[contains(@class, 'h2')]/a", doc, resolver, XPathResult.ANY_TYPE, null);
		for (var link = links.iterateNext(); link; link = links.iterateNext()) {
			choices[link.href] = link.textContent;
		}
		choices = Zotero.selectItems(choices);
		for (var href in choices) {
			queue.push(href);
		}
	}

	Zotero.Utilities.processDocuments(queue, scrape, function () {
		Zotero.done();
	});
	Zotero.wait();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment