Skip to content

Instantly share code, notes, and snippets.

@fero23
Last active February 1, 2016 04:44
Show Gist options
  • Save fero23/dc104f6deb891559b8f6 to your computer and use it in GitHub Desktop.
Save fero23/dc104f6deb891559b8f6 to your computer and use it in GitHub Desktop.
RSS feed aggregator for the translator blogs of my favorite web novels
"use strict";
var cheerio = require("cheerio");
var request = require("request");
var _ = require("lodash");
var parseString = require('xml2js').parseString;
var ProgressWatcher = require("./progress_watcher");
/**
 * Blog base URLs to aggregate, grouped by publishing platform.
 * The platform key decides which feed path the route handler appends
 * and which scrapper class reads the feed.
 */
const urls = {
  drupal: ["http://skythewoodtl.com"],
  wordpress: [
    "https://shintranslations.com",
    "http://circustranslations.com",
    "https://isekailunatic.wordpress.com",
    "http://arkmachinetranslations.com",
    "http://trinityarchive.com",
    "https://infinitenoveltranslations.wordpress.com",
    "http://japtem.com",
  ],
  blogspot: [
    "http://www.sousetsuka.com",
    "http://thelordofpie.blogspot.com",
  ],
};

/** Exact article titles that the scrappers leave out of the compiled feed. */
const bannedArticles = ["Welcome to Amsterdam", "Schedule for TNG"];
module.exports = function (req, res, next) {
let urlCount = 0;
for (let prop in urls) {
urlCount += urls[prop].length;
}
let pw = new ProgressWatcher(urlCount);
for (let prop in urls) {
switch (prop) {
case "wordpress":
urls[prop].forEach(url => new RSSScrapper(url + "/feed").scrap(pw));
break;
case "blogspot":
urls[prop].forEach(url => new AtomScrapper(url + "/feeds/posts/default").scrap(pw));
break;
case "drupal":
urls[prop].forEach(url => new RSSScrapper(url + "/rss.xml").scrap(pw));
break;
}
}
pw.onComplete(articles => {
let sorted = _(articles).chain().flatten().sortBy("pubDate").reverse().value();
if (_.some(sorted, article => article.pubDate.toString == "Invalid Date")) {
console.warn("Some articles have problems with dates");
}
res.set('Content-Type', 'text/xml').render("rss", {
articles: sorted
});
});
}
/**
 * Writes a failure report for one feed to the error console.
 *
 * @param {string} url - Address of the feed that failed.
 * @param {Error} err - Failure whose `message` is printed on the second line.
 */
function printError(url, err) {
  const report = `Error on ${url}:\n${err.message}`;
  console.error(report);
}
/**
 * Downloads one RSS 2.0 feed and converts its items into article objects
 * of the form `{ title, link, description, pubDate }`.
 */
class RSSScrapper {
  /**
   * @param {string} url - Full address of the RSS document.
   */
  constructor(url) {
    this.url = url;
  }

  /**
   * Fetches and parses the feed, then reports the resulting articles.
   *
   * Each invocation of the download/parse callbacks calls `pw.progress`
   * once: with the articles on success, or with an empty array after the
   * failure has been logged through `printError`.
   *
   * @param {{progress: Function}} pw - Watcher that receives the article array.
   */
  scrap(pw) {
    request(this.url, (err, res, html) => {
      if (err) {
        printError(this.url, err);
        pw.progress([]);
        return;
      }
      parseString(html, (parseErr, result) => {
        let articles;
        try {
          // Surface the parser's own error instead of failing later on an
          // undefined `result` with an unrelated TypeError.
          if (parseErr) {
            throw parseErr;
          }
          // A channel without <item> children has no `item` key (presumably
          // how xml2js represents absent elements); treat it as an empty feed.
          const items = result.rss.channel[0].item || [];
          articles = items
            .filter(item => !_.some(bannedArticles, banned => banned === item.title[0]))
            .map(item => ({
              title: item.title[0],
              link: item.link[0],
              description: item.description[0],
              pubDate: new Date(item.pubDate[0])
            }));
        }
        catch (failure) {
          printError(this.url, failure);
          articles = [];
        }
        // Reported outside the try block: an exception thrown by a watcher
        // callback must not be mistaken for a feed failure and reported twice.
        pw.progress(articles);
      });
    });
  }
}
/**
 * Downloads one Atom feed and converts its entries into article objects
 * of the form `{ title, link, description, pubDate }`.
 */
class AtomScrapper {
  /**
   * @param {string} url - Full address of the Atom document.
   */
  constructor(url) {
    this.url = url;
  }

  /**
   * Fetches and parses the feed, then reports the resulting articles.
   *
   * Each invocation of the download/parse callbacks calls `pw.progress`
   * once: with the articles on success, or with an empty array after the
   * failure has been logged through `printError`.
   *
   * @param {{progress: Function}} pw - Watcher that receives the article array.
   */
  scrap(pw) {
    request(this.url, (err, res, html) => {
      if (err) {
        printError(this.url, err);
        pw.progress([]);
        return;
      }
      parseString(html, (parseErr, result) => {
        let articles;
        try {
          // Surface the parser's own error instead of failing later on an
          // undefined `result` with an unrelated TypeError.
          if (parseErr) {
            throw parseErr;
          }
          // A feed without <entry> children is treated as empty, not broken.
          const entries = result.feed.entry || [];
          articles = entries
            // Compare the title TEXT: the parsed title node is an object when
            // the element carries attributes, and an object never equals a
            // banned string.
            .filter(entry => {
              const title = atomText(entry.title[0]);
              return !_.some(bannedArticles, banned => banned === title);
            })
            .map(entry => ({
              title: atomText(entry.title[0]),
              link: atomEntryLink(entry),
              description: atomEntryDescription(entry),
              pubDate: new Date(entry.published[0])
            }));
        }
        catch (failure) {
          printError(this.url, failure);
          articles = [];
        }
        // Reported outside the try block: an exception thrown by a watcher
        // callback must not be mistaken for a feed failure and reported twice.
        pw.progress(articles);
      });
    });
  }
}

/** Returns the text of a parsed XML node, which is a plain string or an object holding it under `_`. */
function atomText(node) {
  if (typeof node === "string") {
    return node;
  }
  return node && node._ != null ? node._ : "";
}

/** Picks the entry's web-page address: the rel="alternate" HTML link, else the first HTML link. */
function atomEntryLink(entry) {
  const htmlLinks = entry.link.filter(l => l.$.type === "text/html");
  // Other HTML links (e.g. rel="replies") may precede the post's own page.
  const alternate = htmlLinks.find(l => l.$.rel === "alternate");
  return (alternate || htmlLinks[0]).$.href;
}

/** Builds a plain-text teaser from the first 100 whitespace-separated words of the entry body. */
function atomEntryDescription(entry) {
  // Falls back to <summary> for entries without <content>
  // (assumes summary-only feeds use that element; confirm against a real feed).
  const body = atomText((entry.content || entry.summary || [])[0]);
  const text = cheerio.load("<html><body>" + body + "</body></html>")("body").text();
  const words = text.match(/[^\s]+/g) || [];
  return words.slice(0, 100).join(" ") + " [...]";
}
"use strict";
module.exports = class {
constructor(count) {
this.count = count;
this.current = 0;
this.results = [];
this.onCompleteCbArr = [];
this.onErrorCbArr = [];
this.finallyCbArr = [];
}
progress(obj) {
if (obj instanceof Error) {
this.error = obj;
}
else {
this.results.push(obj);
}
if (!this.error) {
if (this.error) {
this.onErrorCbArr.forEach(cb => cb(this.error));
this.finallyCbArr.forEach(cb => cb());
}
else if (++this.current == this.count) {
this.onCompleteCbArr.forEach(cb => cb(this.results));
this.finallyCbArr.forEach(cb => cb());
}
}
}
hasError() {
return this.error !== null && this.error !== undefined;
}
onError(cb) {
this.onErrorCbArr.push(cb);
}
onComplete(cb) {
this.onCompleteCbArr.push(cb);
}
finally(cb) {
this.finallyCbArr.push(cb);
}
}
//- RSS 2.0 document listing every compiled article.
//- Expects `articles`: an array of { title, link, description, pubDate },
//- where pubDate is a Date object.
doctype xml
rss(version='2.0')
  channel
    title Novel RSS Compilator
    description Compilator of the translator websites' RSS feed of my favorite novels.
    each article in articles
      item
        title= article.title
        link= article.link
        description= article.description
        //- A date that failed to parse would otherwise be emitted as the
        //- literal text "Invalid Date"; leave the element out instead.
        if !isNaN(article.pubDate.getTime())
          pubDate= article.pubDate.toUTCString()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment