fero23 · February 1, 2016 04:44
diff --git a/novel_scrapper.js b/novel_scrapper.js
 "use strict";

 var cheerio = require("cheerio");
 var request = require("request");
 var _ = require("lodash");
 var parseString = require('xml2js').parseString;
 var ProgressWatcher = require("./progress_watcher");

 // Base URLs of the sites to aggregate, grouped by blogging platform. The
 // platform key decides which feed path and scrapper class the request
 // handler uses for each URL.
 const urls = {
     drupal: ["http://skythewoodtl.com"],
     wordpress: [
         "https://shintranslations.com",
         "http://circustranslations.com",
         "https://isekailunatic.wordpress.com",
         "http://arkmachinetranslations.com",
         "http://trinityarchive.com",
         "https://infinitenoveltranslations.wordpress.com",
         "http://japtem.com"
     ],
     blogspot: [
         "http://www.sousetsuka.com",
         "http://thelordofpie.blogspot.com"
     ]
 };

 // Article titles the scrappers drop from their results.
 const bannedArticles = ["Welcome to Amsterdam", "Schedule for TNG"];

 module.exports = function (req, res, next) {
    let urlCount = 0;
    for (let prop in urls) {
        urlCount += urls[prop].length;
    }

    let pw = new ProgressWatcher(urlCount);

    for (let prop in urls) {
        switch (prop) {
            case "wordpress":
                urls[prop].forEach(url => new RSSScrapper(url + "/feed").scrap(pw));
                break;
            case "blogspot":
                urls[prop].forEach(url => new AtomScrapper(url + "/feeds/posts/default").scrap(pw));
                break;
            case "drupal":
                urls[prop].forEach(url => new RSSScrapper(url + "/rss.xml").scrap(pw));
                break;
        }
    }

    pw.onComplete(articles => {
        let sorted = _(articles).chain().flatten().sortBy("pubDate").reverse().value();
        if (_.some(sorted, article => article.pubDate.toString == "Invalid Date")) {
            console.warn("Some articles have problems with dates");
        }
        res.set('Content-Type', 'text/xml').render("rss", {
            articles: sorted
        });
    });
 }

 /**
  * Writes a feed failure to stderr as the feed URL followed by the error
  * message on the next line.
  *
  * @param {string} url - URL of the feed that failed.
  * @param {Error} err - the failure; only its `message` is printed.
  */
 function printError(url, err) {
     const report = `Error on ${url}:\n${err.message}`;
     console.error(report);
 }

 /**
  * Fetches an RSS feed and reports its articles to a progress watcher.
  */
 class RSSScrapper {
     /**
      * @param {string} url - full URL of the RSS feed document.
      */
     constructor(url) {
         this.url = url;
     }

     /**
      * Downloads and parses the feed, then calls `pw.progress` with an array
      * of `{title, link, description, pubDate}` objects. Items whose title is
      * listed in `bannedArticles` are left out. When the download or the
      * parsing fails, the error is logged and an empty array is reported
      * instead, so the watcher still gets a result for this feed.
      *
      * @param {object} pw - watcher exposing `progress(result)`.
      */
     scrap(pw) {
         request(this.url, (err, res, html) => {
             if (err) {
                 printError(this.url, err);
                 pw.progress([]);
                 return;
             }

             parseString(html, (parseErr, result) => {
                 let articles;
                 try {
                     // Surface the parser's own error instead of tripping
                     // over an undefined `result` below.
                     if (parseErr) {
                         throw parseErr;
                     }
                     // Treat a channel with no `item` key as an empty feed.
                     const items = result.rss.channel[0].item || [];
                     articles = items
                         .filter(item => !_.some(bannedArticles, banned => banned === item.title[0]))
                         .map(item => ({
                             title: item.title[0],
                             link: item.link[0],
                             description: item.description[0],
                             pubDate: new Date(item.pubDate[0])
                         }));
                 }
                 catch (failure) {
                     printError(this.url, failure);
                     articles = [];
                 }
                 // Reported outside the try block so an exception thrown by a
                 // watcher callback is not logged as a failure of this feed
                 // and does not trigger a second progress() call from here.
                 pw.progress(articles);
             });
         });
     }
 }

 /**
  * Fetches an Atom feed and reports its entries to a progress watcher.
  */
 class AtomScrapper {
     /**
      * @param {string} url - full URL of the Atom feed document.
      */
     constructor(url) {
         this.url = url;
     }

     /**
      * Downloads and parses the feed, then calls `pw.progress` with an array
      * of `{title, link, description, pubDate}` objects. The description is
      * the first 100 whitespace-separated words of the entry content's text,
      * followed by " [...]". Entries whose title is listed in
      * `bannedArticles` are left out. When the download or the parsing fails,
      * the error is logged and an empty array is reported instead.
      *
      * @param {object} pw - watcher exposing `progress(result)`.
      */
     scrap(pw) {
         // Entry titles and contents are read as objects carrying their text
         // under `_`. A bare string is accepted as well in case the parser
         // yields one (presumably for elements without attributes; confirm
         // against the xml2js documentation).
         const textOf = node => (typeof node === "string" ? node : node._);

         request(this.url, (err, res, html) => {
             if (err) {
                 printError(this.url, err);
                 pw.progress([]);
                 return;
             }

             parseString(html, (parseErr, result) => {
                 let articles;
                 try {
                     // Surface the parser's own error instead of tripping
                     // over an undefined `result` below.
                     if (parseErr) {
                         throw parseErr;
                     }
                     // Treat a feed with no `entry` key as an empty feed.
                     const entries = result.feed.entry || [];
                     articles = [];
                     entries.forEach(item => {
                         // Compare the title TEXT with the banned list; the
                         // raw title node is an object and never equals a
                         // string, so the filter would otherwise be a no-op.
                         const title = textOf(item.title[0]);
                         if (_.some(bannedArticles, banned => banned === title)) {
                             return;
                         }

                         // NOTE(review): takes the first text/html link
                         // whatever its `rel`; confirm this is the entry's
                         // permalink and not, e.g., a comments link.
                         const htmlLinks = item.link.filter(l => l.$.type === "text/html");
                         const bodyText = cheerio.load("<html><body>" +
                             textOf(item.content[0]) + "</body></html>")("body").text();

                         articles.push({
                             title: title,
                             link: htmlLinks[0].$.href,
                             description: _(bodyText).chain().words(/[^\s]+/g).take(100)
                                 .value().join(" ") + " [...]",
                             pubDate: new Date(item.published[0])
                         });
                     });
                 }
                 catch (failure) {
                     printError(this.url, failure);
                     articles = [];
                 }
                 // Reported outside the try block so an exception thrown by a
                 // watcher callback is not logged as a failure of this feed
                 // and does not trigger a second progress() call from here.
                 pw.progress(articles);
             });
         });
     }
 }
diff --git a/progress_watcher.js b/progress_watcher.js
 "use strict";

 module.exports = class {
    constructor(count) {
        this.count = count;
        this.current = 0;
        this.results = [];
        this.onCompleteCbArr = [];
        this.onErrorCbArr = [];
        this.finallyCbArr = [];
    }

    progress(obj) {
        if (obj instanceof Error) {
            this.error = obj;
        }
        else {
            this.results.push(obj);
        }
        if (!this.error) {
            if (this.error) {
                this.onErrorCbArr.forEach(cb => cb(this.error));
                this.finallyCbArr.forEach(cb => cb());
            }
            else if (++this.current == this.count) {
                this.onCompleteCbArr.forEach(cb => cb(this.results));
                this.finallyCbArr.forEach(cb => cb());
            }
        }
    }

    hasError() {
        return this.error !== null && this.error !== undefined;
    }

    onError(cb) {
        this.onErrorCbArr.push(cb);
    }

    onComplete(cb) {
        this.onCompleteCbArr.push(cb);
    }

    finally(cb) {
        this.finallyCbArr.push(cb);
    }
 }
diff --git a/rss.jade b/rss.jade
 //- RSS 2.0 view of the aggregated feed. Expects a local `articles`: an
 //- iterable of objects with `title`, `link`, `description` and a `pubDate`
 //- that provides toUTCString().
 doctype xml
 rss(version='2.0')
    channel
        title Novel RSS Compilator
        description Compilator of the translator websites' RSS feed of my favorite novels.
        //- One <item> element per article, in the order the caller supplies.
        each article in articles
            item
                title= article.title
                link= article.link
                description= article.description
                pubDate= article.pubDate.toUTCString()
	"use strict";

	var cheerio = require("cheerio");
	var request = require("request");
	var _ = require("lodash");
	var parseString = require('xml2js').parseString;
	var ProgressWatcher = require("./progress_watcher");

	// Base URLs of the sites to aggregate, grouped by blogging platform. The
	// platform key decides which feed path and scrapper class the request
	// handler uses for each URL.
	const urls = {
	    drupal: ["http://skythewoodtl.com"],
	    wordpress: [
	        "https://shintranslations.com",
	        "http://circustranslations.com",
	        "https://isekailunatic.wordpress.com",
	        "http://arkmachinetranslations.com",
	        "http://trinityarchive.com",
	        "https://infinitenoveltranslations.wordpress.com",
	        "http://japtem.com"
	    ],
	    blogspot: [
	        "http://www.sousetsuka.com",
	        "http://thelordofpie.blogspot.com"
	    ]
	};

	// Article titles the scrappers drop from their results.
	const bannedArticles = ["Welcome to Amsterdam", "Schedule for TNG"];

	module.exports = function (req, res, next) {
	let urlCount = 0;
	for (let prop in urls) {
	urlCount += urls[prop].length;
	}

	let pw = new ProgressWatcher(urlCount);

	for (let prop in urls) {
	switch (prop) {
	case "wordpress":
	urls[prop].forEach(url => new RSSScrapper(url + "/feed").scrap(pw));
	break;
	case "blogspot":
	urls[prop].forEach(url => new AtomScrapper(url + "/feeds/posts/default").scrap(pw));
	break;
	case "drupal":
	urls[prop].forEach(url => new RSSScrapper(url + "/rss.xml").scrap(pw));
	break;
	}
	}

	pw.onComplete(articles => {
	let sorted = _(articles).chain().flatten().sortBy("pubDate").reverse().value();
	if (_.some(sorted, article => article.pubDate.toString == "Invalid Date")) {
	console.warn("Some articles have problems with dates");
	}
	res.set('Content-Type', 'text/xml').render("rss", {
	articles: sorted
	});
	});
	}

	/**
	 * Writes a feed failure to stderr as the feed URL followed by the error
	 * message on the next line.
	 *
	 * @param {string} url - URL of the feed that failed.
	 * @param {Error} err - the failure; only its `message` is printed.
	 */
	function printError(url, err) {
	    const report = `Error on ${url}:\n${err.message}`;
	    console.error(report);
	}

	/**
	 * Fetches an RSS feed and reports its articles to a progress watcher.
	 */
	class RSSScrapper {
	    /**
	     * @param {string} url - full URL of the RSS feed document.
	     */
	    constructor(url) {
	        this.url = url;
	    }

	    /**
	     * Downloads and parses the feed, then calls `pw.progress` with an array
	     * of `{title, link, description, pubDate}` objects. Items whose title is
	     * listed in `bannedArticles` are left out. When the download or the
	     * parsing fails, the error is logged and an empty array is reported
	     * instead, so the watcher still gets a result for this feed.
	     *
	     * @param {object} pw - watcher exposing `progress(result)`.
	     */
	    scrap(pw) {
	        request(this.url, (err, res, html) => {
	            if (err) {
	                printError(this.url, err);
	                pw.progress([]);
	                return;
	            }

	            parseString(html, (parseErr, result) => {
	                let articles;
	                try {
	                    // Surface the parser's own error instead of tripping
	                    // over an undefined `result` below.
	                    if (parseErr) {
	                        throw parseErr;
	                    }
	                    // Treat a channel with no `item` key as an empty feed.
	                    const items = result.rss.channel[0].item || [];
	                    articles = items
	                        .filter(item => !_.some(bannedArticles, banned => banned === item.title[0]))
	                        .map(item => ({
	                            title: item.title[0],
	                            link: item.link[0],
	                            description: item.description[0],
	                            pubDate: new Date(item.pubDate[0])
	                        }));
	                }
	                catch (failure) {
	                    printError(this.url, failure);
	                    articles = [];
	                }
	                // Reported outside the try block so an exception thrown by a
	                // watcher callback is not logged as a failure of this feed
	                // and does not trigger a second progress() call from here.
	                pw.progress(articles);
	            });
	        });
	    }
	}

	/**
	 * Fetches an Atom feed and reports its entries to a progress watcher.
	 */
	class AtomScrapper {
	    /**
	     * @param {string} url - full URL of the Atom feed document.
	     */
	    constructor(url) {
	        this.url = url;
	    }

	    /**
	     * Downloads and parses the feed, then calls `pw.progress` with an array
	     * of `{title, link, description, pubDate}` objects. The description is
	     * the first 100 whitespace-separated words of the entry content's text,
	     * followed by " [...]". Entries whose title is listed in
	     * `bannedArticles` are left out. When the download or the parsing fails,
	     * the error is logged and an empty array is reported instead.
	     *
	     * @param {object} pw - watcher exposing `progress(result)`.
	     */
	    scrap(pw) {
	        // Entry titles and contents are read as objects carrying their text
	        // under `_`. A bare string is accepted as well in case the parser
	        // yields one (presumably for elements without attributes; confirm
	        // against the xml2js documentation).
	        const textOf = node => (typeof node === "string" ? node : node._);

	        request(this.url, (err, res, html) => {
	            if (err) {
	                printError(this.url, err);
	                pw.progress([]);
	                return;
	            }

	            parseString(html, (parseErr, result) => {
	                let articles;
	                try {
	                    // Surface the parser's own error instead of tripping
	                    // over an undefined `result` below.
	                    if (parseErr) {
	                        throw parseErr;
	                    }
	                    // Treat a feed with no `entry` key as an empty feed.
	                    const entries = result.feed.entry || [];
	                    articles = [];
	                    entries.forEach(item => {
	                        // Compare the title TEXT with the banned list; the
	                        // raw title node is an object and never equals a
	                        // string, so the filter would otherwise be a no-op.
	                        const title = textOf(item.title[0]);
	                        if (_.some(bannedArticles, banned => banned === title)) {
	                            return;
	                        }

	                        // NOTE(review): takes the first text/html link
	                        // whatever its `rel`; confirm this is the entry's
	                        // permalink and not, e.g., a comments link.
	                        const htmlLinks = item.link.filter(l => l.$.type === "text/html");
	                        const bodyText = cheerio.load("<html><body>" +
	                            textOf(item.content[0]) + "</body></html>")("body").text();

	                        articles.push({
	                            title: title,
	                            link: htmlLinks[0].$.href,
	                            description: _(bodyText).chain().words(/[^\s]+/g).take(100)
	                                .value().join(" ") + " [...]",
	                            pubDate: new Date(item.published[0])
	                        });
	                    });
	                }
	                catch (failure) {
	                    printError(this.url, failure);
	                    articles = [];
	                }
	                // Reported outside the try block so an exception thrown by a
	                // watcher callback is not logged as a failure of this feed
	                // and does not trigger a second progress() call from here.
	                pw.progress(articles);
	            });
	        });
	    }
	}
	"use strict";

	module.exports = class {
	constructor(count) {
	this.count = count;
	this.current = 0;
	this.results = [];
	this.onCompleteCbArr = [];
	this.onErrorCbArr = [];
	this.finallyCbArr = [];
	}

	progress(obj) {
	if (obj instanceof Error) {
	this.error = obj;
	}
	else {
	this.results.push(obj);
	}
	if (!this.error) {
	if (this.error) {
	this.onErrorCbArr.forEach(cb => cb(this.error));
	this.finallyCbArr.forEach(cb => cb());
	}
	else if (++this.current == this.count) {
	this.onCompleteCbArr.forEach(cb => cb(this.results));
	this.finallyCbArr.forEach(cb => cb());
	}
	}
	}

	hasError() {
	return this.error !== null && this.error !== undefined;
	}

	onError(cb) {
	this.onErrorCbArr.push(cb);
	}

	onComplete(cb) {
	this.onCompleteCbArr.push(cb);
	}

	finally(cb) {
	this.finallyCbArr.push(cb);
	}
	}
	//- RSS 2.0 view of the aggregated feed. Expects a local `articles`: an
	//- iterable of objects with `title`, `link`, `description` and a `pubDate`
	//- that provides toUTCString().
	//- Nesting is expressed with tab indentation only; do not mix in spaces.
	doctype xml
	rss(version='2.0')
		channel
			title Novel RSS Compilator
			description Compilator of the translator websites' RSS feed of my favorite novels.
			//- One <item> element per article, in the order the caller supplies.
			each article in articles
				item
					title= article.title
					link= article.link
					description= article.description
					pubDate= article.pubDate.toUTCString()