Skip to content

Instantly share code, notes, and snippets.

@s3thi
Created August 16, 2012 13:01
Show Gist options
  • Save s3thi/3369950 to your computer and use it in GitHub Desktop.
Save s3thi/3369950 to your computer and use it in GitHub Desktop.
Scrape Premchand
"use strict;";
// NOTE(review): the semicolon sits INSIDE the string literal above, so the
// statement is an ordinary string expression rather than the "use strict"
// directive, and this file runs in sloppy mode. Audit the file for
// assignments to undeclared variables before correcting the directive,
// because those would start throwing ReferenceError under strict mode.
// You need htmlparser, soupselect and ent to run this.
// Install them from npm.
var http = require("http");
var fs = require("fs");
var path = require("path");
var htmlparser = require("htmlparser");
var select = require("soupselect").select;
var ent = require("ent");
// Table-of-contents page that links to every story.
var STORIES_ROOT = "http://premchand.kahaani.org/2009/09/contents.html";
// Pause between two story downloads, in milliseconds (three seconds).
var DELAY = 3000;
// Directory the downloaded stories are written to.
var SAVE_DIR = "stories";
// Normalises whitespace in a piece of scraped text.
//
// s: the string to clean up.
// Returns a new string in which
//   1. leading and trailing whitespace is removed,
//   2. every run of two or more spaces is collapsed to a single space,
//   3. the single space left directly after ANY newline is dropped.
function trim(s) {
  return s
    .replace(/^\s+|\s+$/g, "")
    .replace(/ {2,}/g, " ")
    // Global flag is required: without it only the first "\n " is fixed and
    // every later line keeps its leading space.
    .replace(/\n /g, "\n");
}
// Downloads the table-of-contents page at STORIES_ROOT.
//
// cb: function(toc_html) called once, with the complete page body as a
//     string, after the whole response has been received.
//
// Throws an Error when the server answers with anything other than 200 or
// when the request itself fails. Both throws happen inside HTTP callbacks,
// so they surface as uncaught exceptions rather than reaching the caller.
function get_toc_html(cb) {
  http.get(STORIES_ROOT, function(response) {
    if (response.statusCode !== 200) {
      // Drain the unwanted body so the socket is released.
      response.resume();
      throw new Error("Could not download TOC page. Error " + response.statusCode);
    }

    var toc_html = "";
    // Decode as a stream: appending raw Buffer chunks to a string converts
    // each chunk on its own and corrupts any multi-byte character that
    // happens to straddle a chunk boundary. UTF-8 is the same encoding the
    // implicit Buffer-to-string conversion would have used.
    response.setEncoding("utf8");
    response.on("data", function(chunk) {
      toc_html += chunk;
    });
    response.on("end", function() {
      cb(toc_html);
    });
  }).on("error", function(err) {
    // Connection-level failures (DNS, refused connection, ...) get the same
    // descriptive error as a bad status code.
    throw new Error("Could not download TOC page. " + err.message);
  });
}
// Parses an HTML string with htmlparser and hands the resulting DOM to cb.
//
// toc_html: the HTML source to parse.
// cb: function(dom) called with the parsed DOM on success.
//
// Throws an Error when the parser reports a failure.
function parse_html(toc_html, cb) {
  function on_parsed(error, dom) {
    if (error) {
      throw new Error("Could not parse TOC DOM.");
    }
    cb(dom);
  }

  var dom_handler = new htmlparser.DefaultHandler(on_parsed);
  new htmlparser.Parser(dom_handler).parseComplete(toc_html);
}
// Collects the story links found in the table-of-contents DOM.
//
// dom: parsed DOM of the table-of-contents page.
// Returns an array with one { story_name, url } object per matched anchor;
// story_name is the entity-decoded data of the anchor's first child and url
// is the anchor's href attribute.
function extract_toc_items_from_dom_as_array(dom) {
  var anchors = select(dom, "div.post-body table tbody tr td ol li a");
  var entries = [];
  var index = 0;
  var anchor;

  while (index < anchors.length) {
    anchor = anchors[index];
    entries.push({
      "story_name": ent.decode(anchor.children[0].data),
      "url": anchor.attribs.href
    });
    index += 1;
  }

  return entries;
}
// Downloads the table of contents and prints one
// "Story: <name>, URL: <url>" line per entry to the console.
function print_toc() {
  get_toc_html(function(toc_html) {
    parse_html(toc_html, function(dom) {
      // Must be declared with var: assigning to an undeclared name creates a
      // global in sloppy mode and throws a ReferenceError in strict mode.
      var toc_items = extract_toc_items_from_dom_as_array(dom);
      for (var i = 0; i < toc_items.length; i++) {
        console.log("Story: " + toc_items[i].story_name + ", URL: " + toc_items[i].url);
      }
    });
  });
}
// Fetches the table of contents, extracts the story links from it and hands
// them to save_stories_to_disk.
function download_all_stories() {
  function on_toc_dom(dom) {
    var toc_items = extract_toc_items_from_dom_as_array(dom);
    save_stories_to_disk(toc_items);
  }

  function on_toc_html(toc_html) {
    parse_html(toc_html, on_toc_dom);
  }

  get_toc_html(on_toc_html);
}
// Downloads every story listed in toc_items and saves each one as
// "<title>.txt" inside SAVE_DIR.
//
// toc_items: array of objects with a `url` (page to download) and a
//            `story_name` (used only in log messages).
//
// One request is started every DELAY milliseconds, the first one after an
// initial DELAY. The timer is stopped as soon as the last story has been
// requested. A story that cannot be downloaded, or whose page lacks the
// expected title/body nodes, is logged and skipped.
function save_stories_to_disk(toc_items) {
  var story_counter = 0;
  var timer;

  // Starts the download of the next pending story.
  function download_next_story() {
    // Local to this call, so the response callbacks below keep referring to
    // their own story even after the next timer tick.
    var story = toc_items[story_counter];
    story_counter++;
    if (story_counter >= toc_items.length) {
      // Nothing left to schedule; without this the next tick would read past
      // the end of the list and crash on `story.url`.
      clearInterval(timer);
    }

    http.get(story.url, function(response) {
      if (response.statusCode !== 200) {
        console.log("Could not download " + story.story_name + ". Error " + response.statusCode);
        // Drain the unwanted body so the socket is released.
        response.resume();
        return;
      }

      // Must start as an empty string; `undefined += chunk` would prefix the
      // page with the text "undefined".
      var story_body = "";
      // Decode as a stream so multi-byte characters split across chunks are
      // not corrupted.
      response.setEncoding("utf8");
      response.on("data", function(chunk) {
        story_body += chunk;
      });
      response.on("end", function() {
        parse_story_body(story_body);
      });
    }).on("error", function(err) {
      console.log("Could not download " + story.story_name + ". " + err.message);
    });
  }

  // Extracts the title and the text nodes of the post body from a story
  // page and writes them to disk.
  function parse_story_body(story_body) {
    parse_html(story_body, function(dom) {
      var title_node = select(dom, "h3.post-title.entry-title");
      var content_node = select(dom, "div.post-body.entry-content");
      var title_children = title_node.length > 0 ? title_node[0].children : null;

      if (!title_children || title_children.length === 0 ||
          typeof title_children[0].data !== "string" ||
          content_node.length === 0) {
        console.log("Could not find the story title or body in a downloaded page.");
        return;
      }

      var title = trim(ent.decode(title_children[0].data));
      var children = content_node[0].children || [];
      var content = "";

      // Only direct text children are kept; element nodes are skipped.
      for (var i = 0; i < children.length; i++) {
        if (children[i].type === "text") {
          content += "\n\n" + trim(ent.decode(children[i].data));
        }
      }

      write_to_disk(title, content);
    });
  }

  // Writes one story to SAVE_DIR, creating the directory when necessary.
  function write_to_disk(title, content) {
    if (!fs.existsSync(SAVE_DIR)) {
      fs.mkdirSync(SAVE_DIR);
    }

    // The title comes from a remote page: neutralise path separators so it
    // cannot point outside (or into a missing sub-directory of) SAVE_DIR.
    var file_name = title.replace(/[\/\\]/g, "_") + ".txt";

    fs.writeFile(path.join(SAVE_DIR, file_name), content, function(err) {
      if (err) {
        console.log("Could not save " + title);
      } else {
        console.log("Saved " + title);
      }
    });
  }

  if (!toc_items || toc_items.length === 0) {
    console.log("No stories to download.");
    return;
  }

  timer = setInterval(download_next_story, DELAY);
}
// Script entry point: downloads every story linked from the table of contents.
function main() {
  download_all_stories();
}

// Run only when this file is executed directly, not when it is require()d.
var is_entry_script = require.main === module;
if (is_entry_script) {
  main();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment