Created
August 16, 2012 13:01
-
-
Save s3thi/3369950 to your computer and use it in GitHub Desktop.
Scrape Premchand
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"use strict;"; | |
// You need htmlparser, soupselect and ent to run this. | |
// Install them from npm. | |
var http = require("http"); | |
var fs = require("fs"); | |
var path = require("path"); | |
var htmlparser = require("htmlparser"); | |
var select = require("soupselect").select; | |
var ent = require("ent"); | |
// URL of the table-of-contents page; get_toc_html() downloads it to discover the stories.
var STORIES_ROOT = "http://premchand.kahaani.org/2009/09/contents.html"; | |
// Interval between successive story downloads, in milliseconds (3 seconds).
var DELAY = 3 * 1000; | |
// Directory (relative path) into which the downloaded stories are written.
var SAVE_DIR = "stories"; | |
/**
 * Normalises whitespace in a scraped text fragment.
 *
 * - removes leading and trailing whitespace,
 * - collapses every run of two or more spaces into a single space,
 * - removes the single space that follows any newline.
 *
 * @param {string} s - Text to clean up.
 * @returns {string} The cleaned text.
 */
function trim(s) {
    // Strip whitespace at both ends of the string.
    s = s.replace(/^\s+|\s+$/g, "");
    // Collapse runs of spaces so at most one space separates words.
    s = s.replace(/ {2,}/g, " ");
    // Global flag is required: without it only the FIRST "newline + space"
    // pair was fixed and every later line kept its stray leading space.
    s = s.replace(/\n /g, "\n");
    return s;
}
/**
 * Downloads the table-of-contents page at STORIES_ROOT.
 *
 * @param {function(string)} cb - Called once with the complete page HTML
 *     after the response has ended with status 200.
 * @throws {Error} (asynchronously) if the server answers with a non-200
 *     status or the request itself fails.
 */
function get_toc_html(cb) {
    var request = http.get(STORIES_ROOT, function(response) {
        var toc_html = "";
        if (response.statusCode !== 200) {
            // Drain the body so the socket is released before bailing out.
            response.resume();
            throw new Error("Could not download TOC page. Error " + response.statusCode);
        }
        // Decode as a stream: concatenating raw Buffers chunk by chunk would
        // corrupt multi-byte UTF-8 characters split across chunk boundaries.
        response.setEncoding("utf8");
        response.on("data", function(chunk) {
            toc_html += chunk;
        });
        response.on("end", function() {
            cb(toc_html);
        });
    });
    // Surface network-level failures (DNS, refused connection, ...) with context.
    request.on("error", function(err) {
        throw new Error("Could not download TOC page. " + err.message);
    });
}
/**
 * Parses an HTML string into an htmlparser DOM.
 *
 * @param {string} toc_html - HTML source to parse.
 * @param {function(Object)} cb - Receives the parsed DOM on success.
 * @throws {Error} if htmlparser reports a parse error.
 */
function parse_html(toc_html, cb) {
    function on_parsed(error, dom) {
        // Guard clause: bail out on failure, otherwise hand the DOM over.
        if (error) {
            throw new Error("Could not parse TOC DOM.");
        }
        cb(dom);
    }

    var dom_handler = new htmlparser.DefaultHandler(on_parsed);
    new htmlparser.Parser(dom_handler).parseComplete(toc_html);
}
/**
 * Collects the story links found in the table-of-contents DOM.
 *
 * @param {Object} dom - DOM produced by parse_html() for the TOC page.
 * @returns {Array<{story_name: string, url: string}>} One entry per anchor
 *     matched by the TOC selector, in document order.
 */
function extract_toc_items_from_dom_as_array(dom) {
    var anchors = select(dom, "div.post-body table tbody tr td ol li a");
    return anchors.map(function(anchor) {
        // The anchor's first child node carries the (entity-encoded) link text.
        var label = anchor.children[0].data;
        return {
            "story_name": ent.decode(label),
            "url": anchor.attribs.href
        };
    });
}
/**
 * Downloads the table of contents and prints every story name with its URL
 * to the console.
 */
function print_toc() {
    get_toc_html(function(toc_html) {
        parse_html(toc_html, function(dom) {
            // Declared locally: the bare assignment used before leaked an
            // implicit global (and is a ReferenceError in strict mode).
            var toc_items = extract_toc_items_from_dom_as_array(dom);
            for (var i = 0; i < toc_items.length; i++) {
                console.log("Story: " + toc_items[i].story_name + ", URL: " + toc_items[i].url);
            }
        });
    });
}
/**
 * Fetches the table of contents, extracts the story links from it and hands
 * them to save_stories_to_disk().
 */
function download_all_stories() {
    function on_dom(dom) {
        var toc_items = extract_toc_items_from_dom_as_array(dom);
        save_stories_to_disk(toc_items);
    }

    function on_html(toc_html) {
        parse_html(toc_html, on_dom);
    }

    get_toc_html(on_html);
}
/**
 * Downloads every story in `toc_items`, one every DELAY milliseconds, and
 * writes each one to SAVE_DIR as "<title>.txt".
 *
 * A story that cannot be downloaded, parsed for its title/body, or written
 * is reported on the console and skipped; the remaining stories are still
 * processed.
 *
 * @param {Array<{story_name: string, url: string}>} toc_items - Stories to
 *     download, as produced by extract_toc_items_from_dom_as_array().
 */
function save_stories_to_disk(toc_items) {
    var story_counter = 0;
    var timer = null;

    // Starts the download of the next pending story (one per timer tick).
    function download_next_story() {
        // Local to this tick so later ticks cannot clobber it while the
        // response callbacks below are still pending.
        var story = toc_items[story_counter];
        story_counter++;
        // Stop ticking once the last story has been requested; otherwise the
        // next tick would dereference an undefined entry and crash.
        if (story_counter >= toc_items.length) {
            clearInterval(timer);
        }

        var request = http.get(story.url, function(response) {
            // Must start as "" -- an uninitialised variable would prefix
            // the page with the literal text "undefined".
            var story_body = "";
            if (response.statusCode !== 200) {
                console.log("Could not download " + story.story_name + ". Error " + response.statusCode);
                // Drain the body so the socket is released.
                response.resume();
                return;
            }
            // Decode as a stream so multi-byte UTF-8 characters split across
            // chunk boundaries are not corrupted.
            response.setEncoding("utf8");
            response.on("data", function(chunk) {
                story_body += chunk;
            });
            response.on("end", function() {
                parse_story_body(story_body);
            });
        });
        request.on("error", function(err) {
            console.log("Could not download " + story.story_name + ". " + err.message);
        });
    }

    // Extracts the title and the plain-text body from a story page and saves them.
    function parse_story_body(story_body) {
        parse_html(story_body, function(dom) {
            var title_node = select(dom, "h3.post-title.entry-title");
            var content_node = select(dom, "div.post-body.entry-content");
            var title_text = title_node.length && title_node[0].children && title_node[0].children.length
                ? title_node[0].children[0].data
                : undefined;
            if (typeof title_text !== "string" || !content_node.length) {
                console.log("Could not find the story title or body; skipping page.");
                return;
            }

            var title = trim(ent.decode(title_text));
            var children = content_node[0].children || [];
            var content = "", temp;
            for (var i = 0; i < children.length; i++) {
                // Only direct text nodes are kept; markup children are skipped.
                if (children[i].type === "text") {
                    temp = ent.decode(children[i].data);
                    temp = trim(temp);
                    content += "\n\n" + temp;
                }
            }
            write_to_disk(title, content);
        });
    }

    // Writes one story to SAVE_DIR, creating the directory on first use.
    function write_to_disk(title, content) {
        if (!fs.existsSync(SAVE_DIR)) {
            fs.mkdirSync(SAVE_DIR);
        }
        // The title comes from a remote page: neutralise path separators so
        // it can neither escape SAVE_DIR nor point into a missing directory.
        var file_name = title.replace(/[\/\\]/g, "_") + ".txt";
        fs.writeFile(path.join(SAVE_DIR, file_name), content, function(err) {
            if (err) {
                console.log("Could not save " + title);
            } else {
                console.log("Saved " + title);
            }
        });
    }

    // Nothing to download: do not start a timer that would never be cleared.
    if (!toc_items || toc_items.length === 0) {
        return;
    }
    timer = setInterval(download_next_story, DELAY);
}
// Script entry point: downloads every story listed in the table of contents.
function main() { | |
download_all_stories(); | |
} | |
// Run main() only when this file is executed directly, not when it is require()d.
if (require.main === module) { | |
main(); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment