Created
February 25, 2014 14:31
-
-
Save kawanet/9209901 to your computer and use it in GitHub Desktop.
Fetch and parse HTML as a JSON representation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Fetch and parse HTML as a JSON representation | |
| * @license MIT | |
| * @author kawanet | |
| * @example | |
| * var parser = require("./html_parser"); | |
| * var url = "https://github.com/kawanet"; | |
| * parser(url, function(err, data) { | |
| * if (err) return console.error(err); | |
| * console.log(JSON.stringify(data, null, " ")); | |
| * }); | |
| */ | |
| var request = require("request"); | |
| var cheerio = require("cheerio"); | |
| var Iconv = require("iconv").Iconv; | |
| var URL = require("url"); | |
| var iconv_cache = {}; | |
| module.exports = html_parser; | |
/**
 * Fetch a URL and report the HTML document's metadata as a plain object.
 *
 * The result may contain: url, title, charset, alternative (link rel=alternate),
 * icon (favicon links), meta (description, refresh, keywords, canonical),
 * og (Open Graph) and dc (Dublin Core). Empty values are omitted.
 *
 * @param {string|Object} url - URL string, or an options object handed to
 *   `request.get` (a shallow copy is modified, not the caller's object)
 * @param {function(Error, Object=)} [callback] - called with (err) on failure
 *   or (null, result) on success
 */
function html_parser(url, callback) {
  callback = callback || NOP;

  // request object
  var req;
  if ("string" === typeof url) {
    req = {url: url};
  } else {
    req = clone(url);
  }

  // binary mode: ask for the raw body so it can be transcoded in parse()
  if ("undefined" === typeof req.encoding) {
    req.encoding = null;
  }

  // proxy
  if (!req.proxy) req.proxy = process.env.NODE_HTTP_PROXY;

  // Base for resolving relative hrefs. The `request` options object may name
  // the target `uri` instead of `url`, so fall back to it.
  var base = req.url || req.uri;

  // send request
  request.get(req, then);

  // Response handler: pick the charset from the HTTP header, then parse.
  function then(err, res, body) {
    if (err) return callback(err);

    // HTTP response header
    var headers = res && res.headers;
    var ctype = headers && headers["content-type"];
    var charset = ctype && contenttype2charset(ctype);
    parse(body, charset);
  }

  // True when the charset label denotes UTF-8 (no transcoding needed).
  function is_utf8(charset) {
    return /^utf-?8/i.test(charset);
  }

  // Content of the first <meta http-equiv=NAME>, matched case-insensitively
  // because pages commonly write "Content-Type" / "Refresh".
  function meta_http_equiv($, name) {
    var content;
    $("meta[http-equiv]").each(function(idx, elem) {
      var $meta = $(elem);
      if ($meta.attr("http-equiv").toLowerCase() !== name) return;
      content = $meta.attr("content");
      return false; // first match wins; stop iterating
    });
    return content;
  }

  // Charset declared inside the document. An http-equiv content-type charset
  // takes precedence; otherwise <meta charset="..."> is used.
  function meta_charset($) {
    var declared;
    // <meta charset="Shift_JIS">
    $("meta").each(function(idx, elem) {
      var c = $(elem).attr("charset");
      if (c) declared = c;
    });
    // <meta http-equiv="content-type" content="text/html; charset=euc-jp">
    var ctype = meta_http_equiv($, "content-type");
    return (ctype && contenttype2charset(ctype)) || declared;
  }

  // Resolve a possibly relative href against the requested URL.
  function absolute(href) {
    return (href && base) ? URL.resolve(base, href) : href;
  }

  function parse(body, charset) {
    // body is a Buffer per default; transcode anything that is not UTF-8
    if (charset && !is_utf8(charset)) {
      try {
        // own-property check so labels such as "constructor" never hit Object.prototype
        var cached = Object.prototype.hasOwnProperty.call(iconv_cache, charset);
        var iconv = cached ? iconv_cache[charset] : null;
        if (!iconv) iconv = iconv_cache[charset] = new Iconv(charset, "UTF-8//TRANSLIT//IGNORE");
        body = iconv.convert(body);
      } catch (e) {
        return callback(e);
      }
    }

    // parse html
    var $ = cheerio.load(body);

    if (!charset) {
      // No charset in the HTTP header: look inside the document, and when it
      // declares a non-UTF-8 charset, parse again with transcoding. The second
      // pass has a charset, so this branch is not re-entered.
      charset = meta_charset($);
      if (charset && !is_utf8(charset)) {
        return parse(body, charset);
      }
    }

    var result = {};
    var meta = {};
    var og = {};
    var dc = {};

    // fetch elements
    result.url = req.url;
    result.title = $("title").text();
    result.charset = charset;
    meta.description = $("meta[name='description']").attr("content");
    // refresh is normally an http-equiv pragma; keep the name= lookup as fallback
    meta.refresh = meta_http_equiv($, "refresh") || $("meta[name='refresh']").attr("content");
    meta.keywords = $("meta[name='keywords']").attr("content");
    meta.canonical = $("link[rel='canonical']").attr("href");

    // RSS
    var alternative = [];
    $("link[rel='alternate']").each(function(idx, elem) {
      var $elem = $(elem);
      var hash = {};
      hash.type = $elem.attr("type");
      hash.media = $elem.attr("media");
      hash.href = absolute($elem.attr("href"));
      hash.title = $elem.attr("title");
      alternative.push(trim(hash));
    });
    if (alternative.length) result.alternative = alternative;

    // favicon
    var rel_icon = {
      "icon": true,
      "shortcut icon": true,
      "apple-touch-icon": true,
      "apple-touch-icon-precomposed": true
    };
    var icon = [];
    $("link").each(function(idx, elem) {
      var $elem = $(elem);
      var rel = $elem.attr("rel");
      if (!rel) return;
      // strict compare so inherited members (e.g. rel="constructor") never match
      if (rel_icon[rel.toLowerCase()] !== true) return;
      var hash = {};
      hash.rel = rel;
      hash.type = $elem.attr("type");
      hash.href = absolute($elem.attr("href"));
      icon.push(trim(hash));
    });
    if (icon.length) result.icon = icon;

    // Open Graph: strip the "og:" prefix from the property name
    $("meta[property^='og:']").each(function(idx, elem) {
      var $elem = $(elem);
      var key = $elem.attr("property").slice(3);
      og[key] = $elem.attr("content");
    });

    // Dublin Core: strip the "DC." prefix from the name
    $("meta[name^='DC.']").each(function(idx, elem) {
      var $elem = $(elem);
      var key = $elem.attr("name").slice(3);
      dc[key] = $elem.attr("content");
    });

    // build result, dropping empty values and empty groups
    result = trim(result);
    meta = trim(meta);
    og = trim(og);
    dc = trim(dc);
    if (Object.keys(meta).length) result.meta = meta;
    if (Object.keys(dc).length) result.dc = dc;
    if (Object.keys(og).length) result.og = og;

    // complete
    callback(null, result);
  }
}
/**
 * Remove, in place, every property whose value is null, undefined or "".
 * Other falsy values (0, false) are kept.
 *
 * @param {Object} obj - object to clean; modified and returned
 * @returns {Object} the same object that was passed in
 */
function trim(obj) {
  // Delete directly while enumerating: removing the property currently being
  // visited is safe, and no intermediate key map is needed (a plain-object
  // map silently fails to record a key named "__proto__").
  for (var key in obj) {
    var value = obj[key];
    if (value == null || value === "") delete obj[key];
  }
  return obj;
}
/**
 * No-op function; html_parser falls back to it when no callback is given.
 * Ignores its arguments and returns undefined.
 */
function NOP() {
  return undefined;
}
/**
 * Extract the charset parameter from a Content-Type value such as
 * `text/html; charset=euc-jp`.
 *
 * Surrounding quotes and whitespace are removed, and any parameters that
 * follow the charset are ignored.
 *
 * @param {string} src - Content-Type header or <meta content="..."> value
 * @returns {string|undefined} the charset label, or undefined when none is present
 */
function contenttype2charset(src) {
  if (!src) return;
  // Capture only the label itself: optional opening quote, then everything up
  // to the next whitespace, quote or ";".
  var found = /;\s*charset\s*=\s*["']?\s*([^\s"';]+)/i.exec(src);
  if (!found) return;
  return found[1];
}
/**
 * Make a shallow copy of an object: every enumerable property (inherited
 * ones included) is assigned onto a fresh plain object. Nested values are
 * shared with the source, not duplicated.
 *
 * @param {Object} parent - object to copy; null/undefined yields {}
 * @returns {Object} the new plain object
 */
function clone(parent) {
  var names = [];
  for (var name in parent) names.push(name);

  return names.reduce(function(copy, prop) {
    copy[prop] = parent[prop];
    return copy;
  }, {});
}
// CLI for testing: every command-line argument is treated as a URL to fetch.
// `require.main === module` is true only when this file is run directly;
// the deprecated `module.parent` is also unset when the file is loaded from
// an ES module, which would start the CLI on import.
if (require.main === module) {
  process.argv.slice(2).forEach(function(url) {
    html_parser(url, function(err, res) {
      if (err) return console.error(err);
      console.log(JSON.stringify(res, null, " "));
    });
  });
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment