Skip to content

Instantly share code, notes, and snippets.

@kawanet
Created February 25, 2014 14:31
Show Gist options
  • Select an option

  • Save kawanet/9209901 to your computer and use it in GitHub Desktop.

Select an option

Save kawanet/9209901 to your computer and use it in GitHub Desktop.
Fetch and parse HTML as a JSON representation
/**
* Fetch and parse HTML as a JSON representation
* @license MIT
* @author kawanet
* @example
* var parser = require("./html_parser");
* var url = "https://github.com/kawanet";
* parser(url, function(err, data) {
* if (err) return console.error(err);
* console.log(JSON.stringify(data, null, " "));
* });
*/
var request = require("request");
var cheerio = require("cheerio");
var Iconv = require("iconv").Iconv;
var URL = require("url");
var iconv_cache = {};
module.exports = html_parser;
/**
 * Fetch an HTML document and report its metadata as a plain object.
 *
 * The result may contain: url, title, charset, alternative (list of
 * <link rel="alternate">), icon (list of favicon-like <link>s), meta
 * (description / refresh / keywords / canonical), og (Open Graph) and
 * dc (Dublin Core). Empty values are removed via trim().
 *
 * @param {string|Object} url - URL string, or an options object handed to
 *   request.get(); the object is shallow-copied before being modified.
 * @param {Function} [callback] - called as callback(err) on failure or
 *   callback(null, result) on success. Defaults to a no-op.
 */
function html_parser(url, callback) {
  callback = callback || NOP;

  // request object
  var req;
  if ("string" === typeof url) {
    req = {};
    req.url = url;
  } else {
    req = clone(url);
  }

  // binary mode: unless the caller chose an encoding, ask for the raw body
  // so that the charset conversion below can be applied to it
  if ("undefined" === typeof req.encoding) {
    req.encoding = null;
  }

  // proxy: fall back to the NODE_HTTP_PROXY environment variable
  if (!req.proxy) req.proxy = process.env.NODE_HTTP_PROXY;

  // send request
  request.get(req, then);

  // Response handler: picks the charset from the HTTP header, then parses.
  function then(err, res, body) {
    if (err) return callback(err);

    // HTTP response header
    var headers = res.headers;
    var ctype = headers && headers["content-type"];
    var charset = ctype && contenttype2charset(ctype);
    parse(body, charset);
  }

  // Converts the body to UTF-8 when a non-UTF-8 charset is known, extracts
  // the metadata and invokes the callback. When no charset is given it is
  // looked up in the document's <meta> tags, and parse() re-runs itself once
  // with the charset found there if a conversion is needed.
  function parse(body, charset) {
    // body is a Buffer per default
    var converted;
    if (charset && charset.search(/^utf-?8/i) < 0) {
      var utf8 = "UTF-8//TRANSLIT//IGNORE";
      var iconv = iconv_cache[charset];
      try {
        // converters are cached per source charset
        if (!iconv) iconv = iconv_cache[charset] = new Iconv(charset, utf8);
        body = iconv.convert(body);
        converted = true;
      } catch (e) {
        return callback(e);
      }
    }

    // parse html
    var $ = cheerio.load(body);

    if (!charset) {
      // <meta charset="Shift_JIS">
      $("meta").each(function(idx, meta) {
        var c = $(meta).attr("charset");
        if (c) charset = c;
      });

      // <meta http-equiv="content-type" content="text/html; charset=euc-jp">
      var ctype = $("meta[http-equiv='content-type']").attr("content");
      var declared = ctype && contenttype2charset(ctype);
      // Override only when a charset was actually found here; otherwise the
      // value detected from <meta charset> above must be kept.
      if (declared) charset = declared;

      if (!converted && charset && charset.search(/^utf-?8/i) < 0) {
        return parse(body, charset);
      }
    }

    var result = {};
    var meta = {};
    var og = {};
    var dc = {};

    // fetch elements
    result.url = req.url;
    result.title = $("title").text();
    result.charset = charset;
    meta.description = $("meta[name='description']").attr("content");
    // refresh is normally declared as <meta http-equiv="refresh">; the
    // name= form is still checked first for backward compatibility
    meta.refresh = $("meta[name='refresh']").attr("content") ||
      $("meta[http-equiv='refresh']").attr("content");
    meta.keywords = $("meta[name='keywords']").attr("content");
    meta.canonical = $("link[rel='canonical']").attr("href");

    // RSS
    var alternative = [];
    $("link[rel='alternate']").each(function(idx, elem) {
      var $elem = $(elem);
      var hash = {};
      hash.type = $elem.attr("type");
      hash.media = $elem.attr("media");
      hash.href = $elem.attr("href");
      // make relative links absolute against the requested URL
      if (hash.href) hash.href = URL.resolve(req.url, hash.href);
      hash.title = $elem.attr("title");
      alternative.push(trim(hash));
    });
    if (alternative.length) result.alternative = alternative;

    // favicon
    var rel_icon = {
      "icon": true,
      "shortcut icon": true,
      "apple-touch-icon": true,
      "apple-touch-icon-precomposed": true
    };
    var icon = [];
    $("link").each(function(idx, elem) {
      var $elem = $(elem);
      var rel = $elem.attr("rel");
      if (!rel) return;
      if (!rel_icon[rel.toLowerCase()]) return;
      var hash = {};
      hash.rel = rel;
      hash.type = $elem.attr("type");
      hash.href = $elem.attr("href");
      if (hash.href) hash.href = URL.resolve(req.url, hash.href);
      icon.push(trim(hash));
    });
    if (icon.length) result.icon = icon;

    // Open Graph: strip the "og:" prefix from the property name
    $("meta[property^='og:']").each(function(idx, elem) {
      var $elem = $(elem);
      var key = $elem.attr("property").substr(3);
      og[key] = $elem.attr("content");
    });

    // Dublin Core: strip the "DC." prefix from the name
    $("meta[name^='DC.']").each(function(idx, elem) {
      var $elem = $(elem);
      var key = $elem.attr("name").substr(3);
      dc[key] = $elem.attr("content");
    });

    // build result: drop empty values, attach only non-empty groups
    result = trim(result);
    meta = trim(meta);
    og = trim(og);
    dc = trim(dc);
    if (Object.keys(meta).length) result.meta = meta;
    if (Object.keys(dc).length) result.dc = dc;
    if (Object.keys(og).length) result.og = og;

    // complete
    callback(null, result);
  }
}
/**
 * Remove every property whose value is null, undefined or an empty string.
 * The object is modified in place and returned.
 *
 * @param {Object} obj - object to clean up
 * @returns {Object} the same object
 */
function trim(obj) {
  // collect first, delete afterwards, so the enumeration is never disturbed
  var blanks = [];
  for (var name in obj) {
    var value = obj[name];
    if (value === null || value === undefined || value === "") {
      blanks.push(name);
    }
  }
  blanks.forEach(function(name) {
    delete obj[name];
  });
  return obj;
}
/**
 * No-op function; does nothing and returns undefined.
 */
function NOP() {
  return undefined;
}
/**
 * Extract the charset parameter from a Content-Type style value, e.g.
 * 'text/html; charset="Shift_JIS"' gives "Shift_JIS".
 *
 * Surrounding quotes and whitespace are stripped, and any parameter that
 * follows the charset (after another ";") is ignored.
 *
 * @param {string} src - Content-Type header or <meta content="..."> value
 * @returns {string|undefined} the charset name, or undefined when src is
 *   empty or carries no "; charset=" parameter
 */
function contenttype2charset(src) {
  if (!src) return;
  // The greedy "^.*" prefix selects the last "; charset=" occurrence; the
  // capture stops at the first whitespace, quote or ";" after the value.
  var found = /^.*;\s*charset=[\s'"]*([^\s'";]+)/i.exec(src);
  return found ? found[1] : undefined;
}
/**
 * Make a shallow copy of an object. Every enumerable property (inherited
 * ones included) is copied onto a fresh plain object; nested values are
 * shared with the source, not duplicated.
 *
 * @param {Object} parent - object to copy
 * @returns {Object} a new object carrying the same property values
 */
function clone(parent) {
  var names = [];
  for (var name in parent) {
    names.push(name);
  }
  return names.reduce(function(copy, name) {
    copy[name] = parent[name];
    return copy;
  }, {});
}
// CLI for testing: when this file is run directly with node, every
// command-line argument is treated as a URL and its parsed result is printed
// as JSON (errors go to stderr).
// require.main === module is used instead of the deprecated module.parent,
// which is also undefined when the module is loaded from ESM or the REPL and
// would make the CLI run in those cases.
if (require.main === module) {
  var list = process.argv.slice(2);
  list.forEach(function(url) {
    html_parser(url, function(err, res) {
      if (err) return console.error(err);
      console.log(JSON.stringify(res, null, " "));
    });
  });
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment