Skip to content

Instantly share code, notes, and snippets.

@ForbesLindesay
Last active April 18, 2017 06:54
Show Gist options
  • Select an option

  • Save ForbesLindesay/5350809 to your computer and use it in GitHub Desktop.

Select an option

Save ForbesLindesay/5350809 to your computer and use it in GitHub Desktop.
Parse schema.org style microdata
var request = require('request');
//Decode HTML entities:
var decode = require('ent').decode;
//TODO: no ISO 8601 parser is wired up yet; ISO8601 is only a placeholder and is undefined at runtime
var ISO8601 = undefined;
var htmlparser = require('htmlparser');
/**
 * Parse an HTML string into the DOM structure built by htmlparser.
 *
 * @param {string} domstr - HTML source text.
 * @returns {*} Whatever the DefaultHandler exposes as `dom` once
 *   `parseComplete` has returned.
 */
function parse(domstr) {
  var domBuilder = new htmlparser.DefaultHandler();
  new htmlparser.Parser(domBuilder).parseComplete(domstr);
  return domBuilder.dom;
}
/**
 * Map a callback over a list of tree nodes and flatten the results one level.
 *
 * `fn` is called as `fn(node, children)`, where `children` is a thunk that
 * recurses into `node.children` only when invoked, so the callback decides
 * whether (and when) to descend. Falsy results are dropped; array results are
 * spliced into the output; anything else is appended as a single element.
 *
 * @param {Array|undefined|null} dom - Nodes to visit; a falsy value yields [].
 * @param {function(Object, function(): Array): *} fn - Per-node callback.
 * @returns {Array} Flattened list of the truthy callback results.
 */
function treeMap(dom, fn) {
  if (!dom) {
    return [];
  }

  // Run the callback for every node first, exactly like a plain map().
  var mapped = dom.map(function (node) {
    function descend() {
      return treeMap(node.children, fn);
    }
    return fn(node, descend);
  });

  var flattened = [];
  mapped.forEach(function (entry) {
    if (!entry) {
      return;
    }
    flattened = flattened.concat(Array.isArray(entry) ? entry : [entry]);
  });
  return flattened;
}
var util = require('util');
/**
 * Print a value to stdout via util.inspect, expanding nested objects up to
 * 10 levels deep with ANSI colours and without hidden properties.
 *
 * @param {*} obj - Value to print.
 */
function debug(obj) {
  var rendered = util.inspect(obj, { showHidden: false, depth: 10, colors: true });
  console.log(rendered);
}
// Script entry point: download the schema.org blog, parse it, extract the
// microdata items and print the resulting structure.
request.get('http://blog.schema.org/', function onResponse(err, res, body) {
  if (err) {
    throw err;
  }
  if (res.statusCode !== 200) {
    throw new Error('Server responded with ' + res.statusCode);
  }
  var html = body.toString('utf8');
  var items = treeMap(parse(html), select);
  debug(toContent(items));
});
/**
 * Collapse a flat list of microdata items into one object keyed by property
 * name.
 *
 * Each item is `{ itemprop?, itemtype?, content }`. An item's `itemprop` may
 * hold several whitespace-separated names; the content is stored under every
 * one of them. Items without a usable property name are stored under `root`.
 * The first value for a name is stored as-is; further values for the same name
 * turn the entry into an array.
 *
 * Side effect: when an item has an `itemtype`, it is copied onto
 * `item.content.itemtype` (the content object is mutated in place).
 *
 * @param {Array<Object>} children - Items to merge.
 * @returns {Object} Map of property name to content, or array of contents.
 */
function toContent(children) {
  var hasOwn = Object.prototype.hasOwnProperty;
  var result = {};

  children.forEach(function (item) {
    if (item.itemtype) {
      item.content.itemtype = item.itemtype;
    }

    // itemprop is a set of tokens separated by ASCII whitespace; splitting on
    // a single space would produce empty names for "a  b" and miss tabs and
    // newlines.
    var props = String(item.itemprop || '')
      .split(/[ \t\n\f\r]+/)
      .filter(Boolean);
    if (props.length === 0) {
      props = ['root'];
    }

    props.forEach(function (prop) {
      // Property names come from untrusted HTML, so test for *own* properties:
      // a truthiness check would lose an earlier falsy value ('' or undefined)
      // and would mistake inherited members such as `constructor` for data.
      if (!hasOwn.call(result, prop)) {
        // defineProperty rather than assignment so that a property literally
        // named "__proto__" becomes plain data instead of replacing the
        // prototype.
        Object.defineProperty(result, prop, {
          value: item.content,
          enumerable: true,
          writable: true,
          configurable: true
        });
      } else if (Array.isArray(result[prop])) {
        result[prop].push(item.content);
      } else {
        result[prop] = [result[prop], item.content];
      }
    });
  });

  return result;
}
/**
 * Work out the value of a node that carries an `itemprop` attribute.
 *
 * - `<meta>` -> its `content` attribute
 * - `<time>` -> its `datetime` attribute (or, when that is absent, its text),
 *   passed through `ISO8601.parse` when such a parser is available and
 *   returned as the raw string otherwise
 * - `<a>`    -> its `href` attribute
 * - anything else -> `{ html, text }` built from the node's children
 */
function itempropValue(node) {
  var attribs = node.attribs;
  var isTag = node.type === 'tag';

  if (isTag && node.name === 'meta') {
    return attribs.content;
  }
  if (isTag && node.name === 'time') {
    var datetime = attribs.datetime !== undefined ? attribs.datetime : innerText(node);
    // ISO8601 is currently an undefined placeholder in this file; calling
    // .parse on it would throw, so only use it once a real parser exists.
    var canParse = typeof ISO8601 !== 'undefined' && ISO8601 &&
      typeof ISO8601.parse === 'function';
    return canParse ? ISO8601.parse(datetime) : datetime;
  }
  if (isTag && node.name === 'a') {
    return attribs.href;
  }
  return { html: innerHTML(node), text: innerText(node) };
}

/**
 * treeMap callback that turns microdata-bearing nodes into item records.
 *
 * - A node with `itemscope` becomes `{ itemtype, itemprop?, content }`, where
 *   `content` is the merged result of its descendants.
 * - A node with only `itemprop` becomes `{ itemprop, content }`.
 * - Any other node is transparent: the results of its children are returned.
 *
 * @param {Object} node - DOM node (`type`, `name`, `attribs`, `children`).
 * @param {function(): Array} children - Thunk yielding the already-selected
 *   results for the node's children.
 * @returns {Object|Array} An item record, or the children's results.
 */
function select(node, children) {
  var attribs = node.attribs;
  if (!attribs) {
    return children();
  }

  // itemscope is a boolean attribute: its presence is what matters. Valid
  // markup may spell it `itemscope`, `itemscope=""` or `itemscope="itemscope"`.
  if (Object.prototype.hasOwnProperty.call(attribs, 'itemscope')) {
    var scope = {};
    scope.itemtype = attribs.itemtype;
    if (attribs.itemprop) {
      scope.itemprop = attribs.itemprop;
    }
    scope.content = toContent(children());
    return scope;
  }

  if (attribs.itemprop) {
    return { itemprop: attribs.itemprop, content: itempropValue(node) };
  }

  return children();
}
/**
 * Serialize the children of a node back to an HTML string.
 *
 * Relies on each child exposing `type`, `raw` and (for elements) `name`, with
 * `raw` being the text between the angle brackets for elements and the text
 * between the comment delimiters for comments.
 * NOTE(review): this matches the node shape documented for htmlparser 1.x -
 * confirm against the installed version.
 *
 * @param {Object} node - Node whose children should be serialized.
 * @returns {string} Markup of the children; '' when the node has none.
 */
function innerHTML(node) {
  if (!node.children) return '';

  // Elements that never have content or a closing tag in HTML.
  var VOID_ELEMENT = /^(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)$/i;

  return node
    .children
    .map(function (child) {
      if (child.type === 'text') return child.raw;
      // Comments and directives (e.g. <!DOCTYPE ...>) have no closing tag;
      // treating them as elements produced "</undefined>"-style garbage.
      if (child.type === 'comment') return '<!--' + child.raw + '-->';
      if (child.type === 'directive') return '<' + child.raw + '>';

      var openTag = '<' + child.raw + '>';
      if (VOID_ELEMENT.test(child.name)) return openTag;
      return openTag + innerHTML(child) + '</' + child.name + '>';
    })
    .join('');
}
/**
 * Collect the text beneath a node, with HTML entities decoded.
 *
 * A text node yields its decoded `raw` value; any other node yields the
 * concatenation of its children's text, or '' when it has no children.
 *
 * @param {Object} node - Node to read.
 * @returns {string} Decoded text content.
 */
function innerText(node) {
  if (node.type === 'text') {
    return decode(node.raw);
  }

  var kids = node.children;
  if (!kids) {
    return '';
  }

  var pieces = kids.map(function (child) {
    return innerText(child);
  });
  return pieces.join('');
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment