Last active
April 18, 2017 06:54
-
-
Save ForbesLindesay/5350809 to your computer and use it in GitHub Desktop.
Parse schema.org style microdata
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| var request = require('request'); | |
| //Decode HTML entities: | |
| var decode = require('ent').decode; | |
| //TODO: plug in a real ISO 8601 parser here (select() uses ISO8601.parse for <time> elements) | |
| var ISO8601 = undefined; | |
| var htmlparser = require('htmlparser'); | |
/**
 * Run an HTML string through htmlparser and hand back what the handler built.
 *
 * @param {string} domstr - HTML source text to parse.
 * @returns {*} The `dom` property of the DefaultHandler once the whole
 *   string has been fed to the parser (treeMap() expects an array of nodes).
 */
function parse(domstr) {
  var domBuilder = new htmlparser.DefaultHandler();
  // parseComplete consumes the entire string in one call.
  new htmlparser.Parser(domBuilder).parseComplete(domstr);
  return domBuilder.dom;
}
/**
 * Map `fn` over a list of nodes and collect the results into one flat list.
 *
 * `fn` is called as `fn(node, children)`, where `children` is a thunk that,
 * when invoked, runs treeMap over `node.children` with the same `fn`. This
 * lets the callback decide whether (and when) to descend.
 *
 * Falsy results are discarded; array results are spliced in one level deep;
 * anything else is appended as a single entry.
 *
 * @param {Array|undefined|null} dom - Nodes to visit; a falsy value yields [].
 * @param {function(Object, function(): Array): *} fn - Per-node callback.
 * @returns {Array} The flattened, filtered results.
 */
function treeMap(dom, fn) {
  if (!dom) {
    return [];
  }

  // Visit every node first, so all callbacks run before any flattening.
  var mapped = dom.map(function (node) {
    var descend = function () {
      return treeMap(node.children, fn);
    };
    return fn(node, descend);
  });

  var flattened = [];
  mapped.forEach(function (value) {
    if (!value) {
      return; // drop falsy results
    }
    if (Array.isArray(value)) {
      flattened = flattened.concat(value);
    } else {
      flattened.push(value);
    }
  });
  return flattened;
}
| var util = require('util'); | |
/**
 * Pretty-print a value to stdout for inspection.
 *
 * @param {*} obj - Value to dump.
 */
function debug(obj) {
  var showHidden = false;
  var depth = 10;
  var colors = true;
  var rendered = util.inspect(obj, showHidden, depth, colors);
  console.log(rendered);
}
// Script entry point: download the page, parse it, extract the microdata
// tree with select() and print the resulting object.
request.get('http://blog.schema.org/', function (error, response, payload) {
  if (error) {
    throw error;
  }

  var status = response.statusCode;
  if (status !== 200) {
    throw new Error('Server responded with ' + status);
  }

  var html = payload.toString('utf8');
  var items = treeMap(parse(html), select);
  debug(toContent(items));
});
/**
 * Fold a list of extracted microdata items into a single object keyed by
 * property name.
 *
 * Each item is `{ itemprop?, itemtype?, content }` as produced by select().
 * - `itemprop` may hold several whitespace-separated names; the content is
 *   stored under every one of them. Items without an itemprop (or with a
 *   blank one) are stored under the key `root`.
 * - When an item has an `itemtype`, it is copied onto `item.content`
 *   (mutating it) so the type travels with the nested object.
 * - The first value for a name is stored as-is; further values for the same
 *   name turn the entry into an array, in document order.
 *
 * @param {Array<Object>} children - Items to merge.
 * @returns {Object} Map from property name to content, or array of contents.
 */
function toContent(children) {
  var hasOwn = Object.prototype.hasOwnProperty;
  var result = {};

  children.forEach(function (item) {
    if (item.itemtype) {
      item.content.itemtype = item.itemtype;
    }

    // Property names are whitespace-separated tokens; tolerate repeated
    // spaces, tabs and newlines without creating empty-string keys.
    var names = (item.itemprop || 'root').split(/\s+/).filter(Boolean);
    if (names.length === 0) {
      names = ['root'];
    }

    names.forEach(function (prop) {
      // Use an own-property test: names come from untrusted HTML and may
      // collide with Object.prototype members ("constructor", "toString").
      // It also keeps falsy first values ('' / undefined) from being lost.
      if (!hasOwn.call(result, prop)) {
        // defineProperty rather than assignment so that a property called
        // "__proto__" becomes plain data instead of replacing the prototype.
        Object.defineProperty(result, prop, {
          value: item.content,
          enumerable: true,
          writable: true,
          configurable: true
        });
      } else if (Array.isArray(result[prop])) {
        result[prop].push(item.content);
      } else {
        result[prop] = [result[prop], item.content];
      }
    });
  });

  return result;
}
/**
 * Turn a `<time datetime>` value into a date.
 *
 * Uses the file-level `ISO8601` parser when one has been wired up. While it
 * is still the `undefined` placeholder, falls back to the built-in Date
 * parser, and returns the raw attribute value unchanged when that cannot
 * parse it (including when the attribute is missing).
 *
 * @param {string|undefined} value - Raw `datetime` attribute.
 * @returns {*} Whatever ISO8601.parse returns, a Date, or the raw value.
 */
function parseDatetime(value) {
  if (typeof ISO8601 !== 'undefined' && ISO8601 && typeof ISO8601.parse === 'function') {
    return ISO8601.parse(value);
  }
  var parsed = new Date(value);
  return isNaN(parsed.getTime()) ? value : parsed;
}

/**
 * treeMap() callback that extracts microdata from a single DOM node.
 *
 * - A node carrying `itemscope` starts a new item: its descendants are
 *   collected via `children()` and folded with toContent().
 * - A node carrying only `itemprop` is a leaf property whose value depends
 *   on the tag: `content` for <meta>, parsed `datetime` for <time>, `href`
 *   for <a>, otherwise `{ html, text }` of the element body.
 * - Any other node is transparent: the results of its children are returned.
 *
 * @param {Object} node - htmlparser node (`type`, `name`, `attribs`, ...).
 * @param {function(): Array} children - Thunk yielding the mapped children.
 * @returns {Object|Array} `{ itemprop?, itemtype?, content }` or child results.
 */
function select(node, children) {
  var attribs = node.attribs;
  var result;

  // itemscope is a boolean attribute: its presence is what matters, so
  // accept `itemscope`, `itemscope=""` and `itemscope="itemscope"` alike.
  if (attribs && attribs.itemscope !== undefined) {
    result = {};
    result.itemtype = attribs.itemtype;
    if (attribs.itemprop) result.itemprop = attribs.itemprop;
    result.content = toContent(children());
    return result;
  }

  if (attribs && attribs.itemprop) {
    result = {};
    result.itemprop = attribs.itemprop;
    var tagName = node.type === 'tag' ? node.name : null;
    if (tagName === 'meta') {
      result.content = attribs.content;
    } else if (tagName === 'time') {
      // Previously this dereferenced the undefined ISO8601 placeholder and
      // threw on every <time itemprop> element.
      result.content = parseDatetime(attribs.datetime);
    } else if (tagName === 'a') {
      result.content = attribs.href;
    } else {
      result.content = { html: innerHTML(node), text: innerText(node) };
    }
    return result;
  }

  return children();
}
// Elements that never have a closing tag; emitting "</br>" etc. would
// produce invalid markup (browsers even read "</br>" as a second <br>).
var VOID_ELEMENTS = [
  'area', 'base', 'basefont', 'br', 'col', 'embed', 'frame', 'hr', 'img',
  'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track',
  'wbr'
];

/**
 * Re-serialize the children of a node to an HTML string.
 *
 * Text is emitted from `raw` (entities are left encoded). Elements are
 * emitted as `<raw>` + serialized children + `</name>`, except childless
 * void elements, which get no closing tag. Comments and directives are
 * emitted in their own syntax instead of being treated as elements.
 *
 * @param {Object} node - htmlparser node whose children should be serialized.
 * @returns {string} Markup of the children; '' when the node has none.
 */
function innerHTML(node) {
  if (!node.children) return '';
  return node
    .children
    .map(function (child) {
      if (child.type === 'text') return child.raw;
      // NOTE(review): assumes htmlparser strips the "!--" / "--" delimiters
      // from a comment's raw text - confirm against the parser version used.
      if (child.type === 'comment') return '<!--' + child.raw + '-->';

      var openTag = '<' + child.raw + '>';
      // Directives (e.g. "!DOCTYPE html") have no closing form.
      if (child.type === 'directive') return openTag;

      var isVoid = !child.children &&
        VOID_ELEMENTS.indexOf(String(child.name).toLowerCase()) !== -1;
      if (isVoid) return openTag;

      return openTag + innerHTML(child) + '</' + child.name + '>';
    })
    .join('');
}
/**
 * Collect the entity-decoded text of a node and all of its descendants.
 *
 * @param {Object} node - htmlparser node.
 * @returns {string} Decoded `raw` for a text node; otherwise the
 *   concatenated text of the children, or '' when there are none.
 */
function innerText(node) {
  if (node.type === 'text') {
    return decode(node.raw);
  }

  var kids = node.children;
  if (!kids) {
    return '';
  }

  var pieces = [];
  kids.forEach(function (kid) {
    pieces.push(innerText(kid));
  });
  return pieces.join('');
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment