Handy when YQL won't work due to robots.txt blocking.
Derived from prior art by @zeuxisoo: https://gist.github.com/1016047
Handy when YQL won't work due to robots.txt blocking.
Derived from prior art by @zeuxisoo: https://gist.github.com/1016047
//Example: parse an HTML string with htmlparser and query it with soupselect.
//Alternative loading for Titanium builds without CommonJS require():
//Ti.include('lib/htmlparser.js');
//Ti.include('lib/soupselect.js');
var select = require('soupselect').select,
    htmlparser = require('htmlparser');
//var select = soupselect.select;

//Sample document to parse
var body = '<html><head><title>Test</title></head>'
    + '<body>'
    + '<img src="http://l.yimg.com/mq/i/home/HKGallery.gif" />'
    + '<div id="block">'
    + ' <div class="row">Row 1</div>'
    + ' <div class="row">Row 2</div>'
    + '</div>'
    + '</body></html>';

//The handler builds the DOM and invokes this callback once parsing is done
var handler = new htmlparser.DefaultHandler(function (err, dom) {
    if (err) {
        alert('Error: ' + err);
        return;
    }
    /*
    var img = select(dom, 'img');
    img.forEach(function(img) {
        alert('src: ' + img.attribs.src);
    });
    */
    var rows = select(dom, 'div.row');
    rows.forEach(function (row) {
        //The handler only creates "children" when a child is added, so an
        //empty row has none; skip it instead of throwing a TypeError.
        if (row.children && row.children.length) {
            Ti.API.info(row.children[0].data);
        }
    });
});

var parser = new htmlparser.Parser(handler);
parser.parseComplete(body);
/*********************************************** | |
Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to | |
deal in the Software without restriction, including without limitation the | |
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or | |
sell copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in | |
all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
IN THE SOFTWARE. | |
***********************************************/ | |
/* v1.8.0s */ | |
exports = {}; | |
(function () { | |
//Returns true only when every CommonJS/Node module-scope identifier checked
//below (require, exports, module, __filename, __dirname) exists with the
//expected type. typeof is used so that undeclared identifiers do not throw.
function runningInNode () {
    var hasModuleObjects = typeof require === "function"
        && typeof exports === "object"
        && typeof module === "object";
    var hasFileGlobals = typeof __filename === "string"
        && typeof __dirname === "string";
    return hasModuleObjects && hasFileGlobals;
}
//When the CommonJS module globals are not available, publish the library on
//this.Tautologistics.NodeHtmlParser instead of a module "exports" object.
if (!runningInNode()) { | |
//Create the Tautologistics namespace on first use
if (!this.Tautologistics) | |
this.Tautologistics = {}; | |
else if (this.Tautologistics.NodeHtmlParser) | |
return; //NodeHtmlParser already defined; leave the enclosing function without redefining it
this.Tautologistics.NodeHtmlParser = {}; | |
//From here on, everything assigned to "exports" lands on the namespace object
exports = this.Tautologistics.NodeHtmlParser; | |
} | |
//Types of elements found in the DOM.
//The string values are stored in each element's "type" property.
var ElementType = {
    Text: "text",           //Plain text
    Directive: "directive", //Special tag <!...>
    Comment: "comment",     //Special tag <!--...-->
    Script: "script",       //Special tag <script>...</script>
    Style: "style",         //Special tag <style>...</style>
    Tag: "tag"              //Any tag that isn't special
};
//Creates a parser that pushes parsed elements to "handler".
//  handler - object checked by validateHandler(); it must provide reset, done,
//            writeTag, writeText, writeComment and writeDirective
//  options - optional settings object (used as-is, not copied);
//            includeLocation defaults to false
//Throws whatever validateHandler() throws for an unusable handler.
function Parser (handler, options) {
    this._options = options ? options : { };
    if (this._options.includeLocation == undefined) {
        this._options.includeLocation = false; //Do not track element position in document by default
    }
    this.validateHandler(handler);
    this._handler = handler;
    this.reset();
}
//**"Static"**// | |
//Regular expressions used for cleaning up and parsing (stateless)
Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents
Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on
Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element
//Regular expressions used for parsing (stateful: callers rely on lastIndex)
//Find attributes in a tag. Alternatives, in order: name="value" (groups 1,2),
//name='value' (groups 3,4), name=value (groups 5,6), bare name (group 7)
Parser._reAttrib =
    /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
Parser._reTags = /[\<\>]/g; //Find tag markers
//**Public**// | |
//Methods// | |
//Parses a complete HTML document in one call and pushes it to the handler:
//resets the parser, feeds "data" as a single chunk, then signals completion.
Parser.prototype.parseComplete = function Parser$parseComplete (data) {
    this.reset();
    this.parseChunk(data);
    this.done();
};
//Parses a piece of an HTML document.
//Calling this after done() (without a reset()) is reported through
//handleError(), and the chunk is then ignored rather than being parsed into
//a document that has already been delivered to the handler.
Parser.prototype.parseChunk = function Parser$parseChunk (data) {
    if (this._done) {
        this.handleError(new Error("Attempted to parse chunk after parsing already done"));
        return;
    }
    this._buffer += data; //FIXME: this can be a bottleneck
    this.parseTags();
};
//Tells the parser that the HTML being parsed is complete.
//Any unparsed remainder of the buffer becomes one final element, pending
//elements are written to the handler, and the handler's done() is called.
//Calling done() again before a reset() does nothing.
Parser.prototype.done = function Parser$done () {
    if (this._done)
        return;
    this._done = true;

    //Push any unparsed text into a final element in the element list
    if (this._buffer.length) {
        var rawData = this._buffer;
        var state = this._parseState;
        this._buffer = "";
        var element = {
            raw: rawData,
            //Text keeps its whitespace; anything else is trimmed
            data: (state == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, ""),
            type: state
        };
        if (state == ElementType.Tag || state == ElementType.Script || state == ElementType.Style) {
            element.name = this.parseTagName(element.data);
        }
        this.parseAttribs(element);
        this._elements.push(element);
    }

    this.writeHandler();
    this._handler.done();
};
//Resets the parser to a blank state, ready to parse a new HTML document,
//and resets the attached handler as well.
Parser.prototype.reset = function Parser$reset () {
    //Input buffer and completion flag
    this._buffer = "";
    this._done = false;
    //Parsed-but-unwritten elements and scan positions
    this._elements = [];
    this._elementsCurrent = 0;
    this._current = 0;
    this._next = 0;
    //Position bookkeeping used by getLocation()
    this._location = {
        row: 0,
        col: 0,
        charOffset: 0,
        inBuffer: 0
    };
    //Scanner state
    this._parseState = ElementType.Text;
    this._prevTagSep = '';
    this._tagStack = [];
    this._handler.reset();
};
//**Private**// | |
//Properties// | |
//Prototype-level defaults; reset() assigns the per-instance values
Parser.prototype._options = null; //Parser options for how to behave
Parser.prototype._handler = null; //Handler for parsed elements
Parser.prototype._buffer = null; //Buffer of unparsed data
Parser.prototype._done = false; //Flag indicating whether parsing is done
Parser.prototype._elements = null; //Array of parsed elements
Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed
Parser.prototype._current = 0; //Position in data that has already been parsed
Parser.prototype._next = 0; //Position in data of the next tag marker (<>)
Parser.prototype._location = null; //Position tracking for elements in a stream
Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed
Parser.prototype._prevTagSep = ''; //Previous tag marker found
//Stack of element types previously encountered; keeps track of when
//parsing occurs inside a script/comment/style tag
Parser.prototype._tagStack = null;
//Methods// | |
//Takes an array of elements and parses any found attributes.
//Only tag, script and style elements are passed to parseAttribs(); the same
//array is returned so calls can be chained.
Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
    for (var idx = 0, idxEnd = elements.length; idx < idxEnd; idx++) {
        var element = elements[idx];
        //ElementType.Style, capital S: the lowercase key does not exist, which
        //made the old comparison skip style elements
        if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.Style)
            this.parseAttribs(element);
    }
    return(elements);
};
//Takes an element and adds an "attribs" property for any element attributes found.
//Elements that are not tag/script/style, or that carry nothing after the tag
//name, are left untouched. A bare attribute such as "disabled" is stored with
//its own name as the value.
Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
    //Only parse attributes for tags
    if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag)
        return;

    //Everything after the first whitespace-delimited token is attribute text
    var tagName = element.data.split(Parser._reWhitespace, 1)[0];
    var attribRaw = element.data.substring(tagName.length);
    if (attribRaw.length < 1)
        return;

    var match;
    Parser._reAttrib.lastIndex = 0; //The regex is global and shared, so always restart the scan
    while ((match = Parser._reAttrib.exec(attribRaw))) {
        if (element.attribs == undefined)
            element.attribs = {};

        if (typeof match[1] == "string" && match[1].length) {
            element.attribs[match[1]] = match[2]; //name="value"
        } else if (typeof match[3] == "string" && match[3].length) {
            element.attribs[match[3]] = match[4]; //name='value'
        } else if (typeof match[5] == "string" && match[5].length) {
            element.attribs[match[5]] = match[6]; //name=value
        } else if (typeof match[7] == "string" && match[7].length) {
            element.attribs[match[7]] = match[7]; //bare name
        }
    }
};
//Extracts the base tag name from the data value of an element.
//Returns "" for null/empty data or when no name is found; closing tags keep
//a leading "/" (for example "/div").
Parser.prototype.parseTagName = function Parser$parseTagName (data) {
    if (data == null || data == "")
        return("");
    var match = Parser._reTagName.exec(data);
    if (!match)
        return("");
    var closingPrefix = match[1] ? "/" : "";
    return(closingPrefix + match[2]);
};
//Parses through the buffered HTML text and appends the found elements to
//this._elements, then hands them to the handler via writeHandler().
//The buffer is split at every "<" and ">" marker; the text before each marker
//becomes one element whose type is the state set by the previous marker.
//Text left after the last marker stays in the buffer for the next chunk.
//I admit, this function is rather large but splitting up had an noticeable impact on speed
Parser.prototype.parseTags = function Parser$parseTags () {
    var bufferEnd = this._buffer.length - 1;
    //_reTags is global: each successful test() advances lastIndex to just past the marker
    while (Parser._reTags.test(this._buffer)) {
        this._next = Parser._reTags.lastIndex - 1;
        var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
        var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse

        //A new element to eventually be appended to the element list
        var element = {
            raw: rawData
            , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
            , type: this._parseState
        };

        var elementName = this.parseTagName(element.data);

        //This section inspects the current tag stack and modifies the current
        //element if we're actually parsing a special area (script/comment/style tag)
        if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
            if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
                if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
                    this._tagStack.pop();
                else { //Not a closing script tag
                    if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
                        //All data from here to script close is now a text element
                        element.type = ElementType.Text;
                        //If the previous element is text, append the current text to it
                        if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
                            var prevElement = this._elements[this._elements.length - 1];
                            prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
                            element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        }
                    }
                }
            }
            else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
                if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
                    this._tagStack.pop();
                else {
                    if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
                        //All data from here to style close is now a text element
                        element.type = ElementType.Text;
                        //If the previous element is text, append the current text to it
                        if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
                            var prevElement = this._elements[this._elements.length - 1];
                            if (element.raw != "") {
                                prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
                                element.raw = element.data = ""; //This causes the current element to not be added to the element list
                            } else { //Element is empty, so just append the last tag marker found
                                prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
                            }
                        } else { //The previous element was not text
                            if (element.raw != "") {
                                element.raw = element.data = element.raw;
                            }
                        }
                    }
                }
            }
            else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
                var rawLen = element.raw.length;
                if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
                    //Actually, we're no longer in a comment, so pop it off the stack
                    this._tagStack.pop();
                    //If the previous element is a comment, append the current text to it
                    if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
                        var prevElement = this._elements[this._elements.length - 1];
                        prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
                        element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        element.type = ElementType.Text;
                    }
                    else //Previous element not a comment
                        element.type = ElementType.Comment; //Change the current element's type to a comment
                }
                else { //Still in a comment tag
                    element.type = ElementType.Comment;
                    //If the previous element is a comment, append the current text to it
                    if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
                        var prevElement = this._elements[this._elements.length - 1];
                        prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
                        element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        element.type = ElementType.Text;
                    }
                    else
                        element.raw = element.data = element.raw + tagSep;
                }
            }
        }

        //Processing of non-special tags
        if (element.type == ElementType.Tag) {
            element.name = elementName;

            if (element.raw.indexOf("!--") == 0) { //This tag is really comment
                element.type = ElementType.Comment;
                delete element["name"];
                var rawLen = element.raw.length;
                //Check if the comment is terminated in the current element
                if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
                    element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
                else { //It's not so push the comment onto the tag stack
                    element.raw += tagSep;
                    this._tagStack.push(ElementType.Comment);
                }
            }
            else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
                element.type = ElementType.Directive;
                //TODO: what about CDATA?
            }
            else if (element.name == "script") {
                element.type = ElementType.Script;
                //Special tag, push onto the tag stack if not terminated
                if (element.data.charAt(element.data.length - 1) != "/")
                    this._tagStack.push(ElementType.Script);
            }
            else if (element.name == "/script")
                element.type = ElementType.Script;
            else if (element.name == "style") {
                element.type = ElementType.Style;
                //Special tag, push onto the tag stack if not terminated
                if (element.data.charAt(element.data.length - 1) != "/")
                    this._tagStack.push(ElementType.Style);
            }
            else if (element.name == "/style")
                element.type = ElementType.Style;
            //Closing tags carry only their name as data
            if (element.name && element.name.charAt(0) == "/")
                element.data = element.name;
        }

        //Add all tags and non-empty text elements to the element list
        if (element.raw != "" || element.type != ElementType.Text) {
            if (this._options.includeLocation && !element.location) {
                element.location = this.getLocation(element.type == ElementType.Tag);
            }
            this.parseAttribs(element);
            this._elements.push(element);
            //If tag self-terminates, add an explicit, separate closing tag
            if (
                element.type != ElementType.Text
                &&
                element.type != ElementType.Comment
                &&
                element.type != ElementType.Directive
                &&
                element.data.charAt(element.data.length - 1) == "/"
                )
                this._elements.push({
                    raw: "/" + element.name
                    , data: "/" + element.name
                    , name: "/" + element.name
                    , type: element.type
                });
        }
        //A "<" means the next chunk is tag content; a ">" means it is text
        this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text;
        this._current = this._next + 1;
        this._prevTagSep = tagSep;
    }

    if (this._options.includeLocation) {
        //Account for everything consumed from this buffer before it is trimmed
        this.getLocation();
        this._location.row += this._location.inBuffer;
        this._location.inBuffer = 0;
        this._location.charOffset = 0;
    }
    //Keep only the unparsed remainder for the next chunk
    this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : "";
    this._current = 0;

    this.writeHandler();
};
Parser.prototype.getLocation = function Parser$getLocation (startTag) { | |
var c, | |
l = this._location, | |
end = this._current - (startTag ? 1 : 0), | |
chunk = startTag && l.charOffset == 0 && this._current == 0; | |
for (; l.charOffset < end; l.charOffset++) { | |
c = this._buffer.charAt(l.charOffset); | |
if (c == '\n') { | |
l.inBuffer++; | |
l.col = 0; | |
} else if (c != '\r') { | |
l.col++; | |
} | |
} | |
return { | |
line: l.row + l.inBuffer + 1 | |
, col: l.col + (chunk ? 0: 1) | |
}; | |
} | |
//Checks the handler to make sure it is an object with the right "interface".
//Throws an Error naming the first problem found: the handler is not an object
//(null included, since typeof null is "object"), or one of the required
//methods is missing or not a function.
Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
    if (handler === null || (typeof handler) != "object")
        throw new Error("Handler is not an object");

    //Checked in this order so the reported method matches the first one missing
    var required = ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"];
    for (var i = 0; i < required.length; i++) {
        if ((typeof handler[required[i]]) != "function")
            throw new Error("Handler method '" + required[i] + "' is invalid");
    }
};
//Writes parsed elements out to the handler, emptying this._elements.
//While the parser is inside a script/style/comment section (the tag stack is
//not empty) nothing is written unless forceFlush is truthy.
Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
    forceFlush = !!forceFlush;
    if (this._tagStack.length && !forceFlush)
        return;

    var handler = this._handler;
    while (this._elements.length) {
        var element = this._elements.shift();
        switch (element.type) {
            case ElementType.Comment:
                handler.writeComment(element);
                break;
            case ElementType.Directive:
                handler.writeDirective(element);
                break;
            case ElementType.Text:
                handler.writeText(element);
                break;
            default: //tag, script and style all go through writeTag
                handler.writeTag(element);
                break;
        }
    }
};
//Reports a parser error: it is passed to the handler's error() method when
//the handler has one, otherwise it is thrown.
Parser.prototype.handleError = function Parser$handleError (error) {
    if ((typeof this._handler.error) != "function")
        throw error;
    this._handler.error(error);
};
//TODO: make this a truly streamable handler
//Handler that turns a parsed RSS/Atom document into a feed object (see its
//done() method). It builds on DefaultHandler with whitespace-only text
//ignored, verbose output off and empty-tag enforcement off.
function RssHandler (callback) {
    var feedOptions = { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false };
    RssHandler.super_.call(this, callback, feedOptions);
}
inherits(RssHandler, DefaultHandler);
//Converts the parsed DOM into a feed object and then completes as the base
//handler does. When a top-level <rss> or <feed> element is found, this.dom is
//replaced with:
//  { type: "rss"|"atom", id, title, link, description, updated, author, items: [...] }
//where each item has id, title, link, description and pubDate.
//Every field is best-effort: a field whose source element is missing is left
//unset. If no feed root is found, this.dom is left as parsed.
RssHandler.prototype.done = function RssHandler$done () {
    //First element named tagName among "nodes" (descendants too when recurse is true)
    function first (tagName, nodes, recurse) {
        return DomUtils.getElementsByTagName(tagName, nodes, !!recurse)[0];
    }
    //Runs read() and stores its result on target[key]; a missing element makes
    //read() throw, in which case the field is deliberately left unset
    function assign (target, key, read) {
        try {
            target[key] = read();
        } catch (ex) { }
    }
    //target[key] = data of the first child node of the first tagName element
    function setText (target, key, tagName, nodes, recurse) {
        assign(target, key, function () { return first(tagName, nodes, recurse).children[0].data; });
    }
    //Same as setText, but wrapped in a Date
    function setDate (target, key, tagName, nodes) {
        assign(target, key, function () { return new Date(first(tagName, nodes, false).children[0].data); });
    }
    //target[key] = href attribute of the first tagName element
    function setHref (target, key, tagName, nodes) {
        assign(target, key, function () { return first(tagName, nodes, false).attribs.href; });
    }

    var feed = { };
    var feedRoot;

    //Only top-level elements are considered for the feed root
    var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
    if (found.length) {
        feedRoot = found[0];
    }
    if (feedRoot) {
        if (feedRoot.name == "rss") {
            feed.type = "rss";
            //The first child of <rss> is taken to be <channel/>. An <rss> with no
            //children yields an empty feed instead of an uncaught TypeError.
            var channel = (feedRoot.children && feedRoot.children[0]) || { };
            feed.id = "";
            setText(feed, "title", "title", channel.children);
            setText(feed, "link", "link", channel.children);
            setText(feed, "description", "description", channel.children);
            setDate(feed, "updated", "lastBuildDate", channel.children);
            setText(feed, "author", "managingEditor", channel.children);
            feed.items = [];
            DomUtils.getElementsByTagName("item", channel.children).forEach(function (item) {
                var entry = {};
                setText(entry, "id", "guid", item.children);
                setText(entry, "title", "title", item.children);
                setText(entry, "link", "link", item.children);
                setText(entry, "description", "description", item.children);
                setDate(entry, "pubDate", "pubDate", item.children);
                feed.items.push(entry);
            });
        } else {
            feed.type = "atom";
            setText(feed, "id", "id", feedRoot.children);
            setText(feed, "title", "title", feedRoot.children);
            setHref(feed, "link", "link", feedRoot.children);
            setText(feed, "description", "subtitle", feedRoot.children);
            setDate(feed, "updated", "updated", feedRoot.children);
            setText(feed, "author", "email", feedRoot.children, true); //email may be nested, so search descendants
            feed.items = [];
            DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item) {
                var entry = {};
                setText(entry, "id", "id", item.children);
                setText(entry, "title", "title", item.children);
                setHref(entry, "link", "link", item.children);
                setText(entry, "description", "summary", item.children);
                setDate(entry, "pubDate", "updated", item.children);
                feed.items.push(entry);
            });
        }

        this.dom = feed;
    }
    RssHandler.super_.prototype.done.call(this);
};
/////////////////////////////////////////////////// | |
//Handler that assembles parsed elements into a hierarchical DOM.
//  callback - optional function (error, dom) invoked when parsing finishes or
//             fails; anything that is not a function is ignored
//  options  - optional settings object (used as-is, not copied); unset
//             settings get these defaults:
//               ignoreWhitespace: false, verbose: true, enforceEmptyTags: true
function DefaultHandler (callback, options) {
    this.reset();
    this._options = options ? options : { };
    var settings = this._options;
    if (settings.ignoreWhitespace == undefined)
        settings.ignoreWhitespace = false; //Keep whitespace-only text nodes
    if (settings.verbose == undefined)
        settings.verbose = true; //Keep data property for tags and raw property for all
    if (settings.enforceEmptyTags == undefined)
        settings.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
    if ((typeof callback) == "function")
        this._callback = callback;
}
//**"Static"**// | |
//HTML Tags that shouldn't contain child nodes (keys are lowercase tag names)
DefaultHandler._emptyTags = {
    area: 1,
    base: 1,
    basefont: 1,
    br: 1,
    col: 1,
    frame: 1,
    hr: 1,
    img: 1,
    input: 1,
    isindex: 1,
    link: 1,
    meta: 1,
    param: 1,
    embed: 1
};
//Regex to detect whitespace only text nodes
DefaultHandler.reWhitespace = /^\s*$/;
//**Public**// | |
//Properties// | |
//reset() replaces this with a fresh array of top-level elements
DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML
//Methods// | |
//Resets the handler back to starting state: an empty DOM, not done, and an
//empty parent stack.
DefaultHandler.prototype.reset = function DefaultHandler$reset() {
    this.dom = [];
    this._done = false;

    var parents = [];
    //Convenience accessor: the innermost open element, or null when the stack is empty
    parents.last = function DefaultHandler$_tagStack$last () {
        return(this.length ? this[this.length - 1] : null);
    };
    this._tagStack = parents;
};
//Signals the handler that parsing is done: marks it finished and reports the
//DOM through handleCallback() with no error.
DefaultHandler.prototype.done = function DefaultHandler$done () {
    this._done = true;
    this.handleCallback(null);
};
//Receives a tag/script/style element from the parser and adds it to the DOM
DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
    this.handleElement(element);
};
//Receives a text element from the parser and adds it to the DOM, except that
//whitespace-only text is dropped when the ignoreWhitespace option is on
DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
    if (this._options.ignoreWhitespace && DefaultHandler.reWhitespace.test(element.data))
        return;
    this.handleElement(element);
};
//Receives a comment element from the parser and adds it to the DOM
DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
    this.handleElement(element);
};
//Receives a directive element (<!...> or <?...>) from the parser and adds it to the DOM
DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
    this.handleElement(element);
};
//Receives a parser error and reports it through handleCallback()
DefaultHandler.prototype.error = function DefaultHandler$error (error) {
    this.handleCallback(error);
};
//**Private**// | |
//Properties// | |
//Prototype-level defaults; the constructor and reset() assign per-instance values
DefaultHandler.prototype._options = null; //Handler options for how to behave
DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done
DefaultHandler.prototype._done = false; //Flag indicating whether the handler has been notified that parsing completed
DefaultHandler.prototype._tagStack = null; //List of parents of the element currently being processed
//Methods// | |
//Reports the outcome of parsing.
//With a callback registered it is called as callback(error, dom). Without
//one, an error is thrown and a successful result is silently dropped.
DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
    if ((typeof this._callback) != "function") {
        if (error)
            throw error;
        return;
    }
    this._callback(error, this.dom);
};
//Tells whether the element is one of the tags that cannot have children.
//The name is compared case-insensitively and a leading "/" (closing tag) is
//ignored. The result is always falsy when the enforceEmptyTags option is off.
DefaultHandler.prototype.isEmptyTag = function DefaultHandler$isEmptyTag (element) {
    var name = element.name.toLowerCase();
    if (name.charAt(0) == '/') {
        name = name.substring(1);
    }
    //Own-property check: a plain lookup would also find names inherited from
    //Object.prototype, which made <constructor> count as an empty tag
    var listed = Object.prototype.hasOwnProperty.call(DefaultHandler._emptyTags, name)
        && !!DefaultHandler._emptyTags[name];
    return this._options.enforceEmptyTags && listed;
};
//Places one parsed element into the DOM being built.
//Opening tags become children of the innermost open element (or top-level
//entries of this.dom) and, unless they are empty tags, become the new
//innermost open element. Closing tags close the nearest matching open element
//and everything nested inside it. Text, comments and directives are appended
//to the innermost open element, or to this.dom when nothing is open.
//With the verbose option off, "raw" (and "data" for tags) is removed first.
DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
    if (this._done)
        this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
    if (!this._options.verbose) {
        // element.raw = null; //FIXME: Not clean
        //FIXME: Serious performance problem using delete
        delete element.raw;
        if (element.type == "tag" || element.type == "script" || element.type == "style")
            delete element.data;
    }
    if (!this._tagStack.last()) { //There are no parent elements
        //If the element can be a container, add it to the tag stack and the top level list
        if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
            if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
                this.dom.push(element);
                if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children
                    this._tagStack.push(element);
                }
            }
        }
        else //Otherwise just add to the top level list
            this.dom.push(element);
    }
    else { //There are parent elements
        //If the element can be a container, add it as a child of the element
        //on top of the tag stack and then add it to the tag stack
        if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
            if (element.name.charAt(0) == "/") {
                //This is a closing tag, scan the tagStack to find the matching opening tag
                //and pop the stack up to the opening tag's parent
                var baseName = element.name.substring(1);
                if (!this.isEmptyTag(element)) {
                    var pos = this._tagStack.length - 1;
                    //The post-decrement leaves pos one below the matching entry,
                    //or at -1 when the match is entry 0 or there is no match
                    while (pos > -1 && this._tagStack[pos--].name != baseName) { }
                    if (pos > -1 || this._tagStack[0].name == baseName)
                        while (pos < this._tagStack.length - 1)
                            this._tagStack.pop();
                }
            }
            else { //This is not a closing tag
                if (!this._tagStack.last().children)
                    this._tagStack.last().children = [];
                this._tagStack.last().children.push(element);
                if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
                    this._tagStack.push(element);
            }
        }
        else { //This is not a container element
            if (!this._tagStack.last().children)
                this._tagStack.last().children = [];
            this._tagStack.last().children.push(element);
        }
    }
};
//Helpers for searching the element tree produced by the handlers
var DomUtils = {
    //Returns true when "element" passes every test in "options".
    //Each value in "options" must be a predicate function. Special keys:
    //  tag_name     - tested against element.name (tag/script/style only)
    //  tag_type     - tested against element.type
    //  tag_contains - tested against element.data (text/comment/directive only)
    //Any other key is tested against the attribute of that name; elements
    //without attributes fail.
    testElement: function DomUtils$testElement (options, element) {
        if (!element) {
            return false;
        }

        for (var key in options) {
            if (key == "tag_name") {
                if (element.type != "tag" && element.type != "script" && element.type != "style") {
                    return false;
                }
                if (!options["tag_name"](element.name)) {
                    return false;
                }
            } else if (key == "tag_type") {
                if (!options["tag_type"](element.type)) {
                    return false;
                }
            } else if (key == "tag_contains") {
                if (element.type != "text" && element.type != "comment" && element.type != "directive") {
                    return false;
                }
                if (!options["tag_contains"](element.data)) {
                    return false;
                }
            } else {
                if (!element.attribs || !options[key](element.attribs[key])) {
                    return false;
                }
            }
        }

        return true;
    },

    //Collects, in document order, the elements matching "options".
    //  options        - tests as for testElement(); a value that is not a function
    //                   is compared with == against the tested value. The
    //                   caller's object is not modified.
    //  currentElement - an element or an array of elements to search
    //  recurse        - search descendants too (default true)
    //  limit          - stop once this many matches are found; unparsable or
    //                   negative means no limit
    //Returns a new array (empty when currentElement is missing).
    getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
        recurse = (recurse === undefined || recurse === null) || !!recurse;
        var parsedLimit = parseInt(limit, 10);
        limit = isNaN(parsedLimit) ? -1 : parsedLimit;

        var found = [];
        if (!currentElement) {
            return(found);
        }

        function getTest (checkVal) {
            return(function (value) { return(value == checkVal); });
        }

        //Normalise into a private copy so the caller's options stay untouched
        var tests = {};
        for (var key in options) {
            tests[key] = ((typeof options[key]) == "function") ? options[key] : getTest(options[key]);
        }

        function limitReached () {
            return(limit >= 0 && found.length >= limit);
        }

        //Pre-order walk that appends to the shared "found" list, so the limit
        //applies to the overall result and cannot be exceeded by a subtree
        function collect (node) {
            if (!node) {
                return;
            }
            if (DomUtils.testElement(tests, node)) {
                found.push(node);
            }
            if (limitReached()) {
                return;
            }

            var elementList;
            if (recurse && node.children) {
                elementList = node.children;
            } else if (node instanceof Array) {
                elementList = node;
            } else {
                return;
            }

            for (var i = 0; i < elementList.length; i++) {
                collect(elementList[i]);
                if (limitReached()) {
                    break;
                }
            }
        }

        collect(currentElement);
        return(found);
    },

    //Returns the first element whose "id" attribute equals id, or null
    getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
        var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
        return(result.length ? result[0] : null);
    },

    //Returns the tag/script/style elements whose name matches (value or predicate)
    getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
    },

    //Returns the elements whose type matches (value or predicate)
    getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
    }
};
//Makes ctor inherit from superCtor without running superCtor:
//ctor.prototype becomes a fresh object whose prototype is superCtor.prototype,
//and the parent constructor is exposed as ctor.super_.
function inherits (ctor, superCtor) {
    //Intermediate constructor so that superCtor itself is never invoked
    function Bridge () { }
    Bridge.prototype = superCtor.prototype;

    ctor.super_ = superCtor;
    ctor.prototype = new Bridge();
    ctor.prototype.constructor = ctor;
}
// Public API of the htmlparser module.
exports.Parser = Parser;
exports.DefaultHandler = DefaultHandler;
exports.RssHandler = RssHandler;
exports.ElementType = ElementType;
exports.DomUtils = DomUtils;
})(); | |
// Also publish the API under the global name `htmlparser`. Presumably this serves
// scripts that load the file without a module loader (e.g. Ti.include) -- confirm
// before removing. NOTE(review): this is an implicit global and throws in strict mode.
htmlparser = exports;
/**
Port of Simon Willison's Soup Select http://code.google.com/p/soupselect/
MIT licensed http://www.opensource.org/licenses/mit-license.php
*/
// Bind `exports` to the loader's export object when there is one. A bare
// `exports = {}` detaches this file from `module.exports`, so a loader that returns
// `module.exports` from require() hands back an object without `select`.
// When no module loader is present (e.g. Ti.include) start from a fresh object.
var exports = (typeof module !== 'undefined' && module.exports) ? module.exports : {};
// Alternative when htmlparser was loaded as a global instead of through require():
//var domUtils = htmlparser.DomUtils;
var domUtils = require('htmlparser').DomUtils;
// A bare tag selector: one or more lowercase letters or digits, e.g. "div" or "h1".
var tagRe = /^[a-z0-9]+$/;

/*
Anatomy of the attribute selector pattern, e.g. a[href^="http"]:

/^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
  \---/   \---/\-------------/    \-------/
    |       |         |               |
    |       |         |           The value
    |       |  ~,|,^,$,* or =
    |   Attribute
   Tag
*/
var attrSelectRe = /^(\w+)?\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/;
/**
 Builds the predicate used to test an attribute value for one attribute selector.

 @param {string} operator  '=', '~', '^', '$', '*' or '|'. An empty or missing
     operator produces a plain existence test, as in [disabled].
 @param {string} value  Right-hand side of the selector. Anything that is not a
     string is treated as ''.
 @returns {Function|undefined} function(test_value) returning a boolean, where
     test_value is the attribute value being checked (undefined when the attribute
     is absent). Undefined is returned for an operator that is not listed above.
*/
function makeValueChecker(operator, value) {
    value = typeof(value) === 'string' ? value : '';

    if (!operator) {
        // [attr]: the attribute only has to be present; an empty value still counts.
        return function ( test_value ) {
            return test_value !== undefined && test_value !== null;
        };
    }

    // In CSS, ~=, ^=, $= and *= with an empty value never match anything.
    var hasValue = value.length > 0;

    var checkers = {
        // attribute is exactly value
        '=': function ( test_value ) {
            return test_value === value;
        },
        // attribute includes value as one of a set of space separated tokens
        '~': function ( test_value ) {
            if (!hasValue || !test_value) { return false; }
            return test_value.split(/\s+/).indexOf(value) !== -1;
        },
        // attribute starts with value
        '^': function ( test_value ) {
            if (!hasValue || !test_value) { return false; }
            return test_value.slice(0, value.length) === value;
        },
        // attribute ends with value
        '$': function ( test_value ) {
            if (!hasValue || !test_value) { return false; }
            return test_value.slice(-value.length) === value;
        },
        // attribute contains value
        '*': function ( test_value ) {
            if (!hasValue || !test_value) { return false; }
            return test_value.indexOf(value) !== -1;
        },
        // attribute is either exactly value or starts with value-
        '|': function ( test_value ) {
            if (!test_value) { return false; }
            return test_value === value ||
                test_value.slice(0, value.length + 1) === value + '-';
        }
    };

    // Own-property lookup so that names inherited from Object.prototype
    // are never mistaken for operators.
    return Object.prototype.hasOwnProperty.call(checkers, operator) ? checkers[operator] : undefined;
}
/** | |
Takes a dom tree or part of one from htmlparser and applies | |
the provided selector against. The returned value is also | |
a valid dom tree, so can be passed by into | |
htmlparser.DomUtil.* calls | |
*/ | |
exports.select = function(dom, selector) { | |
var currentContext = [dom]; | |
var found, tag, options; | |
var tokens = selector.split(/\s+/); | |
for ( var i = 0; i < tokens.length; i++ ) { | |
// Attribute selectors | |
var match = attrSelectRe.exec(tokens[i]); | |
if ( match ) { | |
var attribute = match[2], operator = match[3], value = match[4]; | |
tag = match[1]; | |
options = {}; | |
options[attribute] = makeValueChecker(operator, value); | |
found = []; | |
for (var j = 0; j < currentContext.length; j++ ) { | |
found = found.concat(domUtils.getElements(options, currentContext[j])); | |
}; | |
if ( tag ) { | |
// Filter to only those matching the tag name | |
found = domUtils.getElements({ 'tag_name': tag }, found, false); | |
} | |
currentContext = found; | |
} | |
// ID selector | |
else if ( tokens[i].indexOf('#') !== -1 ) { | |
found = []; | |
var id_selector = tokens[i].split('#', 2)[1]; | |
// need to stop on the first id found (in bad HTML)... | |
var el = null; | |
for ( var k = 0; k < currentContext.length; k++ ) { | |
// the document has no child elements but tags do so we search children to avoid | |
// returning the current element via a false positive | |
if ( typeof currentContext[k].children !== 'undefined' ) { | |
el = domUtils.getElementById(id_selector, currentContext[k].children); | |
} else { | |
el = domUtils.getElementById(id_selector, currentContext[k]); | |
} | |
if ( el ) { | |
found.push(el); | |
break; | |
} | |
} | |
if (!found[0]) { | |
currentContext = []; | |
break; | |
} | |
currentContext = found; | |
} | |
// Class selector | |
else if ( tokens[i].indexOf('.') !== -1 ) { | |
var parts = tokens[i].split('.'); | |
tag = parts[0]; | |
options = {}; | |
options['class'] = function (value) { | |
if (!value) return false; | |
var classes = value.split(/\s+/); | |
for (var i = 1, len = parts.length; i < len; i++) { | |
if (!~classes.indexOf(parts[i])) return false; | |
} | |
return true; | |
}; | |
found = []; | |
for ( var l = 0; l < currentContext.length; l++ ) { | |
var context = currentContext[l]; | |
if ( tag.length > 0 ) { | |
context = domUtils.getElementsByTagName(tag, context); | |
// don't recurse in the case we have a tag or we get children we might not want | |
found = found.concat(domUtils.getElements(options, context, false)); | |
} else { | |
found = found.concat(domUtils.getElements(options, context)); | |
} | |
}; | |
currentContext = found; | |
} | |
// Star selector | |
else if ( tokens[i] === '*' ) { | |
// nothing to do right? | |
} | |
// Tag selector | |
else { | |
if (!tagRe.test(tokens[i])) { | |
currentContext = []; | |
break; | |
} | |
found = []; | |
for ( var m = 0; m < currentContext.length; m++ ) { | |
// htmlparsers document itself has no child property - only nodes do... | |
if ( typeof currentContext[m].children !== 'undefined' ) { | |
found = found.concat(domUtils.getElementsByTagName(tokens[i], currentContext[m].children)); | |
} else if (i === 0) { | |
found = found.concat(domUtils.getElementsByTagName(tokens[i], currentContext[m])); | |
} | |
}; | |
currentContext = found; | |
} | |
}; | |
return currentContext; | |
}; | |
// Also publish the API under the global name `soupselect`. Presumably this serves
// scripts that load the file without a module loader (e.g. Ti.include) -- confirm
// before removing. NOTE(review): this is an implicit global and throws in strict mode.
soupselect = exports;
Why do htmlparser and soupselect work on iOS but fail on Android in Titanium?
My code :
var htmlparser = require('htmlparser');
var select = require('soupselect').select;
var handler = new htmlparser.DefaultHandler(function(err, dom) { ... }
On iOS it works, but when I ran it in the Android simulator I got this error message:
Message: Uncaught TypeError: undefined is not a function
[ERROR] : TiExceptionHandler: (main) [0,632] - Source: var handler = new htmlparser.DefaultHandler(function(err, dom) {
[ERROR] : V8Exception: Exception occurred at alloy/controllers/index.js:151: Uncaught TypeError: undefined is not a function
[ERROR] : File: fail readDirectory() errno=2