// Simple HTML Parser
// Gist shaosh/697b796889c679bfda37, created January 21, 2015 17:05.
// Note: the hosting page flagged this file as containing hidden or bidirectional
// Unicode text that may be interpreted or compiled differently than it appears;
// review it in an editor that reveals hidden Unicode characters.
/**
 * Small regular-expression based HTML scanner.
 *
 * parse() walks the input left to right and appends one plain record per
 * start tag to the module-level `result` array. Each record has a `type`
 * (the tag name), one property per attribute, and, when the element directly
 * contains text, a `content` string.
 *
 * Known limitation: attributes are stored on the same object as `type` and
 * `content`, so an attribute literally named `type` or `content` overwrites
 * that property on the record.
 */
function HtmlParser(){}

/**
 * Minimal LIFO stack backed by the array `stac`.
 * peek() returns null (rather than undefined) when the stack is empty.
 */
function Stack(){
    this.stac = [];
    this.pop = function(){
        return this.stac.pop();
    };
    this.push = function(item){
        this.stac.push(item);
    };
    this.peek = function(){
        return this.empty() ? null : this.stac[this.stac.length - 1];
    };
    this.empty = function(){
        return this.stac.length === 0;
    };
}

// Shared output list: parse() appends element records here. Callers that want
// a fresh list per document assign `result = []` before calling parse().
var result = [];

//Reference:
//http://pickerel.iteye.com/blog/264252
//http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
HtmlParser.prototype = {
    // Replacing the whole prototype object drops the default back-reference.
    constructor: HtmlParser,

    // Start tag: group 1 is the tag name, group 2 the raw attribute text.
    startTagRe: /^<([^>\s\/]+)((\s+[^=>\s]+(\s*=\s*((\"[^\"]*\")|(\'[^\']*\')|[^>\s]+))?)*)\s*\/?\s*>/m,
    // End tag: group 1 is the tag name.
    endTagRe: /^<\/([^>\s]+)[^>]*>/m,
    // One attribute (name plus optional value). Global flag: iterate with exec().
    attrRe: /([^=\s]+)(\s*=\s*((\"([^\"]*)\")|(\'([^\']*)\')|[^>\s]+))?/gm,
    // Tag name only; used as a fallback by parseStartTag.
    typeRe: /^<([^>\s\/]+)[>\s]/m,

    /**
     * Scans `s` and appends one record per start tag to the shared `result`.
     *
     * - `<!-- ... -->` comments are skipped.
     * - Text is appended to the `content` of the innermost element that is
     *   still open; text outside any open element is discarded.
     * - An end tag closes the innermost open element only when the names
     *   match; other end tags are consumed and ignored.
     * - Tags written as `<x ... />` are recorded but never left open.
     * - A `<` that does not begin a recognisable comment or tag is treated as
     *   literal text, so malformed input always terminates.
     *
     * @param {string} s HTML source; null/undefined is treated as ''.
     * @returns {Array<Object>} the shared `result` array.
     */
    parse: function(s){
        var open = new Stack(),   // entries: {index: position in result, type: tag name}
            match,
            node,
            top,
            next,
            text;
        s = (s === null || s === undefined) ? '' : String(s);
        while(s.length > 0){
            //Comment
            if(s.substring(0, 4) === '<!--'){
                next = s.indexOf('-->');
                if(next >= 0){
                    s = s.substring(next + 3);
                    continue;
                }
            }
            //End tag
            else if(s.substring(0, 2) === '</'){
                match = this.endTagRe.exec(s);
                // The regexes are multiline, so `^` can also match at a later
                // line start; only a match at offset 0 is the tag we are on.
                if(match !== null && match.index === 0){
                    s = s.substring(match[0].length);
                    top = open.peek();
                    if(top !== null && top.type === this.parseEndTag(match[0])){
                        open.pop();
                    }
                    continue;
                }
            }
            //Start tag
            else if(s.charAt(0) === '<'){
                match = this.startTagRe.exec(s);
                if(match !== null && match.index === 0){
                    s = s.substring(match[0].length);
                    node = this.parseStartTag(match[0]);
                    result.push(node);
                    if(!/\/\s*>$/.test(match[0])){
                        // Keep the tag name next to the index: an attribute
                        // called "type" may have overwritten node.type.
                        open.push({index: result.length - 1, type: match[1]});
                    }
                    continue;
                }
            }
            //Text. Search from offset 1 so that an unparsable leading '<' is
            //consumed as text and the loop always makes progress.
            next = s.indexOf('<', 1);
            if(next < 0){
                text = s;
                s = '';
            }
            else{
                text = s.substring(0, next);
                s = s.substring(next);
            }
            top = open.peek();
            if(top !== null){
                node = result[top.index];
                node.content = (typeof node.content === 'string' ? node.content : '') + text;
            }
        }
        return result;
    },

    /**
     * Builds the record for one start tag.
     *
     * @param {string} sTag a complete start tag such as "<li class='x'>".
     * @returns {Object} record with `type` plus one property per attribute;
     *     `type` is '' when no tag name can be recognised.
     */
    parseStartTag: function(sTag){
        var obj = {},
            match = this.startTagRe.exec(sTag),
            attrText;
        if(match === null || match.index !== 0){
            match = this.typeRe.exec(sTag);
            obj.type = (match !== null && match.index === 0) ? match[1] : '';
            return obj;
        }
        obj.type = match[1];
        // In "<x a='1' />" the attribute group also swallows the stand-alone
        // "/"; drop it so it is not recorded as an attribute.
        attrText = match[2].trim().replace(/(^|\s+)\/$/, '');
        if(attrText.length !== 0){
            this.parseAttrs(attrText, obj);
        }
        return obj;
    },

    /**
     * Extracts the tag name from an end tag.
     *
     * @param {string} eTag an end tag such as "</li>".
     * @returns {string} the tag name, or '' when `eTag` is not an end tag.
     */
    parseEndTag: function(eTag){
        var match = this.endTagRe.exec(eTag);
        return (match !== null && match.index === 0) ? match[1] : '';
    },

    /**
     * Copies every attribute found in `attrs` onto `obj`.
     *
     * @param {string} attrs attribute text, e.g. "id='a' class=\"b\" disabled".
     * @param {Object} obj record to receive the attributes.
     */
    parseAttrs: function(attrs, obj){
        var match;
        // attrRe is global and shared through the prototype: rewind it, then
        // let exec() advance through this one string.
        this.attrRe.lastIndex = 0;
        while((match = this.attrRe.exec(attrs)) !== null){
            this.parseAttr(match[0], obj);
        }
    },

    /**
     * Stores a single attribute on `obj`.
     *
     * Matching surrounding quotes are removed from the value, unquoted values
     * are kept as written, and an attribute without "=" is stored as ''.
     *
     * @param {string} attr one attribute, e.g. "id='myList'" or "disabled".
     * @param {Object} obj record to receive the attribute.
     */
    parseAttr: function(attr, obj){
        var equalIndex = attr.indexOf('='),
            name,
            value,
            quote;
        if(equalIndex < 0){
            name = attr.trim();
            value = '';
        }
        else{
            name = attr.substring(0, equalIndex).trim();
            value = attr.substring(equalIndex + 1).trim();
            quote = value.charAt(0);
            if(value.length >= 2 &&
                    (quote === '"' || quote === '\'') &&
                    value.charAt(value.length - 1) === quote){
                value = value.substring(1, value.length - 1);
            }
        }
        if(name !== ''){
            obj[name] = value;
        }
    }
};
//Testing
// Demo driver: parses each sample document and prints the element records
// that the parser appended to the shared `result` list.
var parser = new HtmlParser(),
    sample = "<div><ul id='myList'><li class='hello'>Hello</li><li customTag='Earth'>World</li></ul></div><a href='http://famo.us'>Click</a>",
    sample2 = "<div>content of div<ul id='myList'><li class='hello'>Hello</li>content of ul<li customTag='Earth'>World</li></ul></div><a href='http://famo.us'>Click</a>",
    samples = [sample, sample2];
for(var i = 0; i < samples.length; i++){
    console.log('Example Input:\n', samples[i]);
    // parse() appends to `result`, so start every run from an empty list.
    result = [];
    parser.parse(samples[i]);
    // Print the brackets outside the loop so an empty result still shows "[" and "]".
    console.log('result:\n[');
    for(var j = 0; j < result.length; j++){
        console.log('', result[j]);
    }
    console.log(']');
}
// (GitHub page footer: sign up or sign in on GitHub to comment on this gist.)