Created
December 23, 2014 08:56
-
-
Save achun/5ba53381cfc2da1dbf3c to your computer and use it in GitHub Desktop.
合法 html 文本进行词法解析
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Sentinel values returned by the lexer function that htmlLex() produces:
// LOOP is returned when a scan (re)starts, BREAK when the input is exhausted.
// The random suffix presumably keeps them from colliding with ordinary
// strings a caller might compare against.
var LOOP = 'LOOP' + Math.random(),
  BREAK = 'BREAK' + Math.random()
/**
 * htmlLex creates a pull-style lexer for well-formed HTML text.
 *
 * htmlLex() takes no arguments and returns a `next` function:
 *   - next(str1, str2, ...) joins its arguments into the text to scan,
 *     resets the lexer and returns the LOOP sentinel.
 *   - next() returns the next token, or the BREAK sentinel once the text is
 *     exhausted (the call following BREAK returns LOOP again).
 *
 * Tokens:
 *   ['#text', text]      text between tags
 *   ['<', name]          start of an opening tag; name is '#fragment' for a
 *                        "<!--fragment ...-->" opener
 *   [attrName]           attribute without a value
 *   [attrName, value]    quoted attribute; quotes removed, a backslash-escaped
 *                        quote inside the value is kept verbatim
 *   ['>'] or ['/>']      end of an opening tag
 *   ['</', name]         closing tag; name is '#fragment' for "<!--/fragment-->"
 *   ['#comment', text]   comment or any other "<!...>" declaration
 *   Error                syntax error (returned, not thrown); the lexer then
 *                        discards the remaining input so the next call
 *                        returns BREAK
 */
function htmlLex() {
  // NOTE(review): ERROR_QUOTE was referenced but never defined in the visible
  // source; it is defined locally so quote errors no longer raise a
  // ReferenceError. Adjust the wording if a canonical message exists.
  var ERROR_QUOTE = 'syntax error: attribute value must be enclosed in matching quotes'
  // "<", an optional "!" or "/", then a name. Digits are allowed after the
  // first character (h1, h2, ...); a leading "-" lets "<!--" match.
  var TAG_START = /<[!\/]?[-a-zA-Z][-a-zA-Z0-9]*/
  // One attribute-level token: "/>", ">" (optionally preceded by the one or two
  // dashes that close a fragment opener), or an attribute name with an
  // optional trailing "=". The lookahead stops a name right before "-->".
  var ATTR_TOKEN = /\/>|-{0,2}>|(?:(?!-{1,2}>)[^\s=\/>])+=?/
  var pos, m, v
  // Remaining, not yet consumed input.
  var html = ''
  // Current state function; each state consumes input and returns one token.
  var fn = loop

  return function() {
    if (arguments.length) {
      // `arguments` is array-like only, so slice must be invoked via .call.
      html = Array.prototype.slice.call(arguments).join('')
      fn = loop
    }
    return fn()
  }

  // Build an error token and drop the remaining input, so that a caller
  // looping until BREAK terminates instead of receiving the same error forever.
  function fail(message) {
    html = ''
    fn = text
    return new Error(message)
  }

  // Start of a scan.
  function loop() {
    fn = text
    return LOOP
  }

  // Text up to the next tag, or the tag itself when the input starts with one.
  function text() {
    // End of the scan.
    if (!html.length) {
      fn = loop
      return BREAK
    }
    // The match is kept in `m` because tag() reads m[0].
    m = TAG_START.exec(html)
    // Trailing text without any further tag.
    if (!m) {
      v = html
      html = ''
      return ['#text', v]
    }
    fn = tag
    if (m.index === 0) {
      return tag()
    }
    v = html.slice(0, m.index)
    html = html.slice(m.index)
    return ['#text', v]
  }

  // A construct starting with "<": comment/declaration, fragment marker,
  // closing tag or opening tag. `html` starts at the "<" matched into `m`.
  function tag() {
    var dashed, head
    // Comment, declaration or fragment marker.
    if (html.charAt(1) === '!') {
      // "<!--" comments end at "-->"; other "<!...>" forms end at the first ">".
      dashed = html.slice(0, 4) === '<!--'
      pos = dashed ? html.indexOf('-->', 2) : html.indexOf('>')
      if (dashed && pos !== -1) {
        // Make pos the index of the terminating ">".
        pos += 2
      }
      v = (pos === -1 ? html.slice(2) : html.slice(2, pos))
        .replace(/^-{1,2}/, '')
        .replace(/-{1,2}$/, '')
        .trim()
      head = v.slice(0, 9)
      if (head === 'fragment ') {
        // Fragment opener: its attributes and terminator are lexed by attr().
        fn = attr
        html = html.slice(html.indexOf('fragment') + 9)
        return ['<', '#fragment']
      }
      if (pos === -1) {
        return fail('syntax error: End of file inside comment')
      }
      fn = text
      html = html.slice(pos + 1)
      return head === '/fragment' ? ['</', '#fragment'] : ['#comment', v]
    }
    // </closed>
    if (html.charAt(1) === '/') {
      if (html.charAt(m[0].length) !== '>') {
        return fail('syntax error: unexpected whitespace inside closing tag')
      }
      fn = text
      html = html.slice(m[0].length + 1)
      return ['</', m[0].slice(2)]
    }
    // <opened
    html = html.slice(m[0].length)
    fn = attr
    return ['<', m[0].slice(1)]
  }

  // One attribute, or the end of the opening tag.
  function attr() {
    m = ATTR_TOKEN.exec(html)
    if (!m) {
      return fail('syntax error: Ignoring tag when inside an attribute value.')
    }
    html = html.slice(m.index + m[0].length)
    if (m[0] === '/>') {
      fn = text
      return ['/>']
    }
    // ">" alone, or "->" / "-->" closing a fragment opener.
    if (m[0].slice(-1) === '>') {
      fn = text
      return ['>']
    }
    // <name attr
    if (m[0].slice(-1) !== '=') {
      return [m[0]]
    }
    // <name attr=
    v = html.charAt(0)
    if (v !== '"' && v !== "'") {
      return fail(ERROR_QUOTE)
    }
    // Find the closing quote, skipping quotes preceded by a backslash.
    pos = 0
    do {
      pos = html.indexOf(v, pos + 1)
      if (pos === -1) {
        return fail(ERROR_QUOTE)
      }
    } while (html.charAt(pos - 1) === '\\')
    // Stay in attr state for the next attribute.
    v = [m[0].slice(0, -1), html.slice(1, pos)]
    html = html.slice(pos + 1)
    return v
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment