Skip to content

Instantly share code, notes, and snippets.

@achun
Created December 23, 2014 08:56
Show Gist options
  • Save achun/5ba53381cfc2da1dbf3c to your computer and use it in GitHub Desktop.
Save achun/5ba53381cfc2da1dbf3c to your computer and use it in GitHub Desktop.
合法 html 文本进行词法解析
// Sentinel strings: each is a fixed prefix followed by a random number.
var LOOP = 'LOOP' + Math.random()
var BREAK = 'BREAK' + Math.random()
/**
 * htmlLex creates an incremental lexer for well-formed HTML text.
 *
 * It returns a function that yields one token per call. Calling that function
 * with arguments (re)starts lexing on the concatenation of all arguments;
 * calling it without arguments continues with the current input. Before any
 * input has been supplied the input is treated as the empty string.
 *
 * Tokens:
 *   LOOP               emitted first, before any input is consumed
 *   BREAK              the input is exhausted (the next call yields LOOP again)
 *   ['#text', s]       a run of text
 *   ['#comment', s]    a comment or other "<!...>" declaration, trimmed
 *   ['<', name]        start of an opening tag; name is '#fragment' for
 *                      "<!-- fragment ... -->" / "<!fragment ...>"
 *   [name]             attribute without a value
 *   [name, value]      quoted attribute; value is the raw text between the
 *                      quotes (a quote preceded by a backslash does not end it)
 *   ['>'] or ['/>']    end of an opening tag
 *   ['</', name]       closing tag; name is '#fragment' for "<!-- /fragment -->"
 *   Error              a syntax error; it is returned, not thrown
 */
function htmlLex() {
  var ERROR_QUOTE = 'syntax error: expect attribute value wrapped in quotes'
  // Remaining, not yet consumed input.
  var html = ''
  // State function that produces the next token.
  var fn = loop
  // Last tag-start match found by text(); consumed by tag().
  var tagMatch = null

  return function() {
    if (arguments.length) {
      // slice must be invoked *on* arguments to copy it into a real array.
      html = Array.prototype.slice.call(arguments).join('')
      fn = loop
    }
    return fn()
  }

  // Start of a token stream.
  function loop() {
    fn = text
    return LOOP
  }

  // Emits the text in front of the next tag, or hands over to tag().
  function text() {
    // End of the token stream.
    if (!html.length) {
      fn = loop
      return BREAK
    }
    // Tag names may contain digits after the first character (h1, h2, ...).
    tagMatch = (/<[!\/]?[-a-zA-Z][-a-zA-Z0-9]*/).exec(html)
    // No further tag: everything left is text.
    if (!tagMatch) {
      var rest = html
      html = ''
      return ['#text', rest]
    }
    fn = tag
    if (tagMatch.index === 0) {
      return tag()
    }
    var leading = html.slice(0, tagMatch.index)
    html = html.slice(tagMatch.index)
    return ['#text', leading]
  }

  // Handles whatever starts at the "<" located by text().
  function tag() {
    if (!tagMatch) {
      return new Error('syntax error: expect opening tag begins with "<"')
    }
    var head = tagMatch[0]
    if (html[1] === '!') {
      return declaration()
    }
    // </closed>
    if (html[1] === '/') {
      if (html[head.length] !== '>') {
        return new Error('syntax error: unexpected whitespace inside closing tag')
      }
      fn = text
      html = html.slice(head.length + 1)
      return ['</', head.slice(2)]
    }
    html = html.slice(head.length)
    fn = attr
    return ['<', head.slice(1)]
  }

  // Handles "<!...": comments, fragment markers and other declarations.
  function declaration() {
    // "<!--" comments end at "-->", so a ">" inside them is content;
    // anything else starting with "<!" ends at the first ">".
    var dashed = html.slice(2, 4) === '--'
    var terminator = dashed ? '-->' : '>'
    var end = html.indexOf(terminator, 2)
    if (end === -1) {
      return new Error('syntax error: End of file inside comment')
    }
    // For "<!-->" the terminator overlaps the opener; slice() then yields ''.
    var body = dashed
      ? html.slice(4, end)
      : html.slice(2, end).replace(/^-{1,2}/, '').replace(/-{1,2}$/, '')
    body = body.trim()

    // Opening fragment: its attributes are lexed by attr().
    if (/^fragment(?:\s|$)/.test(body)) {
      fn = attr
      html = html.slice(html.indexOf('fragment') + 8)
      return ['<', '#fragment']
    }
    fn = text
    html = html.slice(end + terminator.length)
    if (/^\/fragment(?:\s|$)/.test(body)) {
      return ['</', '#fragment']
    }
    return ['#comment', body]
  }

  // Emits one attribute, or the token that ends the opening tag.
  function attr() {
    // Group 1 is the attribute name, group 2 the "=" (with optional spaces).
    // "-->" is accepted as a tag end so "<!-- fragment a="b" -->" terminates.
    var m = (/-->|\/>|>|([^\s=\/>]+)(\s*=\s*)?/).exec(html)
    if (!m) {
      return new Error('syntax error: Ignoring tag when inside an attribute value.')
    }
    html = html.slice(m.index + m[0].length)
    if (m[1] === undefined) {
      fn = text
      return [m[0] === '/>' ? '/>' : '>']
    }
    // <name attr
    if (m[2] === undefined) {
      return [m[1]]
    }
    // <name attr=
    var quote = html[0]
    if (quote !== '"' && quote !== "'") {
      return new Error(ERROR_QUOTE)
    }
    // Find the closing quote, skipping quotes escaped with a backslash.
    var close = 0
    do {
      close = html.indexOf(quote, close + 1)
      if (close === -1) {
        return new Error(ERROR_QUOTE)
      }
    } while (html[close - 1] === '\\')
    var value = html.slice(1, close)
    html = html.slice(close + 1)
    return [m[1], value]
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment