Last active
November 15, 2022 06:05
-
-
Save pofulu/bea9cdc95e1e11fb9cee3acbbb51888f to your computer and use it in GitHub Desktop.
Converts between HTML and JSON. Originally from https://github.com/Jxck/html2json; this is a TypeScript port. The types are not perfect, but they keep the TypeScript compiler happy.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { htmlParser } from './htmlparser' | |
/** Compile-time switch: set to true to trace every parser callback. */
const DEBUG = false;

/** Forwards to `console.log` while DEBUG is on; otherwise does nothing. */
const debug = !DEBUG ? () => { } : console.log.bind(console);
/**
 * Removes the first XML declaration and the first DOCTYPE declaration from
 * `html`, together with the line break that directly follows each of them.
 *
 * The DOCTYPE match is case-insensitive and stops at the first `>`, so markup
 * that shares a line with the declaration is preserved. The trailing line
 * break (LF or CRLF) is optional.
 *
 * @param html Raw markup.
 * @returns The markup without those declarations.
 */
function removeDOCTYPE(html: string): string {
  return html
    .replace(/<\?xml[^>]*\?>\r?\n?/, '')
    .replace(/<!doctype[^>]*>\r?\n?/i, '');
}
/**
 * One node of the JSON tree produced by `html2json` and consumed by
 * `json2html`.
 */
export interface HTMLJSON {
  /** Node kind; `html2json` emits 'root', 'element', 'text' or 'comment'. */
  node: string;
  /**
   * Attributes of an element node. A value is a plain string, or an array of
   * strings when the attribute value contained spaces or the attribute was
   * repeated on the tag.
   */
  attr?: { [x: string]: string | string[] };
  /** Tag name of an element node. */
  tag?: string;
  /** Child nodes, in document order. */
  child: HTMLJSON[];
  /** Text of a text or comment node. */
  text?: string;
}
/**
 * Folds the parser's flat attribute list into a name -> value object.
 *
 * - A value containing a space is split on single spaces into an array.
 * - A repeated attribute is merged into one flat array of all its values
 *   (a previous empty value is simply replaced).
 * - Names are collected in a Map first, so names such as `constructor` or
 *   `__proto__` are treated like any other attribute.
 */
function collectAttrs(
  attrs: { name: string; value: string }[],
): { [x: string]: string | string[] } {
  const merged = new Map<string, string | string[]>();
  for (const { name, value } of attrs) {
    const parsed: string | string[] = value.includes(' ') ? value.split(' ') : value;
    const existing = merged.get(name);
    if (existing === undefined || existing === '') {
      merged.set(name, parsed);
    } else {
      merged.set(name, ([] as string[]).concat(existing, parsed));
    }
  }
  return Object.fromEntries(merged);
}

/**
 * Parses an HTML string into a JSON tree.
 *
 * The XML/DOCTYPE prologue is stripped with `removeDOCTYPE`, then `htmlParser`
 * drives the four callbacks below to build the tree.
 *
 * @param html Markup to convert.
 * @returns A node of kind 'root' whose `child` array holds the top-level
 *   nodes. Element nodes carry `tag` and, when attributes are present,
 *   `attr`; text and comment nodes carry `text`. Every node has a `child`
 *   array.
 */
export function html2json(html: string): HTMLJSON {
  html = removeDOCTYPE(html);

  // Elements whose end tag has not been reported yet; index 0 is the innermost.
  const openElements: HTMLJSON[] = [];
  const results: HTMLJSON = {
    node: 'root',
    child: [],
  };

  // Attach to the innermost open element, or to the root when nothing is open.
  const append = (node: HTMLJSON): void => {
    (openElements[0] ?? results).child.push(node);
  };

  htmlParser(html, {
    start: (tag: string, attrs: { name: string; value: string }[], unary: boolean) => {
      debug(tag, attrs, unary);
      const node: HTMLJSON = {
        node: 'element',
        tag,
        child: [],
      };
      if (attrs.length !== 0) {
        // Assertion keeps this assignment valid whether `attr` is declared
        // with string-only values or with string | string[] values.
        node.attr = collectAttrs(attrs) as HTMLJSON['attr'];
      }
      if (unary) {
        // No end tag will follow (e.g. <img src="hoge.png"/>): attach right away.
        append(node);
      } else {
        openElements.unshift(node);
      }
    },
    end: (tag: string) => {
      debug(tag);
      const node = openElements.shift();
      if (node === undefined) {
        console.error('invalid state: end tag without open element');
        return;
      }
      if (node.tag !== tag) console.error('invalid state: mismatch end tag');
      // The element is complete; hand it to its parent (or the root).
      append(node);
    },
    chars: (text: string) => {
      debug(text);
      append({
        node: 'text',
        text,
        child: [],
      });
    },
    comment: (text: string) => {
      debug(text);
      // Comments may appear outside any element, so they use the same
      // root fallback as every other node kind.
      append({
        node: 'comment',
        text,
        child: [],
      });
    },
  });
  return results;
}
/**
 * Elements serialized as `<tag/>` with no children and no end tag.
 * Covers the HTML 4.01 empty elements plus the HTML5 additions that the
 * parser also treats as unary (command, keygen, source, track, wbr).
 */
const VOID_TAGS: ReadonlySet<string> = new Set([
  'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr', 'img', 'input',
  'isindex', 'link', 'meta', 'param', 'embed',
  'command', 'keygen', 'source', 'track', 'wbr',
]);

/**
 * Serializes a JSON tree back into an HTML string.
 *
 * @param json Node to serialize; children are serialized recursively.
 * @returns The markup for the node:
 *   - 'element': `<tag attrs>children</tag>`, or `<tag attrs/>` for void tags;
 *   - 'text': the text itself;
 *   - 'comment': `<!--text-->`;
 *   - 'root': the concatenated children;
 *   - any other kind: the empty string.
 */
export function json2html(json: HTMLJSON): string {
  const children = (json.child ?? []).map((c) => json2html(c)).join('');

  switch (json.node) {
    case 'element': {
      // Each attribute is rendered with a leading space so the result can be
      // appended directly after the tag name.
      const attrs = Object.entries(json.attr ?? {})
        .map(([key, raw]) => {
          const value = Array.isArray(raw) ? raw.join(' ') : String(raw ?? '');
          // Only the double quote is escaped: it would otherwise terminate the
          // attribute. Other characters are emitted as stored.
          return ` ${key}="${value.replace(/"/g, '&quot;')}"`;
        })
        .join('');
      if (json.tag && VOID_TAGS.has(json.tag)) {
        return `<${json.tag}${attrs}/>`;
      }
      return `<${json.tag}${attrs}>${children}</${json.tag}>`;
    }
    case 'text':
      return json.text ?? '';
    case 'comment':
      return `<!--${json.text ?? ''}-->`;
    case 'root':
      return children;
    default:
      return '';
  }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* HTML5 Parser By Sam Blowes | |
* | |
* Designed for HTML5 documents | |
* | |
* Original code by John Resig (ejohn.org) | |
* http://ejohn.org/blog/pure-javascript-html-parser/ | |
* Original code by Erik Arvidsson, Mozilla Public License | |
* http://erik.eae.net/simplehtmlparser/simplehtmlparser.js | |
* | |
* ---------------------------------------------------------------------------- | |
* License | |
* ---------------------------------------------------------------------------- | |
* | |
* This code is triple licensed using Apache Software License 2.0, | |
* Mozilla Public License or GNU Public License | |
* | |
* //////////////////////////////////////////////////////////////////////////// | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); you may not | |
* use this file except in compliance with the License. You may obtain a copy | |
* of the License at http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* //////////////////////////////////////////////////////////////////////////// | |
* | |
* The contents of this file are subject to the Mozilla Public License | |
* Version 1.1 (the "License"); you may not use this file except in | |
* compliance with the License. You may obtain a copy of the License at | |
* http://www.mozilla.org/MPL/ | |
* | |
* Software distributed under the License is distributed on an "AS IS" | |
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the | |
* License for the specific language governing rights and limitations | |
* under the License. | |
* | |
* The Original Code is Simple HTML Parser. | |
* | |
* The Initial Developer of the Original Code is Erik Arvidsson. | |
* Portions created by Erik Arvidssson are Copyright (C) 2004. All Rights | |
* Reserved. | |
* | |
* //////////////////////////////////////////////////////////////////////////// | |
* | |
* This program is free software; you can redistribute it and/or | |
* modify it under the terms of the GNU General Public License | |
* as published by the Free Software Foundation; either version 2 | |
* of the License, or (at your option) any later version. | |
* | |
* This program is distributed in the hope that it will be useful, | |
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
* GNU General Public License for more details. | |
* | |
* You should have received a copy of the GNU General Public License | |
* along with this program; if not, write to the Free Software | |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
* | |
* ---------------------------------------------------------------------------- | |
* Usage | |
* ---------------------------------------------------------------------------- | |
* | |
* // Use like so: | |
* htmlParser(htmlString, { | |
* start: function(tag, attrs, unary) {}, | |
* end: function(tag) {}, | |
* chars: function(text) {}, | |
* comment: function(text) {} | |
* }); | |
* | |
* // or to get an XML string: | |
* HTMLtoXML(htmlString); | |
* | |
* // or to get an XML DOM Document | |
* HTMLtoDOM(htmlString); | |
* | |
* // or to inject into an existing document/DOM node | |
* HTMLtoDOM(htmlString, document); | |
* HTMLtoDOM(htmlString, document.body); | |
* | |
*/ | |
// Regular expressions for parsing tags and attributes.
// startTag captures: 1 = tag name, 2 = raw attribute text, 3 = "/" when self-closed.
const startTag = /^<([-A-Za-z0-9_]+)((?:\s+[a-zA-Z_:][-a-zA-Z0-9_:.]*(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/;
// endTag captures: 1 = tag name.
const endTag = /^<\/([-A-Za-z0-9_]+)[^>]*>/;
// attr captures: 1 = name, 2 = double-quoted value, 3 = single-quoted value, 4 = unquoted value.
const attr = /([a-zA-Z_:][-a-zA-Z0-9_:.]*)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/g;

// Empty Elements - HTML 5 (never have an end tag)
const empty = makeMap('area,base,basefont,br,col,frame,hr,img,input,link,meta,param,embed,command,keygen,source,track,wbr');

// Block Elements - HTML 5 (opening one closes any open inline elements)
const block = makeMap('a,address,article,applet,aside,audio,blockquote,button,canvas,center,dd,del,dir,div,dl,dt,fieldset,figcaption,figure,footer,form,frameset,h1,h2,h3,h4,h5,h6,header,hgroup,hr,iframe,ins,isindex,li,map,menu,noframes,noscript,object,ol,output,p,pre,section,script,table,tbody,td,tfoot,th,thead,tr,ul,video');

// Inline Elements - HTML 5
const inline = makeMap('abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,textarea,tt,u,var');

// Elements that you can, intentionally, leave open (and which close
// themselves when the same tag is opened again).
// 'option' is the real element name; the legacy 'options' key is kept so
// existing lookups are unaffected.
const closeSelf = makeMap('colgroup,dd,dt,li,option,options,p,td,tfoot,th,thead,tr');

// Attributes that have their values filled in, e.g. disabled='disabled'
const fillAttrs = makeMap('checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected');

// Special Elements (can contain anything; content is read as raw text)
const special = makeMap('script,style');
/**
 * Event-style HTML tokenizer: walks `html` from left to right and reports
 * what it finds through the callbacks in `handler`.
 *
 * - `start(tag, attrs, unary)`: an opening tag. `tag` is lower-cased, `attrs`
 *   is an array of `{ name, value, escaped }`, `unary` is true when no `end`
 *   call will follow for this tag.
 * - `end(tag)`: an element was closed, either by its end tag or implicitly.
 * - `chars(text)`: a run of text.
 * - `comment(text)`: the content of a `<!-- ... -->` comment.
 *
 * Elements still open at the end of input are closed automatically.
 * A `<` that does not start a recognizable tag or comment is reported as text.
 *
 * @param html Markup to parse.
 * @param handler Callbacks to invoke; each one is skipped if absent at runtime.
 * @throws A string beginning with "Parse Error: " if an iteration consumes no
 *   input. Kept as a string (not an Error) so existing callers that catch it
 *   keep working.
 */
export function htmlParser(
  html: string,
  handler: {
    start: (tag: string, attrs: any, unary: any) => void;
    end: (tag: string) => void;
    chars: (text: string) => void;
    comment: (text: string) => void
  }): void {
  // Lower-cased names of the currently open elements, outermost first.
  const stack: string[] = [];
  const top = (): string | undefined => stack[stack.length - 1];
  let last = html;

  while (html) {
    const current = top();

    if (!current || !special[current]) {
      // Regular content: try comment, end tag, start tag; otherwise text.
      let consumed = false;

      if (html.startsWith('<!--')) {
        const close = html.indexOf('-->');
        if (close >= 0) {
          // slice (not substring) so a degenerate "<!-->" yields an empty
          // comment instead of swapped indices.
          if (handler.comment) handler.comment(html.slice(4, close));
          html = html.substring(close + 3);
          consumed = true;
        }
      } else if (html.startsWith('</')) {
        const match = html.match(endTag);
        if (match) {
          html = html.substring(match[0].length);
          parseEndTag(match[1]);
          consumed = true;
        }
      } else if (html.startsWith('<')) {
        const match = html.match(startTag);
        if (match) {
          html = html.substring(match[0].length);
          parseStartTag(match[1], match[2], match[3]);
          consumed = true;
        }
      }

      if (!consumed) {
        // Search from index 1: if the input starts with an unparseable "<",
        // that character becomes part of the text, so progress is guaranteed.
        const next = html.indexOf('<', 1);
        const text = next < 0 ? html : html.substring(0, next);
        html = next < 0 ? '' : html.substring(next);
        if (handler.chars) handler.chars(text);
      }
    } else {
      // Inside <script>/<style>: everything up to the matching end tag is raw
      // text. The end tag is matched case-insensitively; if it is missing,
      // the rest of the input is taken as the element's content.
      const closer = new RegExp('</' + current + '[^>]*>', 'i');
      const found = closer.exec(html);
      const raw = found ? html.substring(0, found.index) : html;
      html = found ? html.substring(found.index + found[0].length) : '';
      // Unwrap comment and CDATA markers, keeping their content.
      const text = raw.replace(/<!--([\s\S]*?)-->|<!\[CDATA\[([\s\S]*?)]]>/g, '$1$2');
      if (handler.chars) handler.chars(text);
      parseEndTag(current);
    }

    // Safety net against an iteration that consumed nothing.
    if (html === last) throw 'Parse Error: ' + html;
    last = html;
  }

  // Clean up any remaining tags
  parseEndTag();

  /** Handles one opening tag: implicit closes, stack update, `start` callback. */
  function parseStartTag(rawName: string, rest: string, selfClosing: string): void {
    const tagName = rawName.toLowerCase();

    // A block element implicitly closes any open inline elements.
    if (block[tagName]) {
      let open = top();
      while (open && inline[open]) {
        parseEndTag(open);
        open = top();
      }
    }

    // e.g. <li> directly inside an open <li> closes the previous one.
    if (closeSelf[tagName] && top() === tagName) {
      parseEndTag(tagName);
    }

    const unary = empty[tagName] || !!selfClosing;
    if (!unary) stack.push(tagName);

    if (handler.start) {
      const attrs: { name: string; value: string; escaped: string }[] = [];
      for (const m of rest.matchAll(attr)) {
        const name = m[1];
        // Quoted or unquoted value; boolean attributes fall back to their
        // own name (disabled -> "disabled"); anything else to "".
        const value = m[2] || m[3] || m[4] || (fillAttrs[name] ? name : '');
        attrs.push({
          name,
          value,
          escaped: value.replace(/(^|[^\\])"/g, '$1\\"'),
        });
      }
      handler.start(tagName, attrs, unary);
    }
  }

  /**
   * Closes elements down to the nearest open `tagName` (compared
   * case-insensitively), or every open element when no name is given.
   * An end tag with no matching open element is ignored.
   */
  function parseEndTag(tagName?: string): void {
    const pos = tagName ? stack.lastIndexOf(tagName.toLowerCase()) : 0;
    if (pos >= 0) {
      // Close all the open elements, innermost first.
      for (let i = stack.length - 1; i >= pos; i--) {
        if (handler.end) handler.end(stack[i]);
      }
      // Remove the closed elements from the stack.
      stack.length = pos;
    }
  }
}
/**
 * Builds a lookup table from a comma-separated list: every item becomes a
 * key whose value is `true`.
 *
 * The table has no prototype, so only listed items are ever truthy; names
 * such as `constructor` are not inherited from `Object.prototype`.
 *
 * @param str Comma-separated item names.
 * @returns The lookup table.
 */
function makeMap(str: string): { [key: string]: boolean } {
  const obj: { [key: string]: boolean } = Object.create(null);
  for (const item of str.split(',')) {
    obj[item] = true;
  }
  return obj;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { readFile } from 'fs/promises';
import { html2json, json2html } from './html2json';

// Usage example: read index.html, convert it to a JSON tree and print it.
(async () => {
  const result = await readFile('index.html', 'utf-8');
  const json = html2json(result);
  console.log(json);
})().catch((error: unknown) => {
  // Report read/parse failures instead of leaving an unhandled rejection.
  console.error(error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment