Created
February 12, 2025 04:52
-
-
Save ganapativs/a2255456f5209a7e9c68909f5187b6c3 to your computer and use it in GitHub Desktop.
Clean an HTML string: remove CSS, scripts, and other unnecessary content, keeping only the HTML tags, JSON, and JSON-LD (schema) data. Useful for feeding HTML content to an LLM for analysis.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Strip an HTML string down to its bare tag structure and text.
 *
 * Processing happens in four regex passes, in this order:
 *   1. HTML comments are removed.
 *   2. `<script>` elements (except those typed `application/json` or
 *      `application/ld+json`), `<style>` elements and whole `<svg>` elements
 *      are removed together with their content; `<link>`, `<img>` and
 *      `<image>` tags are removed (tag only).
 *   3. Every remaining opening tag loses all attributes except a non-empty
 *      `href` and, on `<script>` tags, the JSON `type`.
 *   4. Elements that are empty (or whitespace-only) and carry no attributes
 *      are removed repeatedly until nothing changes.
 *
 * This is a regex-based cleaner, not an HTML parser: a `>` inside an
 * attribute value ends the tag early, and markup-looking text inside a kept
 * JSON script is processed like any other tag.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The cleaned markup.
 */
function cleanHtml(html) {
  // `(?=[\s\/>])` after a tag name ensures we matched the whole name, so
  // e.g. `<styled-x>` or `<linker>` are not mistaken for `<style>`/`<link>`.
  const unwanted = new RegExp(
    [
      // Scripts, unless a `type` attribute (any quoting) marks them as JSON.
      /<script(?=[\s\/>])(?![^>]*\stype\s*=\s*["']?application\/(?:ld\+)?json["']?[\s\/>])[^>]*>[\s\S]*?<\/script\s*>/.source,
      // Inline stylesheets.
      /<style(?=[\s\/>])[^>]*>[\s\S]*?<\/style\s*>/.source,
      // A non-self-closing <svg> together with everything up to its </svg>.
      /<svg(?=[\s\/>])(?:[^>\/]|\/(?!>))*>[\s\S]*?<\/svg\s*>/.source,
      // Any remaining opening, self-closing or closing link/svg/img/image tag.
      /<\/?(?:link|svg|img|image)(?=[\s\/>])[^>]*>/.source,
    ].join('|'),
    'gi'
  );

  // Attribute matchers applied to the text between the tag name and `>`.
  // The leading `(?:^|\s)` keeps `data-href` / `data-type` from matching.
  const hrefAttr = /(?:^|\s)href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i;
  const jsonTypeAttr = /(?:^|\s)type\s*=\s*["']?(application\/(?:ld\+)?json)["']?(?=[\s\/]|$)/i;

  // Pass 1: comments.
  let cleaned = html.replace(/<!--[\s\S]*?-->/g, '');

  // Pass 2: unwanted elements and tags.
  cleaned = cleaned.replace(unwanted, '');

  // Pass 3: rebuild each opening tag with only the whitelisted attributes.
  // The tag name may contain `-`, `:` or `.` so custom elements keep their
  // full name and still pair up with their closing tag.
  cleaned = cleaned.replace(/<([a-z][\w:.-]*)([^>]*)>/gi, (match, tag, rest) => {
    const attributes = [];

    const hrefMatch = hrefAttr.exec(rest);
    const href = hrefMatch && (hrefMatch[1] || hrefMatch[2] || hrefMatch[3]);
    if (href) {
      // The value is re-emitted in double quotes, so escape any it contains.
      attributes.push(`href="${href.replace(/"/g, '&quot;')}"`);
    }

    if (tag.toLowerCase() === 'script') {
      const typeMatch = jsonTypeAttr.exec(rest);
      if (typeMatch) attributes.push(`type="${typeMatch[1]}"`);
    }

    return attributes.length ? `<${tag} ${attributes.join(' ')}>` : `<${tag}>`;
  });

  // Pass 4: drop empty elements. Removing an inner empty element can make its
  // parent empty, hence the loop. Tags that kept an attribute (links with an
  // href, JSON scripts with their type) never match `<name>` and so survive
  // even when empty.
  let prevHtml;
  do {
    prevHtml = cleaned;
    cleaned = cleaned.replace(/<([a-z][\w:.-]*)>\s*<\/\1\s*>/gi, '');
  } while (cleaned !== prevHtml);

  return cleaned;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment