Skip to content

Instantly share code, notes, and snippets.

@ganapativs
Created February 12, 2025 04:52
Show Gist options
  • Save ganapativs/a2255456f5209a7e9c68909f5187b6c3 to your computer and use it in GitHub Desktop.
Save ganapativs/a2255456f5209a7e9c68909f5187b6c3 to your computer and use it in GitHub Desktop.
Clean an HTML string: remove CSS, scripts, and other unnecessary content, keeping only the HTML tags plus JSON and JSON-LD (schema) script payloads. Useful for feeding HTML content to an LLM for analysis.
// Clean html - remove css, script, keep html tags, json and json schema only.
function cleanHtml(html) {
// First remove HTML comments
let cleaned = html.replace(/<!--[\s\S]*?-->/g, '');
// Remove unwanted tags, but preserve script tags with type="application/json" or type="application/ld+json"
cleaned = cleaned.replace(
/<script(?![^>]*type="application\/(json|ld\+json)")[^>]*>[\s\S]*?<\/script>|<style[^>]*>[\s\S]*?<\/style>|<(link|svg|img|image)[^>]*?>[\s\S]*?<\/\1>|<(link|svg|img|image)([^>]*?)\/?>|<\/(link|svg|img|image)>/gi,
''
);
// Replace any tag's attributes, keeping href and specified script types if they exist
cleaned = cleaned.replace(/<([a-z][a-z0-9]*)(?:[^>]*href="([^"]*)")?(?:[^>]*type="(application\/(?:json|ld\+json))")?[^>]*?>/gi, (match, tag, href, type) => {
const attributes = [];
if (href) attributes.push(`href="${href}"`);
if (type && tag.toLowerCase() === 'script') attributes.push(`type="${type}"`);
return `<${tag}${attributes.length ? ` ${attributes.join(' ')}` : ''}>`;
});
// Remove empty tags (can need multiple passes for nested empty tags)
// But don't remove script tags with allowed types even if empty
let prevHtml;
do {
prevHtml = cleaned;
cleaned = cleaned.replace(/<(?!script\s+type="application\/(json|ld\+json)")([a-z][a-z0-9]*)>\s*<\/\2>/gi, '');
} while (cleaned !== prevHtml);
return cleaned;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment