Last active
April 30, 2024 19:47
-
-
Save Korayem/81059ab7bdf175582ef4fc23f10165aa to your computer and use it in GitHub Desktop.
JavaScript-only solution to sanitize HTML content and then convert it to Markdown. All the existing solutions were npm packages with tons of dependencies, and I needed a simple solution that runs client-side in the browser.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Generated by GPT-4-turbo
------------------------------------------------------------------------------------------------------------
Prompt 1:
Let's create a function that cleans up an input HTML string:
1. Remove all script tags
2. Remove all iframe tags
3. Remove `<head>`
4. Remove CSS style tags
5. Remove `<link>` tags
6. Remove `<noscript>` tags
7. Remove any attributes on the html and body tags
8. Remove <!DOCTYPE html>, even if it has attributes
9. Remove any HTML comments
10. Clean any attributes from all tags
11. Finally, remove any tags with no content inside, like `<div></div>`
12. Don't use any loops like `while`
Note: the runtime I am in doesn't support DOMParser or node packages. The answer should be vanilla JS.
Take a deep breath and write it in JavaScript. The function name is `cleanHtml()`.
------------------------------------------------------------------------------------------------------------
Prompt 2:
Now write a method htmlToMarkdown() to convert the output to Markdown format.
*/
/**
 * Strips non-content markup from an HTML string using regular expressions only
 * (no DOMParser, no dependencies).
 *
 * Removed entirely: `<script>`, `<iframe>`, `<head>`, `<style>`, `<noscript>`
 * elements (including their content), `<link>` tags, the DOCTYPE declaration
 * and HTML comments. Every remaining opening tag loses its attributes, and
 * elements with no content at all (e.g. `<div></div>`) are dropped.
 *
 * The whole pipeline is repeated until the output stops changing. A single
 * pass is not enough: removing an inner match can splice a new forbidden tag
 * together (`<scr<script></script>ipt>` becomes `<script>`), and removing an
 * empty child can leave its parent empty.
 *
 * NOTE: this is a best-effort cleaner, not a security-grade sanitizer. Known
 * limitations of the regex approach: an attribute value containing `>` ends
 * the tag early, and unclosed `<script>`/`<style>` elements are left in place.
 * Because all attributes are stripped, `<a>` tags lose their `href`.
 *
 * @param {string} html - Raw HTML. `null`/`undefined` yield an empty string;
 *   any other value is converted with `String()`.
 * @returns {string} The cleaned HTML.
 */
function cleanHtml(html) {
  if (html == null) {
    return '';
  }

  // Ordered [pattern, replacement] steps. `\b` after each tag name stops
  // `<head` from matching `<header>`, `<link` from matching `<linked>`, etc.
  // Closing tags use `\b[^>]*>` so that `</script >` is recognised too.
  const steps = [
    [/<script\b[^>]*>[\s\S]*?<\/script\b[^>]*>/gi, ''],
    [/<iframe\b[^>]*>[\s\S]*?<\/iframe\b[^>]*>/gi, ''],
    [/<head\b[^>]*>[\s\S]*?<\/head\b[^>]*>/gi, ''],
    [/<style\b[^>]*>[\s\S]*?<\/style\b[^>]*>/gi, ''],
    [/<link\b[^>]*>/gi, ''],
    [/<noscript\b[^>]*>[\s\S]*?<\/noscript\b[^>]*>/gi, ''],
    // Attributes on <html> and <body>.
    [/<(html|body)\b[^>]*>/gi, '<$1>'],
    [/<!DOCTYPE[^>]*>/gi, ''],
    [/<!--[\s\S]*?-->/g, ''],
    // Attributes on every other opening tag (also turns `<br/>` into `<br>`).
    [/<([a-zA-Z0-9]+)[^>]*>/g, '<$1>'],
    // Elements with nothing between the opening and closing tag.
    [/<([a-zA-Z0-9]+)><\/\1>/gi, ''],
  ];

  // Every step either leaves the text untouched or makes it strictly shorter,
  // so iterating to a fixed point always terminates.
  let current = String(html);
  let previous;
  do {
    previous = current;
    current = steps.reduce(
      (text, [pattern, replacement]) => text.replace(pattern, replacement),
      previous,
    );
  } while (current !== previous);

  return current;
}
/**
 * Converts a small subset of HTML to Markdown using regular expressions.
 *
 * Supported elements: `<h1>`-`<h6>`, `<p>`, `<br>`, `<strong>`/`<b>`,
 * `<em>`/`<i>`, `<ul>`/`<ol>` with `<li>` items, and `<a href="...">`.
 * Any other tag is stripped and its text content kept.
 *
 * Tags may carry attributes and element content may span several lines, so
 * the function works on raw HTML as well as on the output of `cleanHtml()`.
 * Note that `cleanHtml()` strips `href`, so anchors in its output fall through
 * to plain text.
 *
 * Limitations: nested lists are not supported (the first closing list tag
 * ends the list), and HTML entities are passed through unchanged.
 *
 * @param {string} html - HTML to convert. `null`/`undefined` yield an empty
 *   string; any other value is converted with `String()`.
 * @returns {string} Markdown text, trimmed, with runs of three or more
 *   newlines collapsed to a single blank line.
 */
function htmlToMarkdown(html) {
  if (html == null) {
    return '';
  }

  // Turns each <li> in a list body into one Markdown line. Whitespace around
  // the items is consumed so source indentation does not leak into the output;
  // the extra trailing newline separates the list from whatever follows.
  const renderItems = (listBody, nextMarker) => {
    const lines = listBody.replace(
      /\s*<li\b[^>]*>([\s\S]*?)<\/li\s*>\s*/gi,
      (_, item) => `${nextMarker()} ${item.trim()}\n`,
    );
    return `${lines}\n`;
  };

  let markdown = String(html);

  // Headings: blank line afterwards so the heading does not fuse with the
  // text that follows it.
  markdown = markdown.replace(
    /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1\s*>/gi,
    (_, level, content) => `${'#'.repeat(Number(level))} ${content.trim()}\n\n`,
  );

  // Paragraphs
  markdown = markdown.replace(
    /<p\b[^>]*>([\s\S]*?)<\/p\s*>/gi,
    (_, content) => `${content}\n\n`,
  );

  // Line breaks, including the self-closing forms `<br/>` and `<br />`.
  markdown = markdown.replace(/<br\b[^>]*>/gi, '\n');

  // Bold and italic. `\b` keeps `<b` from matching `<br>`/`<body>` and `<i`
  // from matching `<img>`.
  markdown = markdown.replace(
    /<(strong|b)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi,
    (_, tag, content) => `**${content}**`,
  );
  markdown = markdown.replace(
    /<(em|i)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi,
    (_, tag, content) => `*${content}*`,
  );

  // Unordered lists
  markdown = markdown.replace(
    /<ul\b[^>]*>([\s\S]*?)<\/ul\s*>/gi,
    (_, body) => renderItems(body, () => '*'),
  );

  // Ordered lists: numbering restarts for every list.
  markdown = markdown.replace(/<ol\b[^>]*>([\s\S]*?)<\/ol\s*>/gi, (_, body) => {
    let counter = 0;
    return renderItems(body, () => {
      counter += 1;
      return `${counter}.`;
    });
  });

  // Links: href may be single- or double-quoted and need not be the first
  // attribute. `\s` before `href` avoids matching `data-href`. Anchors without
  // a non-empty href are handled by the generic tag strip below.
  markdown = markdown.replace(
    /<a\b[^>]*?\shref\s*=\s*(?:"([^"]+)"|'([^']+)')[^>]*>([\s\S]*?)<\/a\s*>/gi,
    (_, doubleQuoted, singleQuoted, content) => {
      const href = doubleQuoted !== undefined ? doubleQuoted : singleQuoted;
      return `[${content}](${href})`;
    },
  );

  // Strip all other HTML tags
  markdown = markdown.replace(/<[^>]+>/g, '');

  // Block conversions can stack blank lines; keep at most one.
  markdown = markdown.replace(/\n{3,}/g, '\n\n');

  return markdown.trim();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment