Last active
December 3, 2015 12:08
-
-
Save mhingston/b7b1d4da43041da586e1 to your computer and use it in GitHub Desktop.
A quick-and-dirty function to convert a string of HTML to plain text. It is primarily intended for generating plain-text emails from HTML.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Convert a string of HTML into readable plain text (e.g. for the text part
 * of an HTML email). This is a regex-based best-effort conversion, not a
 * real HTML parser.
 *
 * Formatting conventions:
 *  - links become `text (url)`
 *  - paragraphs, divs, textareas and blockquotes end with a blank line
 *  - list items are prefixed with a bullet, table cells end with a newline
 *  - headings and <big> are upper-cased, <small> is lower-cased
 *  - bold / strong / em / italic / underline are wrapped in `*`
 *  - sub / sup are wrapped in `[...]`
 *  - comments, <style> and <script> blocks are dropped entirely
 *  - common character references are decoded; unknown ones are removed
 *
 * @param {string} html - HTML markup. `null`/`undefined` yield an empty
 *     string; other non-string values are converted with `String()`.
 * @returns {string} Plain text with runs of blank lines collapsed to one
 *     and leading/trailing whitespace trimmed.
 */
function htmlToPlainText(html) {
  if (html === null || html === undefined) {
    return "";
  }

  // Character references worth decoding for plain-text output. A
  // non-breaking space is mapped to a regular space on purpose.
  var NAMED_ENTITIES = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: "\"",
    apos: "'",
    nbsp: " ",
    copy: "\u00A9",
    reg: "\u00AE",
    trade: "\u2122",
    hellip: "\u2026",
    ndash: "\u2013",
    mdash: "\u2014",
    lsquo: "\u2018",
    rsquo: "\u2019",
    ldquo: "\u201C",
    rdquo: "\u201D",
    bull: "\u2022",
    euro: "\u20AC",
    pound: "\u00A3"
  };
  var hasOwn = Object.prototype.hasOwnProperty;

  // Build a renderer that surrounds the element's content with fixed text.
  function wrapWith(prefix, suffix) {
    return function (inner) {
      return prefix + inner + suffix;
    };
  }

  // Build a renderer that changes the content's case; empty elements vanish.
  function changeCase(method, suffix) {
    return function (inner) {
      return inner ? inner[method]() + suffix : "";
    };
  }

  // Ordered element rules. Order matters: each rule is applied to the whole
  // text before the next one runs. Elements that only need unwrapping
  // (span, font, code, ...) have no rule; the final tag strip handles them.
  var ELEMENT_RULES = [
    ["p", wrapWith("", "\n\n")],
    ["div", wrapWith("", "\n\n")],
    ["textarea", wrapWith("", "\n\n")],
    ["li", wrapWith("\u2022 ", "\n")],
    ["h[1-6]", changeCase("toUpperCase", "\n\n")],
    ["sub", wrapWith("[", "]")],
    ["sup", wrapWith("[", "]")],
    ["th", wrapWith("", "\n")],
    ["td", wrapWith("", "\n")],
    ["b", wrapWith("*", "*")],
    ["strong", wrapWith("*", "*")],
    ["em", wrapWith("*", "*")],
    ["i", wrapWith("*", "*")],
    ["u", wrapWith("*", "*")],
    ["big", changeCase("toUpperCase", "")],
    ["small", changeCase("toLowerCase", "")],
    ["blockquote", wrapWith("", "\n\n")]
  ];

  // Replace every <tag ...>content</tag> pair with render(content).
  // The \b after the tag name stops `<b` from matching `<br>`, `<p` from
  // matching `<pre>`, `<u` from matching `<ul>`, and so on.
  function replaceElement(source, tag, render) {
    var pattern = new RegExp(
      "<" + tag + "\\b[^>]*>([\\w\\W]*?)<\\/" + tag + "\\s*>",
      "gi"
    );
    return source.replace(pattern, function (match, inner) {
      return render(inner);
    });
  }

  // Turn a numeric code point into a string; invalid values become "".
  function codePointToString(codePoint) {
    var isValid = codePoint > 0 && codePoint <= 0x10FFFF &&
      !(codePoint >= 0xD800 && codePoint <= 0xDFFF);
    if (!isValid) {
      return "";
    }
    if (codePoint <= 0xFFFF) {
      return String.fromCharCode(codePoint);
    }
    // Encode astral code points as a UTF-16 surrogate pair.
    var offset = codePoint - 0x10000;
    return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  }

  // Decode one character reference; unknown named references are removed.
  function decodeEntity(match, body) {
    if (body.charAt(0) === "#") {
      var isHex = body.charAt(1).toLowerCase() === "x";
      return codePointToString(parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10));
    }
    // Lower-cased lookup: heading/big rules may have upper-cased the name.
    var key = body.toLowerCase();
    return hasOwn.call(NAMED_ENTITIES, key) ? NAMED_ENTITIES[key] : "";
  }

  var text = String(html);

  // Drop content that must never reach the reader.
  text = text.replace(/<!--[\w\W]*?-->/g, "");
  text = text.replace(/<(style|script)\b[^>]*>[\w\W]*?<\/\1\s*>/gi, "");

  // Links: href may appear anywhere in the tag, with or without further
  // attributes; the link text is matched lazily so adjacent links stay apart.
  text = text.replace(
    /<a\s(?:[^>]*?\s)?href\s*=\s*['"]([^'"\s]+)['"][^>]*>([\w\W]*?)<\/a\s*>/gi,
    "$2 ($1)"
  );

  for (var i = 0; i < ELEMENT_RULES.length; i++) {
    text = replaceElement(text, ELEMENT_RULES[i][0], ELEMENT_RULES[i][1]);
  }

  text = text.replace(/<br\b[^>]*>\s*/gi, "\n");
  text = text.replace(/<hr\b[^>]*>\s*/gi, "\n");

  // Remove any remaining tags, then decode references. Decoding happens
  // last so that an escaped "&lt;b&gt;" is not mistaken for markup.
  text = text.replace(/<[^>]+>/g, "");
  text = text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi, decodeEntity);

  // Collapse runs of blank lines. Only whitespace is touched, so bullets,
  // asterisks, punctuation and non-ASCII letters survive.
  return text.replace(/\n\s*\n\s*/g, "\n\n").trim();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment