Skip to content

Instantly share code, notes, and snippets.

@mhingston
Last active December 3, 2015 12:08
Show Gist options
  • Save mhingston/b7b1d4da43041da586e1 to your computer and use it in GitHub Desktop.
Save mhingston/b7b1d4da43041da586e1 to your computer and use it in GitHub Desktop.
A quick and dirty function to convert a string of HTML to plain text. This is primarily intended for generating plain text emails from HTML.
/**
 * Converts a string of HTML into readable plain text (primarily intended for
 * generating the plain-text alternative of an HTML email).
 *
 * This is a regex-based, best-effort conversion, not a real HTML parser:
 * malformed or deeply nested markup of the same tag may not convert perfectly.
 *
 * Conversion rules:
 *  - comments, <style> and <script> blocks are removed entirely
 *  - <a href="url">text</a>           -> "text (url)"
 *  - <h1>..<h6> and <big>              -> upper-cased text (headings followed by a blank line)
 *  - <small>                           -> lower-cased text
 *  - <p>, <div>, <textarea>, <blockquote> -> content followed by a blank line
 *  - <li>                              -> "• content" on its own line
 *  - <th>, <td>                        -> content on its own line
 *  - <sub>, <sup>                      -> "[content]"
 *  - <b>, <strong>, <em>, <i>, <u>     -> "*content*"
 *  - <br>, <hr>                        -> newline
 *  - every other tag is stripped and its content kept
 *  - common named entities and all numeric entities are decoded; unknown named
 *    entities are dropped
 *
 * @param {string} html - HTML markup; null/undefined are treated as an empty
 *                        string and other values are converted with String().
 * @returns {string} The plain-text rendering, trimmed, with at most one blank
 *                   line between paragraphs.
 */
function htmlToPlainText(html)
{
    var text = (html === null || html === undefined) ? "" : String(html)

    // Named entities worth decoding for plain-text output (looked up case-insensitively).
    var NAMED_ENTITIES = {
        amp: "&", lt: "<", gt: ">", quot: "\"", apos: "'", nbsp: " ",
        copy: "\u00A9", reg: "\u00AE", trade: "\u2122", hellip: "\u2026",
        mdash: "\u2014", ndash: "\u2013", lsquo: "\u2018", rsquo: "\u2019",
        ldquo: "\u201C", rdquo: "\u201D", bull: "\u2022", middot: "\u00B7",
        euro: "\u20AC", pound: "\u00A3"
    }

    // Builds a regex for <name ...>content</name>. Group 1 is the tag name, group 2 the content.
    // The lookahead makes the name match exactly (so "b" does not match <br> or <body>) and the
    // backreference makes the closing tag agree with the opening one (so <h1> needs </h1>).
    function paired(name)
    {
        return new RegExp("<(" + name + ")(?=[\\s/>])[^>]*>([\\w\\W]*?)<\\/\\1\\s*>", "gi")
    }

    // Replaces every <name>content</name> with prefix + content + suffix.
    // A replacer function is used so "$" sequences are never interpreted.
    function wrap(name, prefix, suffix)
    {
        text = text.replace(paired(name), function(match, tag, content)
        {
            return prefix + content + suffix
        })
    }

    // Replaces every <name>content</name> with the case-converted content + suffix.
    // Nested tags and entities are left untouched so URLs in attributes and entity names survive.
    function recase(name, method, suffix)
    {
        text = text.replace(paired(name), function(match, tag, content)
        {
            if(!content)
            {
                return ""
            }
            var cased = content.replace(/(<[^>]*>|&#?\w+;)|[^<&]+/g, function(piece, markup)
            {
                return markup ? piece : piece[method]()
            })
            return cased + suffix
        })
    }

    // Converts a Unicode code point to a string, using a surrogate pair above the BMP.
    function fromCodePoint(point)
    {
        if(point <= 0xFFFF)
        {
            return String.fromCharCode(point)
        }
        var offset = point - 0x10000
        return String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
    }

    var i

    // 1. Drop content that must never appear in the text.
    text = text.replace(/<!--[\w\W]*?-->/g, "")
    var removed = ["style", "script"]
    for(i = 0; i < removed.length; i++)
    {
        text = text.replace(paired(removed[i]), "")
    }

    // 2. Case conversions run before anchors are expanded, so link URLs are not re-cased.
    recase("h[1-6]", "toUpperCase", "\n\n")
    recase("big", "toUpperCase", "")
    recase("small", "toLowerCase", "")

    // 3. Anchors: keep the link text and append the target in parentheses.
    text = text.replace(paired("a"), function(match, tag, content)
    {
        // href may be any attribute, quoted with " or ', or unquoted; "\s" before it
        // prevents matching attributes such as data-href.
        var found = /^<a(?:\s[^>]*?)?\shref\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/i.exec(match)
        var url = found ? (found[1] || found[2] || found[3] || "").trim() : ""
        if(!url)
        {
            return content
        }
        return content ? content + " (" + url + ")" : url
    })

    // 4. Block-level elements. Each tag gets its own pass so nested blocks of
    //    different kinds (e.g. <p> inside <div>) are all converted.
    var blocks = ["p", "div", "textarea", "blockquote"]
    for(i = 0; i < blocks.length; i++)
    {
        wrap(blocks[i], "", "\n\n")
    }
    wrap("li", "\u2022 ", "\n")
    wrap("th", "", "\n")
    wrap("td", "", "\n")

    // 5. Inline decorations.
    wrap("sub", "[", "]")
    wrap("sup", "[", "]")
    var emphasised = ["b", "strong", "em", "i", "u"]
    for(i = 0; i < emphasised.length; i++)
    {
        wrap(emphasised[i], "*", "*")
    }

    // 6. Line breaks and rules (also swallow the source whitespace that follows them).
    text = text.replace(/<[bh]r(?=[\s\/>])[^>]*>\s*/gi, "\n")

    // 7. Strip every remaining tag, keeping its content. This also covers pure pass-through
    //    elements (span, font, code, pre, q, ...). Only "<" followed by a letter, "/", "!"
    //    or "?" is treated as a tag so text such as "a < b" survives.
    text = text.replace(/<[a-z\/!?][^>]*>/gi, "")

    // 8. Decode entities in a single pass, after tag stripping, so "&lt;b&gt;" becomes the
    //    literal text "<b>" and "&amp;lt;" becomes "&lt;" rather than "<".
    text = text.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi, function(match, code)
    {
        if(code.charAt(0) === "#")
        {
            var isHex = code.charAt(1) === "x" || code.charAt(1) === "X"
            var point = parseInt(code.slice(isHex ? 2 : 1), isHex ? 16 : 10)
            return (point > 0 && point <= 0x10FFFF) ? fromCodePoint(point) : ""
        }
        var key = code.toLowerCase()
        return Object.prototype.hasOwnProperty.call(NAMED_ENTITIES, key) ? NAMED_ENTITIES[key] : ""
    })

    // 9. Normalise whitespace: unify line endings, drop indentation and trailing blanks around
    //    newlines, and allow at most one blank line in a row.
    text = text.replace(/\r\n?/g, "\n")
    text = text.replace(/[ \t]*\n[ \t]*/g, "\n")
    text = text.replace(/\n{3,}/g, "\n\n")
    return text.trim()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment