Created
February 14, 2014 13:37
-
-
Save HoraceBury/9001099 to your computer and use it in GitHub Desktop.
Cleans rich text so that HTML is cleanly removed, p and br tags are reduced to new lines, and some special characters are replaced with their text equivalents.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
local t = [[ your html ]]

-- Ordered list of { pattern, replacement } pairs; each pattern is a Lua
-- pattern handed to string.gsub. The order is important to avoid conflicts:
--   1. typographic characters and their entities are flattened to ASCII,
--   2. "&amp;" is decoded only after the other entities, so that escaped
--      text such as "&amp;mdash;" stays the literal text "&mdash;",
--   3. <br> and </p> become new lines before the remaining tags are dropped,
--   4. line endings are normalised and surrounding new lines are trimmed.
local cleaner = {
  -- each character is matched as literal UTF-8 text and as named/numeric entity
  { "—", "-" }, { "&mdash;", "-" }, { "&#8212;", "-" },      -- em dash
  { "–", "-" }, { "&ndash;", "-" }, { "&#8211;", "-" },      -- en dash
  { "’", "'" }, { "&rsquo;", "'" }, { "&#8217;", "'" },      -- right single quote
  { "“", "\"" }, { "&ldquo;", "\"" }, { "&#8220;", "\"" },   -- left double quote
  { "”", "\"" }, { "&rdquo;", "\"" }, { "&#8221;", "\"" },   -- right double quote
  -- non-breaking space: U+00A0 is the byte pair 194 160 in UTF-8; written as
  -- an escape so the invisible character cannot be mistaken for a plain space
  { "\194\160", " " }, { "&nbsp;", " " }, { "&#160;", " " },
  { "&amp;", "&" },              -- decode ampersands (after the other entities)
  { "<[bB][rR]%s*/?>", "\n" },   -- <br> <br/> <br /> in either case become new lines
  { "</[pP]%s*>", "\n" },        -- ends of paragraphs become new lines
  { "%b<>", "" },                -- all other html elements are removed (must follow the tag rules above)
  { "\r", "\n" },                -- carriage returns become new lines
  { "\n+", "\n" },               -- collapse runs of new lines into a single new line
  { "^\n+", "" },                -- trim new lines from the start...
  { "\n+$", "" },                -- ... and from the end
}

--- Strips HTML from a string by applying every rule in `cleaner`, in order.
-- @param html string containing HTML markup
-- @return the cleaned plain-text string
local function clean_html(html)
  for i = 1, #cleaner do
    local rule = cleaner[i]
    -- gsub also returns a match count; the single assignment keeps only the string
    html = string.gsub(html, rule[1], rule[2])
  end
  return html
end

-- clean html from the string
t = clean_html(t)
print("[" .. t .. "]") -- print the string with end indicators
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Flawless, many thanks.