Created
February 28, 2011 18:13
-
-
Save ekampf/847741 to your computer and use it in GitHub Desktop.
A Ruby snippet to sanitize HTML (and specifically Microsoft Word's messy HTML)
(based on https://gist.github.com/139987)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This function cleans up messy HTML that was pasted by a user into a WYSIWYG editor.
# Specifically, it also handles messy Word/Outlook-generated HTML while keeping its original formatting.
require 'rubygems' | |
require 'sanitize' | |
# Cleans up HTML pasted into a WYSIWYG editor (notably Word/Outlook output).
#
# The pipeline is: strip <style> blocks, HTML comments and line breaks, run the
# markup through Sanitize with a small whitelist, then tidy what is left
# (empty inline/paragraph tags, "Email:" lines, butted am/pm times, runs of
# whitespace) and finally put one closing block tag per output line.
#
# The input string is never modified; all work happens on a copy.
#
# @param html [String, nil] the raw markup; nil is treated as an empty string
# @return [String] the cleaned markup
def clean_up_document(html)
  # Work on a private copy: the in-place gsub! calls below must not mutate the
  # caller's string (and must not blow up when that string is frozen).
  html = html.to_s.dup

  # 'a' has to be whitelisted for the attribute/protocol rules below to have
  # any effect; without it every link is stripped down to its text.
  elements = %w[a p b h1 h2 h3 h4 h5 h6 strong li ul ol i br div pre]
  attributes = {
    'a' => ['href', 'title'],
    'pre' => ['class'],
    'p' => ['style']
  }
  protocols = { 'a' => { 'href' => ['http', 'https', 'mailto', :relative] } }
  # Character classes instead of nested alternations: same matches, far less
  # backtracking on hostile input.
  email_regex = /<p>Email:\s+[\w.-]+@(?:[\w-]+\.)+[a-zA-Z]{2,}/i

  # Pre-sanitize: drop content whose text we never want to survive.
  # <style ...> may carry attributes (e.g. type="text/css"), so allow them.
  html.gsub!(%r{<style\b[^>]*>.*?</style>}mi, '')
  html.gsub!(/<!--.*?-->/m, '')
  # Only CR/LF here; a '|' inside the class would also delete literal pipes.
  html.gsub!(/[\r\n]/, '')

  # keep only the things we want.
  # NOTE(review): newer Sanitize releases name this entry point
  # Sanitize.fragment -- confirm against the installed gem version.
  config = { :elements => elements, :attributes => attributes, :protocols => protocols }
  html = Sanitize.clean(html, config)

  # butt up any tags
  # Non-breaking spaces (as an entity or as the literal U+00A0 character)
  # become plain spaces so the whitespace rules below can see them.
  html.gsub!(/&nbsp;|\u00A0/, ' ')
  html.gsub!(/>\s+</, '><')

  # remove email address lines (the now-empty <p></p> is removed just below)
  html.gsub!(email_regex, '<p>')

  # post sanitize cleanup of empty blocks
  # the order of removal is important - this is the way Word stacks these elements
  html.gsub!(%r{<i></i>}, '')
  html.gsub!(%r{<b></b>}, '')
  html.gsub!(%r{</b><b>}, '')
  html.gsub!(%r{<p></p>}, '')
  html.gsub!(%r{<p><b></b></p>}, '')

  # misc - fix butted times ("9am " -> "9 am ")
  html.gsub!(/(\d)([ap]m) /, '\1 \2 ')

  # misc - collapse whitespace runs that may cause doc specific regexes to
  # fail (in dates for example)
  html.gsub!(/\s+/, ' ')

  # add new lines at the end of lines
  html.gsub!(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
  html.gsub!(/<dl>/, "<dl>\n")

  html
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment