ekampf · February 28, 2011 18:13
diff --git a/gistfile1.rb b/gistfile1.rb
 # This function cleans up messy HTML that was pasted by a user into a WYSIWYG editor.
 # Specifically, it also handles messy Word/Outlook-generated HTML while keeping its original formatting.

 require 'rubygems'
 require 'sanitize'

  # Cleans up messy HTML pasted into a WYSIWYG editor (including Word/Outlook
  # generated markup) and returns the cleaned HTML as a new String.
  #
  # html - String of raw HTML. It is never modified (frozen strings are fine);
  #        nil is treated as an empty document.
  #
  # Returns a String: the markup filtered through Sanitize with the whitelist
  # below, with empty blocks removed, runs of whitespace collapsed and a
  # newline appended after each closing block tag.
  def clean_up_document(html)
    # 'a' has to be in the element list, otherwise the link attribute and
    # protocol rules configured below never come into play.
    elements = %w[a p b h1 h2 h3 h4 h5 h6 strong li ul ol i br div pre]
    attributes = {
      'a'   => ['href', 'title'],
      'pre' => ['class'],
      'p'   => ['style']
    }
    protocols = { 'a' => { 'href' => ['http', 'https', 'mailto', :relative] } }

    # Character classes instead of (a|b)+ alternations: the same strings match,
    # but there are no overlapping branches (\w already covers "_") for the
    # regex engine to backtrack through on hostile input.
    email_regex = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

    # Pre-sanitize pass. String#gsub returns a copy, so from here on the
    # caller's string is left untouched.
    # Drop embedded CSS, including <style type="..."> and upper-case tags.
    html = html.to_s.gsub(/<style\b[^>]*>.*?<\/style>/mi, '')
    # Drop HTML comments, which may span several lines.
    html = html.gsub(/<!--.*?-->/m, '')
    # Drop line breaks only; "|" is ordinary text and must survive.
    # NOTE(review): line breaks are deleted, not replaced by a space, so words
    # wrapped across lines end up fused. Kept as-is to preserve existing output.
    html = html.gsub(/[\n\r]/, '')

    # keep only the things we want.
    html = Sanitize.clean(html, :elements => elements, :attributes => attributes, :protocols => protocols)

    # Post-sanitize rules as [pattern, replacement] pairs, applied top to bottom.
    rules = [
      # butt up any tags
      [/&nbsp;/, ' '],
      [/>\s+</, '><'],

      # remove email address lines
      [email_regex, '<p>'],

      # cleanup of empty blocks
      # the order of removal is important - this is the way word stacks these elements
      [/<i><\/i>/, ''],
      [/<b><\/b>/, ''],
      [/<\/b><b>/, ''],
      [/<p><\/p>/, ''],
      [/<p><b><\/b><\/p>/, ''],

      # misc - fix butted times
      [/(\d)am /, '\1 am '],
      [/(\d)pm /, '\1 pm '],
      # misc - remove multiple space that may cause doc specific regexs to fail (in dates for example)
      [/\s+/, ' '],

      # add new lines at the end of lines
      [/<\/(p|h\d|dt|dd|dl)>/, '</\1>' + "\n"],
      [/<dl>/, '<dl>' + "\n"]
    ]

    rules.inject(html) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
  end
	# This function cleans up messy HTML that was pasted by a user into a WYSIWYG editor.
	# Specifically, it also handles messy Word/Outlook-generated HTML while keeping its original formatting.

	require 'rubygems'
	require 'sanitize'

	# Cleans up messy HTML pasted into a WYSIWYG editor (including Word/Outlook
	# generated markup) and returns the cleaned HTML as a new String.
	#
	# html - String of raw HTML. It is never modified (frozen strings are fine);
	#        nil is treated as an empty document.
	#
	# Returns a String: the markup filtered through Sanitize with the whitelist
	# below, with empty blocks removed, runs of whitespace collapsed and a
	# newline appended after each closing block tag.
	def clean_up_document(html)
		# 'a' has to be in the element list, otherwise the link attribute and
		# protocol rules configured below never come into play.
		elements = %w[a p b h1 h2 h3 h4 h5 h6 strong li ul ol i br div pre]
		attributes = {
			'a'   => ['href', 'title'],
			'pre' => ['class'],
			'p'   => ['style']
		}
		protocols = { 'a' => { 'href' => ['http', 'https', 'mailto', :relative] } }

		# Character classes rather than alternations: an escaped "\|" inside a
		# group matches a literal pipe, and overlapping branches such as (\w|_)
		# invite heavy backtracking on hostile input.
		email_regex = /<p>Email:\s+[\w.\-]+@(?:[\w\-]+\.)+[a-zA-Z]{2,}/i

		# Pre-sanitize pass. String#gsub returns a copy, so from here on the
		# caller's string is left untouched.
		# Drop embedded CSS, including <style type="..."> and upper-case tags.
		html = html.to_s.gsub(/<style\b[^>]*>.*?<\/style>/mi, '')
		# Drop HTML comments, which may span several lines.
		html = html.gsub(/<!--.*?-->/m, '')
		# Drop line breaks only; "|" is ordinary text and must survive.
		# NOTE(review): line breaks are deleted, not replaced by a space, so words
		# wrapped across lines end up fused. Kept as-is to preserve existing output.
		html = html.gsub(/[\n\r]/, '')

		# keep only the things we want.
		html = Sanitize.clean(html, :elements => elements, :attributes => attributes, :protocols => protocols)

		# Post-sanitize rules as [pattern, replacement] pairs, applied top to bottom.
		rules = [
			# butt up any tags (the entity is spelled out; a bare space would be a no-op)
			[/&nbsp;/, ' '],
			[/>\s+</, '><'],

			# remove email address lines
			[email_regex, '<p>'],

			# cleanup of empty blocks
			# the order of removal is important - this is the way word stacks these elements
			[/<i><\/i>/, ''],
			[/<b><\/b>/, ''],
			[/<\/b><b>/, ''],
			[/<p><\/p>/, ''],
			[/<p><b><\/b><\/p>/, ''],

			# misc - fix butted times
			[/(\d)am /, '\1 am '],
			[/(\d)pm /, '\1 pm '],
			# misc - remove multiple space that may cause doc specific regexs to fail (in dates for example)
			[/\s+/, ' '],

			# add new lines at the end of lines
			[/<\/(p|h\d|dt|dd|dl)>/, '</\1>' + "\n"],
			[/<dl>/, '<dl>' + "\n"]
		]

		rules.inject(html) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
	end
No results found