Created
August 27, 2010 07:01
-
-
Save rhulse/552955 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This code is designed to clean up Word HTML to a very great extent.
# It is used at Radio NZ to clean up documents created in Word prior to
# parsing them, line by line, to extract content.
# The Word content is pasted into WYSIWYG which is then HTTP POSTed into | |
# the app where the string is cleaned by this code. | |
# We add new lines after block elements because the next stage is a line-based | |
# parser | |
# NB: The text you paste into your WYSIWYG should NOT have any smart tags. These
# can be stripped from a document by going to Tools : Autocorrect : Smart Tags, and
# clicking on "Remove Smart Tags".
# A basic test for smart tags (so you can reject it) is: | |
# def check_for_smarttags(html) | |
# html =~ %r{</o:smarttagtype>} | |
# end | |
# | |
# For an example of how it is used in practice see gist: http://gist.github.com/552971 | |
# | |
# Richard Hulse. 27 August 2010 | |
# Copyright (c) Radio New Zealand Limited 2010 | |
# MIT license | |
# Permission is hereby granted, free of charge, to any person obtaining | |
# a copy of this software and associated documentation files (the | |
# "Software"), to deal in the Software without restriction, including | |
# without limitation the rights to use, copy, modify, merge, publish, | |
# distribute, sublicense, and/or sell copies of the Software, and to | |
# permit persons to whom the Software is furnished to do so, subject to | |
# the following conditions: | |
# The above copyright notice and this permission notice shall be | |
# included in all copies or substantial portions of the Software. | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | |
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | |
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
require 'rubygems' | |
require 'sanitize' | |
# Cleans up HTML pasted from Microsoft Word so that a later, line-based
# parser can consume it: Word-specific markup is removed, only a whitelist of
# elements is kept, and every block element ends up on its own line.
class ParserCore
  def initialize
    # Not read by any method in this class.
    @error_messages = []
  end

  # Cleans a chunk of Word-generated HTML.
  #
  # The elements and attributes to keep are in Sanitize syntax.
  #
  # @param dirty_html [String] raw HTML as pasted/posted; it is not modified
  # @param elements [Array<String>] element names that survive sanitising
  # @param attributes [Hash] attribute whitelist, passed straight to Sanitize
  # @return [String] cleaned HTML with a "\n" after each closing block element
  def tidy_html(dirty_html, elements = ['p', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'], attributes = {})
    # dup rather than clone: clone copies the frozen flag, which would make
    # the in-place substitutions below raise on a frozen input.
    html = dirty_html.dup

    # Remove line breaks. '|' is deliberately NOT in the character class:
    # inside [...] it is a literal pipe and would be deleted from the content.
    html.gsub!(/[\n\r]/, '')

    # remove some Word cruft
    html.gsub!(%r{</?o:p>}i, '')

    # fix some broken tags ('<  p>' becomes '<p>')
    html.gsub!(/<\s+/, '<')

    # Tidy is used prior to Sanitize as it is more robust in certain edge cases.
    # You MAY be able to remove this. YMMV.
    html = tidy(html)

    # keep only the things we want.
    html = strip_tags(html, elements, attributes)

    # Tags separated only by whitespace must be reduced to a single space
    # (not butted up), otherwise characters get run together,
    # e.g. '<p><b>this</b> <b>is a test.</b></p>'.
    # MS Word does output this sort of thing.
    html.gsub!(%r{<i>\s+</i>}, ' ')
    html.gsub!(%r{<b>\s+</b>}, ' ')
    html.gsub!(%r{</b>\s+<b>}, ' ')

    # Remove redundant empty tags. The order matters: the last rule catches
    # '<p><b></b></p>' left behind after a nested '<b><b></b></b>' collapses.
    html.gsub!(%r{<i></i>}, '')
    html.gsub!(%r{<b></b>}, '')
    html.gsub!(%r{<p></p>}, '')
    html.gsub!(%r{<p><b></b></p>}, '')

    # Turn non-breaking spaces into plain spaces, then butt up any remaining tags.
    # NOTE(review): the reviewed copy showed a bare space in this pattern,
    # presumably a decoded '&nbsp;' - both the entity and the literal
    # U+00A0 character are handled here; confirm against the real input.
    html.gsub!(/&nbsp;|\u00A0/, ' ')
    html.gsub!(/>\s+</, '><')

    # add new lines at the end of block elements (the next stage is line-based)
    html.gsub!(%r{</(p|h\d|dt|dd|dl)>}, "</\\1>\n")
    html.gsub!(/<dl>/, "<dl>\n")
    html
  end

  # Strips every element/attribute that is not whitelisted.
  #
  # NOTE(review): newer releases of the sanitize gem may expose this as
  # Sanitize.fragment instead of Sanitize.clean - confirm for the gem
  # version in use.
  #
  # @param dirty_html [String] HTML to sanitise
  # @param elements [Array<String>] element names to keep
  # @param attributes [Hash] attribute whitelist in Sanitize syntax
  # @return [String] sanitised HTML without leading/trailing whitespace
  def strip_tags(dirty_html, elements = [], attributes = {})
    Sanitize.clean(dirty_html, :elements => elements, :attributes => attributes).strip
  end

  # Removes clock times such as "7:01" or "12.30" from a string.
  #
  # @param dirty_html [String] text that may contain times; it is not modified
  # @return [String] the text without times and without surrounding whitespace
  def strip_time(dirty_html)
    dirty_html.gsub(/\d{1,2}[.:]\d{2}/, '').strip
  end

  # Pipes HTML through the external `tidy` command.
  #
  # Diagnostics from tidy go to log/tidy_errors.log under Rails.root. If tidy
  # cannot be started, the pipe breaks, or tidy produces no output, the input
  # is returned unchanged so the rest of the clean-up can still run.
  #
  # @param dirty_html [String] HTML to feed to tidy
  # @return [String] tidy's output, or dirty_html when tidy yields nothing
  def tidy(dirty_html)
    error_file = File.join(Rails.root, '/log/tidy_errors.log')
    tidy_options = '--word-2000 1 --indent 0 --bare 1 --wrap 0 --show-body-only 1 --drop-empty-paras 1 --force-output yes -utf8'
    cleaned_html = nil

    begin
      # The block form closes the pipe (and reaps the child) on every path.
      IO.popen("tidy -f #{error_file} #{tidy_options}", 'w+') do |pipe|
        pipe.write(dirty_html)
        pipe.close_write
        cleaned_html = pipe.read
      end
    rescue Errno::EPIPE, Errno::ENOENT => e
      # Interpolate the message: String + Exception raises TypeError.
      $stderr.puts "Running 'tidy' failed: #{e.message}"
    end

    return dirty_html if cleaned_html.nil? || cleaned_html.empty?

    cleaned_html
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment