Last active
February 8, 2017 16:22
-
-
Save bofrede/4595889 to your computer and use it in GitHub Desktop.
Export Word documents as HTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
=begin | |
This script requires a Ruby interpreter to run:
http://rubyinstaller.org/ | |
This script also requires Microsoft Windows and Microsoft Word to be installed. | |
A few libraries, used by this script: | |
HTML Sanitizer: | |
https://github.com/rgrove/sanitize/ | |
HTML parser and modifier: | |
http://nokogiri.org/ | |
Tk user interface: | |
http://www.tkdocs.com/tutorial/windows.html#dialogs | |
For more information on the Word Document class see: | |
http://msdn.microsoft.com/en-us/library/bb244898(v=office.12).aspx | |
Document.saveas http://msdn.microsoft.com/en-us/library/bb221597.aspx | |
Document.saveas2 http://msdn.microsoft.com/en-us/library/ff836084(v=office.14).aspx | |
msoEncoding values http://msdn.microsoft.com/en-us/library/office/aa432511(v=office.12).aspx | |
=end | |
# Load the libraries this script depends on. If any of them is missing, print
# an installation hint and stop with a non-zero exit status so that calling
# shells or batch files can detect the failure.
begin
  require 'win32ole'
  require 'tk'
  require 'sanitize'
  require 'nokogiri' # Used directly below; previously only loaded implicitly by sanitize.
rescue LoadError => le
  warn "LoadError: #{le.message}"
  warn "Run: gem install win32ole tk sanitize"
  exit 1
end
# Symbolic names for the values of Word's WdSaveFormat enumeration, passed as
# the 'FileFormat' argument of Document.SaveAs. Several names share a value
# (for example :document / :document_97), mirroring the enumeration itself.
# Frozen so the lookup table cannot be modified by accident at runtime.
# From: http://msdn.microsoft.com/en-us/library/bb238158(v=office.12).aspx
WD_FORMAT = {
  :document                   => 0,  # Microsoft Office Word format.
  :document_97                => 0,  # Microsoft Word 97 document format.
  :template                   => 1,  # Word template format.
  :template_97                => 1,  # Word 97 template format.
  :text                       => 2,  # Microsoft Windows text format.
  :text_line_breaks           => 3,  # Windows text format with line breaks preserved.
  :dos_text                   => 4,  # Microsoft DOS text format.
  :dos_text_line_breaks       => 5,  # Microsoft DOS text with line breaks preserved.
  :rtf                        => 6,  # Rich text format (RTF).
  :encoded_text               => 7,  # Encoded text format.
  :unicode_text               => 7,  # Unicode text format.
  :html                       => 8,  # Standard HTML format.
  :web_archive                => 9,  # Web archive format.
  :filtered_html              => 10, # Filtered HTML format.
  :xml                        => 11, # Extensible Markup Language (XML) format.
  :xml_document               => 12, # XML document format.
  :xml_document_macro_enabled => 13, # XML document format with macros enabled.
  :xml_template               => 14, # XML template format.
  :xml_template_macro_enabled => 15, # XML template format with macros enabled.
  :document_default           => 16, # Word default document file format. For Microsoft Office Word 2007, this is the DOCX format.
  :pdf                        => 17, # PDF format.
  :xps                        => 18  # XPS format.
}.freeze
# Sanitize configuration describing the HTML that survives the clean-up:
# * comments are kept,
# * <script> and <style> are removed together with their contents,
# * only the listed elements and, per element, the listed attributes remain,
# * URL-carrying attributes are limited to the listed protocols.
# Frozen (shallowly) so the shared configuration cannot be reassigned into by accident.
WHITE_LIST = {
  :allow_comments => true,
  :remove_contents => ['script', 'style'],
  :elements => %w{
    html head title link meta body
    h1 h2 h3 h4 h5 h6 p
    dd dl dt li ol ul
    caption col colgroup table tbody td tfoot th thead tr
    a abbr b blockquote br cite code del dfn div em figcaption figure hgroup i img ins kbd mark
    pre q rp rt ruby s samp small strike strong sub sup time var wbr
  },
  :attributes => {
    :all         => ['title', 'id', 'class'],
    'html'       => ['lang'],
    'meta'       => ['http-equiv', 'name', 'content'],
    'a'          => ['href', 'name'],
    'blockquote' => ['cite'],
    'col'        => ['span', 'width'],
    'colgroup'   => ['span', 'width'],
    'del'        => ['cite', 'datetime'],
    'img'        => ['align', 'alt', 'height', 'src', 'width'],
    'ins'        => ['cite', 'datetime'],
    'ol'         => ['start', 'reversed', 'type'],
    'q'          => ['cite'],
    'table'      => ['border', 'summary', 'width'],
    'td'         => ['abbr', 'axis', 'colspan', 'rowspan', 'width'],
    'th'         => ['abbr', 'axis', 'colspan', 'rowspan', 'scope', 'width'],
    'time'       => ['datetime', 'pubdate'],
    'ul'         => ['type']
  },
  :protocols => {
    'a'          => {'href' => ['ftp', 'http', 'https', 'mailto', :relative]},
    'blockquote' => {'cite' => ['http', 'https', :relative]},
    'del'        => {'cite' => ['http', 'https', :relative]},
    'img'        => {'src' => ['http', 'https', :relative, 'data']},
    'ins'        => {'cite' => ['http', 'https', :relative]},
    'q'          => {'cite' => ['http', 'https', :relative]}
  }
}.freeze
# Word-generated CSS classes whose class attribute is stripped from every
# matching element during clean-up.
UNWANTED_CLASSES = %w{MsoNormal MsoBodyText NormalBold MsoHeader Templatehelp
                      TOCEntry Indent1 MsoCaption MsoListParagraph
                      MsoNormalTable MsoTableGrid MsoTableClassic1}.freeze

# Reads the filtered HTML file written by Word and returns its content as a
# UTF-8 string with an HTML5 doctype prepended.
#
# Word ignores the encoding requested in SaveAs, so the file is read as
# windows-1252 and transcoded. The block form guarantees the handle is closed.
def read_word_html(html_file_name)
  File.open(html_file_name, 'r:windows-1252:utf-8') do |html_file|
    puts "HTML file encoding #{html_file.external_encoding.name}"
    '<!DOCTYPE html>' + html_file.read
  end
end

# Parses +html+, sanitizes it against WHITE_LIST, removes Word-specific
# leftovers and returns the result serialized as UTF-8 HTML.
def sanitize_word_html(html)
  html_document = Nokogiri::HTML::Document.parse(html)

  sanitizer = Sanitize.new(WHITE_LIST)
  # Sanitize 3.0 renamed clean_node! to node!; use whichever this version offers.
  if sanitizer.respond_to?(:node!)
    sanitizer.node!(html_document)
  else
    sanitizer.clean_node!(html_document)
  end

  root = html_document.at_css('html')
  root['lang'] = 'en-US' if root

  # The Generator meta tag is not guaranteed to exist, so guard the removal.
  generator = html_document.at_css('meta[name="Generator"]')
  generator.remove if generator

  # Remove page numbers from TOC. The text is assigned via content= so that
  # characters such as '<' or '&' are escaped instead of re-parsed as markup.
  html_document.css('.MsoToc1 a, .MsoToc2 a').each do |item|
    item.content = item.inner_text.sub(/(\s+\d+)\Z/, '')
  end

  # Remove Word's "normal" classes.
  UNWANTED_CLASSES.each do |class_name|
    html_document.css(".#{class_name}").each do |node|
      node.remove_attribute('class')
    end
  end

  # Remove abandoned anchors that are not linked to. All link targets are
  # collected once up front: this avoids one CSS query per anchor and does not
  # break on anchor names containing quote characters.
  linked_names = {}
  html_document.css('a[href]').each do |link|
    href = link['href']
    linked_names[href[1..-1]] = true if href.start_with?('#')
  end
  html_document.css('a[name]').each do |a|
    next if linked_names.key?(a['name'])
    puts "<a name=\"#{a['name']}\"> was removed."
    a.replace(a.inner_html)
  end

  html_document.to_html({:encoding => 'UTF-8', :indent => 0})
end

# Main flow: start Word invisibly, let the user pick a document, have Word
# export it as filtered HTML (<name>_raw.html), then write a sanitized copy
# (<name>.html) next to it. Word is always shut down again in the ensure clause.
begin
  word = WIN32OLE.new('Word.Application')
  word.visible = false
  word_file_name = Tk::getOpenFile(:filetypes => [['Word documents','*.doc?'], ['All files', '*.*']])
  # Tk returns an empty string (which is truthy in Ruby) when the dialog is canceled.
  if word_file_name.nil? || word_file_name.empty?
    puts 'No file selected.'
  else
    # Forward slashes in file names with spaces cause: "OLE error code:800A1436 in Microsoft Word"
    word_file_name = word_file_name.tr('/', '\\')
    puts "Converting: #{word_file_name}"
    word_document = word.documents.Open(word_file_name)
    if word_document.nil?
      puts ' File not found! Probably due to spaces in the file path.'
    else
      html_file_name = word_file_name.sub(/(.*)\..*$/, '\1_raw.html')
      puts "Saving as #{html_file_name}"
      word_document.saveas({'FileName' => html_file_name, 'FileFormat' => WD_FORMAT[:filtered_html], 'Encoding' => 65001}) # Encoding is ignored!
      word_document.close()

      # Reopen html file, using the same charset Word used to save it.
      puts "Reading HTML from #{html_file_name}"
      html = read_word_html(html_file_name)

      puts 'Sanitizing'
      sanitized_html = sanitize_word_html(html)

      # write output to (new) file
      sanitized_html_file_name = word_file_name.sub(/(.*)\..*$/, '\1.html')
      puts "Writing sanitized HTML file: #{sanitized_html_file_name}"
      File.open(sanitized_html_file_name, 'w:UTF-8') do |f|
        f.write sanitized_html
      end
      puts 'Done.'
    end
  end
rescue WIN32OLERuntimeError => rte
  puts "Error: #{rte.message}"
ensure
  # word is nil when WIN32OLE.new itself failed.
  word.quit() unless word.nil?
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment