bofrede · February 8, 2017 16:22
diff --git a/html_export.rb b/html_export.rb
 =begin
 This script requires a Ruby interpreter to run:
 http://rubyinstaller.org/

 This script also requires Microsoft Windows and Microsoft Word to be installed.

 A few libraries, used by this script:

 HTML Sanitizer:
 https://github.com/rgrove/sanitize/

 HTML parser and modifier:
 http://nokogiri.org/

 Tk user interface:
 http://www.tkdocs.com/tutorial/windows.html#dialogs

 For more information on the Word Document class see:
 http://msdn.microsoft.com/en-us/library/bb244898(v=office.12).aspx
 Document.saveas  http://msdn.microsoft.com/en-us/library/bb221597.aspx
 Document.saveas2 http://msdn.microsoft.com/en-us/library/ff836084(v=office.14).aspx
 msoEncoding values http://msdn.microsoft.com/en-us/library/office/aa432511(v=office.12).aspx
 =end

 # Load the libraries this script depends on. If any of them is missing,
 # print the load error together with an install hint and stop.
 begin
  require 'win32ole'
  require 'tk'
  require 'sanitize'
 rescue LoadError => le
  puts "LoadError: #{le.message}"
  puts "Run: gem install win32ole tk sanitize"
  # A bare `exit` reports success (status 0); a missing dependency is a
  # failure, so signal it to the calling shell or batch file.
  exit 1
 end

 # Numeric file-format codes passed as 'FileFormat' when saving a Word
 # document through OLE. Several names intentionally share a value.
 # Frozen because this is a read-only lookup table.
 WD_FORMAT = {
  :document                   =>  0, # Microsoft Office Word format.
  :document_97                =>  0, # Microsoft Word 97 document format.
  :template                   =>  1, # Word template format.
  :template_97                =>  1, # Word 97 template format.
  :text                       =>  2, # Microsoft Windows text format.
  :text_line_breaks           =>  3, # Windows text format with line breaks preserved.
  :dos_text                   =>  4, # Microsoft DOS text format.
  :dos_text_line_breaks       =>  5, # Microsoft DOS text with line breaks preserved.
  :rtf                        =>  6, # Rich text format (RTF).
  :encoded_text               =>  7, # Encoded text format.
  :unicode_text               =>  7, # Unicode text format.
  :html                       =>  8, # Standard HTML format.
  :web_archive                =>  9, # Web archive format.
  :filtered_html              => 10, # Filtered HTML format.
  :xml                        => 11, # Extensible Markup Language (XML) format.
  :xml_document               => 12, # XML document format.
  :xml_document_macro_enabled => 13, # XML document format with macros enabled.
  :xml_template               => 14, # XML template format.
  :xml_template_macro_enabled => 15, # XML template format with macros enabled.
  :document_default           => 16, # Word default document file format. For Microsoft Office Word 2007, this is the DOCX format.
  :pdf                        => 17, # PDF format.
  :xps                        => 18  # XPS format.
 }.freeze # From: http://msdn.microsoft.com/en-us/library/bb238158(v=office.12).aspx

 # Configuration handed to Sanitize.new: which elements, attributes and URL
 # protocols survive the clean-up of the HTML that Word exported.
 WHITE_LIST = {
  :allow_comments  => true,
  :remove_contents => %w[script style],
  :elements => [
    %w[html head title link meta body],                        # document skeleton
    %w[h1 h2 h3 h4 h5 h6 p],                                   # headings and paragraphs
    %w[dd dl dt li ol ul],                                     # lists
    %w[caption col colgroup table tbody td tfoot th thead tr], # tables
    %w[a abbr b blockquote br cite code del dfn div em figcaption figure hgroup i img ins kbd mark],
    %w[pre q rp rt ruby s samp small strike strong sub sup time var wbr]
  ].flatten,
  :attributes => {
    :all         => %w[title id class],
    'html'       => %w[lang],
    'meta'       => %w[http-equiv name content],
    'a'          => %w[href name],
    'blockquote' => %w[cite],
    'col'        => %w[span width],
    'colgroup'   => %w[span width],
    'del'        => %w[cite datetime],
    'img'        => %w[align alt height src width],
    'ins'        => %w[cite datetime],
    'ol'         => %w[start reversed type],
    'q'          => %w[cite],
    'table'      => %w[border summary width],
    'td'         => %w[abbr axis colspan rowspan width],
    'th'         => %w[abbr axis colspan rowspan scope width],
    'time'       => %w[datetime pubdate],
    'ul'         => %w[type]
  },
  # :relative is a symbol (not a protocol name), so these stay plain arrays.
  :protocols => {
    'a' => {
      'href' => ['ftp', 'http', 'https', 'mailto', :relative]
    },
    'blockquote' => {
      'cite' => ['http', 'https', :relative]
    },
    'del' => {
      'cite' => ['http', 'https', :relative]
    },
    'img' => {
      'src' => ['http', 'https', :relative, 'data']
    },
    'ins' => {
      'cite' => ['http', 'https', :relative]
    },
    'q' => {
      'cite' => ['http', 'https', :relative]
    }
  }
 }


 # Main script: let the user pick a Word document, have Word export it as
 # filtered HTML ("<name>_raw.html"), sanitize that HTML against WHITE_LIST,
 # strip Word-specific leftovers and write the result to "<name>.html".
 # OLE errors raised by Word are reported; Word is always quit at the end.
 begin
  word = WIN32OLE.new('Word.Application')
  word.visible = false
  word_file_name = Tk::getOpenFile(:filetypes => [['Word documents','*.doc?'], ['All files', '*.*']])
  # Tk hands back an empty string (not nil) when the dialog is cancelled,
  # and an empty string is truthy in Ruby, so test for emptiness explicitly.
  unless word_file_name.nil? || word_file_name.empty?
    word_file_name.gsub!(/\//, "\\") # Forward slashes in file names with spaces cause: "OLE error code:800A1436 in Microsoft Word"
    puts "Converting: #{word_file_name}"
    word_document = word.documents.Open(word_file_name)
    if word_document.nil?
      puts '  File not found! Probably due to spaces in the file path.'
    else
      # Drop only an extension on the file name itself. A name without an
      # extension keeps its full path (so the export can never overwrite the
      # Word file), and dots in directory names are left alone.
      base_name = word_file_name.sub(/\.[^.\\\/]*\z/, '')
      html_file_name = "#{base_name}_raw.html"
      puts "Saving as #{html_file_name}"
      word_document.saveas({'FileName' => html_file_name, 'FileFormat' => WD_FORMAT[:filtered_html], 'Encoding' => 65001}) # Encoding is ignored!
      word_document.close()
      # Reopen html file, using the same charset Word used to save it.
      # The block form closes the file handle once the content is read.
      puts "Reading HTML from #{html_file_name}"
      html = File.open(html_file_name, "r:windows-1252:utf-8") do |html_file|
        puts "HTML file encoding #{html_file.external_encoding.name}"
        '<!DOCTYPE html>' + html_file.read()
      end
      puts 'Sanitizing'
      html_document = Nokogiri::HTML::Document.parse(html)
      Sanitize.new(WHITE_LIST).clean_node!(html_document)
      # `first` is nil when the element is absent; guard instead of crashing.
      html_element = html_document.css('html').first
      html_element['lang'] = 'en-US' if html_element
      generator_meta = html_document.css('meta[name="Generator"]').first
      generator_meta.remove() if generator_meta
      # Remove page numbers from TOC
      html_document.css('.MsoToc1 a, .MsoToc2 a').each do |item|
        # Assign as text, not markup, so '<' or '&' in a heading is escaped
        # rather than re-parsed as HTML.
        item.content = item.inner_text.sub(/(\s+\d+)\Z/, '')
      end
      # Remove Words "normal" classes.
      UNWANTED_CLASSES = %w{MsoNormal MsoBodyText NormalBold MsoHeader Templatehelp
                            TOCEntry Indent1 MsoCaption MsoListParagraph
                            MsoNormalTable MsoTableGrid MsoTableClassic1}
      UNWANTED_CLASSES.each do |class_name|
        html_document.css(".#{class_name}").each do |node|
          node.remove_attribute('class')
        end
      end
      # Remove abandoned anchors, that are not linked to.
      html_document.css('a[name]').each do |a|
        target = "##{a['name']}"
        # Compare href values directly; interpolating the name into a CSS
        # selector breaks when the name contains quote characters.
        if html_document.css('a[href]').none? { |link| link['href'] == target }
          puts "<a name=\"#{a['name']}\"> was removed."
          a.replace(a.inner_html)
        end
      end
      sanitized_html = html_document.to_html({:encoding => 'UTF-8', :indent => 0})
      # write output to (new) file
      sanitized_html_file_name = "#{base_name}.html"
      puts "Writing sanitized HTML file: #{sanitized_html_file_name}"
      File.open(sanitized_html_file_name, 'w:UTF-8') do |f|
        f.write sanitized_html
      end
      puts 'Done.'
    end
  end
 rescue WIN32OLERuntimeError => rte
  puts "Error: #{rte.message}"
 ensure
  # `word` is nil when WIN32OLE.new itself failed.
  word.quit() unless word.nil?
 end
	=begin
	This script requires a Ruby interpreter to run:
	http://rubyinstaller.org/

	This script also requires Microsoft Windows and Microsoft Word to be installed.

	A few libraries, used by this script:

	HTML Sanitizer:
	https://github.com/rgrove/sanitize/

	HTML parser and modifier:
	http://nokogiri.org/

	Tk user interface:
	http://www.tkdocs.com/tutorial/windows.html#dialogs

	For more information on the Word Document class see:
	http://msdn.microsoft.com/en-us/library/bb244898(v=office.12).aspx
	Document.saveas http://msdn.microsoft.com/en-us/library/bb221597.aspx
	Document.saveas2 http://msdn.microsoft.com/en-us/library/ff836084(v=office.14).aspx
	msoEncoding values http://msdn.microsoft.com/en-us/library/office/aa432511(v=office.12).aspx
	=end

	# Load the libraries this script depends on. If any of them is missing,
	# print the load error together with an install hint and stop.
	begin
	  require 'win32ole'
	  require 'tk'
	  require 'sanitize'
	rescue LoadError => le
	  puts "LoadError: #{le.message}"
	  puts "Run: gem install win32ole tk sanitize"
	  # A bare `exit` reports success (status 0); a missing dependency is a
	  # failure, so signal it to the calling shell or batch file.
	  exit 1
	end

	# Numeric file-format codes passed as 'FileFormat' when saving a Word
	# document through OLE. Several names intentionally share a value.
	# Frozen because this is a read-only lookup table.
	WD_FORMAT = {
	  :document                   =>  0, # Microsoft Office Word format.
	  :document_97                =>  0, # Microsoft Word 97 document format.
	  :template                   =>  1, # Word template format.
	  :template_97                =>  1, # Word 97 template format.
	  :text                       =>  2, # Microsoft Windows text format.
	  :text_line_breaks           =>  3, # Windows text format with line breaks preserved.
	  :dos_text                   =>  4, # Microsoft DOS text format.
	  :dos_text_line_breaks       =>  5, # Microsoft DOS text with line breaks preserved.
	  :rtf                        =>  6, # Rich text format (RTF).
	  :encoded_text               =>  7, # Encoded text format.
	  :unicode_text               =>  7, # Unicode text format.
	  :html                       =>  8, # Standard HTML format.
	  :web_archive                =>  9, # Web archive format.
	  :filtered_html              => 10, # Filtered HTML format.
	  :xml                        => 11, # Extensible Markup Language (XML) format.
	  :xml_document               => 12, # XML document format.
	  :xml_document_macro_enabled => 13, # XML document format with macros enabled.
	  :xml_template               => 14, # XML template format.
	  :xml_template_macro_enabled => 15, # XML template format with macros enabled.
	  :document_default           => 16, # Word default document file format. For Microsoft Office Word 2007, this is the DOCX format.
	  :pdf                        => 17, # PDF format.
	  :xps                        => 18  # XPS format.
	}.freeze # From: http://msdn.microsoft.com/en-us/library/bb238158(v=office.12).aspx

	# Configuration handed to Sanitize.new: which elements, attributes and URL
	# protocols survive the clean-up of the HTML that Word exported.
	WHITE_LIST = {
	  :allow_comments  => true,
	  :remove_contents => %w[script style],
	  :elements => [
	    %w[html head title link meta body],                        # document skeleton
	    %w[h1 h2 h3 h4 h5 h6 p],                                   # headings and paragraphs
	    %w[dd dl dt li ol ul],                                     # lists
	    %w[caption col colgroup table tbody td tfoot th thead tr], # tables
	    %w[a abbr b blockquote br cite code del dfn div em figcaption figure hgroup i img ins kbd mark],
	    %w[pre q rp rt ruby s samp small strike strong sub sup time var wbr]
	  ].flatten,
	  :attributes => {
	    :all         => %w[title id class],
	    'html'       => %w[lang],
	    'meta'       => %w[http-equiv name content],
	    'a'          => %w[href name],
	    'blockquote' => %w[cite],
	    'col'        => %w[span width],
	    'colgroup'   => %w[span width],
	    'del'        => %w[cite datetime],
	    'img'        => %w[align alt height src width],
	    'ins'        => %w[cite datetime],
	    'ol'         => %w[start reversed type],
	    'q'          => %w[cite],
	    'table'      => %w[border summary width],
	    'td'         => %w[abbr axis colspan rowspan width],
	    'th'         => %w[abbr axis colspan rowspan scope width],
	    'time'       => %w[datetime pubdate],
	    'ul'         => %w[type]
	  },
	  # :relative is a symbol (not a protocol name), so these stay plain arrays.
	  :protocols => {
	    'a' => {
	      'href' => ['ftp', 'http', 'https', 'mailto', :relative]
	    },
	    'blockquote' => {
	      'cite' => ['http', 'https', :relative]
	    },
	    'del' => {
	      'cite' => ['http', 'https', :relative]
	    },
	    'img' => {
	      'src' => ['http', 'https', :relative, 'data']
	    },
	    'ins' => {
	      'cite' => ['http', 'https', :relative]
	    },
	    'q' => {
	      'cite' => ['http', 'https', :relative]
	    }
	  }
	}


	# Main script: let the user pick a Word document, have Word export it as
	# filtered HTML ("<name>_raw.html"), sanitize that HTML against WHITE_LIST,
	# strip Word-specific leftovers and write the result to "<name>.html".
	# OLE errors raised by Word are reported; Word is always quit at the end.
	begin
	  word = WIN32OLE.new('Word.Application')
	  word.visible = false
	  # Glob patterns need their leading '*'.
	  word_file_name = Tk::getOpenFile(:filetypes => [['Word documents','*.doc?'], ['All files', '*.*']])
	  # Tk hands back an empty string (not nil) when the dialog is cancelled,
	  # and an empty string is truthy in Ruby, so test for emptiness explicitly.
	  unless word_file_name.nil? || word_file_name.empty?
	    word_file_name.gsub!(/\//, "\\") # Forward slashes in file names with spaces cause: "OLE error code:800A1436 in Microsoft Word"
	    puts "Converting: #{word_file_name}"
	    word_document = word.documents.Open(word_file_name)
	    if word_document.nil?
	      puts ' File not found! Probably due to spaces in the file path.'
	    else
	      # Drop only an extension on the file name itself. A name without an
	      # extension keeps its full path (so the export can never overwrite the
	      # Word file), and dots in directory names are left alone.
	      base_name = word_file_name.sub(/\.[^.\\\/]*\z/, '')
	      html_file_name = "#{base_name}_raw.html"
	      puts "Saving as #{html_file_name}"
	      word_document.saveas({'FileName' => html_file_name, 'FileFormat' => WD_FORMAT[:filtered_html], 'Encoding' => 65001}) # Encoding is ignored!
	      word_document.close()
	      # Reopen html file, using the same charset Word used to save it.
	      # The block form closes the file handle once the content is read.
	      puts "Reading HTML from #{html_file_name}"
	      html = File.open(html_file_name, "r:windows-1252:utf-8") do |html_file|
	        puts "HTML file encoding #{html_file.external_encoding.name}"
	        '<!DOCTYPE html>' + html_file.read()
	      end
	      puts 'Sanitizing'
	      html_document = Nokogiri::HTML::Document.parse(html)
	      Sanitize.new(WHITE_LIST).clean_node!(html_document)
	      # `first` is nil when the element is absent; guard instead of crashing.
	      html_element = html_document.css('html').first
	      html_element['lang'] = 'en-US' if html_element
	      generator_meta = html_document.css('meta[name="Generator"]').first
	      generator_meta.remove() if generator_meta
	      # Remove page numbers from TOC
	      html_document.css('.MsoToc1 a, .MsoToc2 a').each do |item|
	        # Assign as text, not markup, so '<' or '&' in a heading is escaped
	        # rather than re-parsed as HTML.
	        item.content = item.inner_text.sub(/(\s+\d+)\Z/, '')
	      end
	      # Remove Words "normal" classes.
	      UNWANTED_CLASSES = %w{MsoNormal MsoBodyText NormalBold MsoHeader Templatehelp
	                            TOCEntry Indent1 MsoCaption MsoListParagraph
	                            MsoNormalTable MsoTableGrid MsoTableClassic1}
	      UNWANTED_CLASSES.each do |class_name|
	        html_document.css(".#{class_name}").each do |node|
	          node.remove_attribute('class')
	        end
	      end
	      # Remove abandoned anchors, that are not linked to.
	      html_document.css('a[name]').each do |a|
	        target = "##{a['name']}"
	        # Compare href values directly; interpolating the name into a CSS
	        # selector breaks when the name contains quote characters.
	        if html_document.css('a[href]').none? { |link| link['href'] == target }
	          puts "<a name=\"#{a['name']}\"> was removed."
	          a.replace(a.inner_html)
	        end
	      end
	      sanitized_html = html_document.to_html({:encoding => 'UTF-8', :indent => 0})
	      # write output to (new) file
	      sanitized_html_file_name = "#{base_name}.html"
	      puts "Writing sanitized HTML file: #{sanitized_html_file_name}"
	      File.open(sanitized_html_file_name, 'w:UTF-8') do |f|
	        f.write sanitized_html
	      end
	      puts 'Done.'
	    end
	  end
	rescue WIN32OLERuntimeError => rte
	  puts "Error: #{rte.message}"
	ensure
	  # `word` is nil when WIN32OLE.new itself failed.
	  word.quit() unless word.nil?
	end