=begin
Converts a Microsoft Word document to sanitized, UTF-8 encoded HTML.

The user picks a Word document in a file dialog. Word saves it as
"filtered HTML" (<name>_raw.html); that file is then cleaned with Sanitize
and Nokogiri and written to <name>.html next to the original document.

This script requires a Ruby interpreter to run:
  http://rubyinstaller.org/
This script also requires Microsoft Windows and Microsoft Word to be installed.

A few libraries, used by this script:
  HTML Sanitizer:           https://github.com/rgrove/sanitize/
  HTML parser and modifier: http://nokogiri.org/
  Tk user interface:        http://www.tkdocs.com/tutorial/windows.html#dialogs

For more information on the Word Document class see:
  http://msdn.microsoft.com/en-us/library/bb244898(v=office.12).aspx
Document.saveas
  http://msdn.microsoft.com/en-us/library/bb221597.aspx
Document.saveas2
  http://msdn.microsoft.com/en-us/library/ff836084(v=office.14).aspx
msoEncoding values
  http://msdn.microsoft.com/en-us/library/office/aa432511(v=office.12).aspx
=end

begin
  require 'win32ole'
  require 'tk'
  # Nokogiri (used directly below) is expected to be loaded as a dependency
  # of sanitize.
  require 'sanitize'
rescue LoadError => le
  puts "LoadError: #{le.message}"
  puts "Run: gem install win32ole tk sanitize"
  exit
end

# Word "save as" file formats (WdSaveFormat).
# From: http://msdn.microsoft.com/en-us/library/bb238158(v=office.12).aspx
WD_FORMAT = {
  :document                   => 0,  # Microsoft Office Word format.
  :document_97                => 0,  # Microsoft Word 97 document format.
  :template                   => 1,  # Word template format.
  :template_97                => 1,  # Word 97 template format.
  :text                       => 2,  # Microsoft Windows text format.
  :text_line_breaks           => 3,  # Windows text format with line breaks preserved.
  :dos_text                   => 4,  # Microsoft DOS text format.
  :dos_text_line_breaks       => 5,  # Microsoft DOS text with line breaks preserved.
  :rtf                        => 6,  # Rich text format (RTF).
  :encoded_text               => 7,  # Encoded text format.
  :unicode_text               => 7,  # Unicode text format.
  :html                       => 8,  # Standard HTML format.
  :web_archive                => 9,  # Web archive format.
  :filtered_html              => 10, # Filtered HTML format.
  :xml                        => 11, # Extensible Markup Language (XML) format.
  :xml_document               => 12, # XML document format.
  :xml_document_macro_enabled => 13, # XML document format with macros enabled.
  :xml_template               => 14, # XML template format.
  :xml_template_macro_enabled => 15, # XML template format with macros enabled.
  :document_default           => 16, # Word default document file format.
                                     # For Microsoft Office Word 2007, this is the DOCX format.
  :pdf                        => 17, # PDF format.
  :xps                        => 18  # XPS format.
}.freeze

# WdSaveOptions value telling Word to discard pending changes without prompting.
WD_DO_NOT_SAVE_CHANGES = 0

# msoEncoding value for UTF-8, passed to Document.saveas.
MSO_ENCODING_UTF8 = 65001

# Sanitize configuration: the elements, attributes and URL protocols that
# survive the clean-up. Everything else is stripped from Word's HTML.
WHITE_LIST = {
  :allow_comments  => true,
  :remove_contents => ['script', 'style'],

  :elements => %w{
    html head title link meta body
    h1 h2 h3 h4 h5 h6 p
    dd dl dt li ol ul
    caption col colgroup table tbody td tfoot th thead tr
    a abbr b blockquote br cite code del dfn div em figcaption figure hgroup
    i img ins kbd mark pre q rp rt ruby s samp small strike strong sub sup
    time var wbr
  },

  :attributes => {
    :all         => ['title', 'id', 'class'],
    'html'       => ['lang'],
    'meta'       => ['http-equiv', 'name', 'content'],
    'a'          => ['href', 'name'],
    'blockquote' => ['cite'],
    'col'        => ['span', 'width'],
    'colgroup'   => ['span', 'width'],
    'del'        => ['cite', 'datetime'],
    'img'        => ['align', 'alt', 'height', 'src', 'width'],
    'ins'        => ['cite', 'datetime'],
    'ol'         => ['start', 'reversed', 'type'],
    'q'          => ['cite'],
    'table'      => ['border', 'summary', 'width'],
    'td'         => ['abbr', 'axis', 'colspan', 'rowspan', 'width'],
    'th'         => ['abbr', 'axis', 'colspan', 'rowspan', 'scope', 'width'],
    'time'       => ['datetime', 'pubdate'],
    'ul'         => ['type']
  },

  :protocols => {
    'a'          => {'href' => ['ftp', 'http', 'https', 'mailto', :relative]},
    'blockquote' => {'cite' => ['http', 'https', :relative]},
    'del'        => {'cite' => ['http', 'https', :relative]},
    'img'        => {'src'  => ['http', 'https', :relative, 'data']},
    'ins'        => {'cite' => ['http', 'https', :relative]},
    'q'          => {'cite' => ['http', 'https', :relative]}
  }
}.freeze

# Class names Word puts on ordinary content; they carry no meaning in the
# cleaned output, so the class attribute is dropped from matching nodes.
UNWANTED_CLASSES = %w{
  MsoNormal MsoBodyText NormalBold MsoHeader Templatehelp TOCEntry Indent1
  MsoCaption MsoListParagraph MsoNormalTable MsoTableGrid MsoTableClassic1
}.freeze

# Returns +path+ without its file extension.
#
# Only a dot inside the last path component counts, and a leading dot
# ("C:\dir\.hidden") is not treated as an extension. A path without an
# extension is returned unchanged, so appending a suffix to the result can
# never reproduce the input file name.
def strip_extension(path)
  path.sub(%r{(?<=[^\\/])\.[^.\\/]*\z}, '')
end

# Saves +word_document+ as filtered HTML to +html_file_name+ and closes it.
#
# The document is closed even when saving fails, so the hidden Word instance
# is not left holding an open document.
def save_as_filtered_html(word_document, html_file_name)
  puts "Saving as #{html_file_name}"
  word_document.saveas({'FileName'   => html_file_name,
                        'FileFormat' => WD_FORMAT[:filtered_html],
                        'Encoding'   => MSO_ENCODING_UTF8}) # Encoding is ignored!
ensure
  word_document.close(WD_DO_NOT_SAVE_CHANGES)
end

# Reads the HTML file written by Word and returns its content as UTF-8.
#
# The file is read as windows-1252 because Word ignores the encoding that was
# requested in saveas.
def read_word_html(html_file_name)
  puts "Reading HTML from #{html_file_name}"
  File.open(html_file_name, 'r:windows-1252:utf-8') do |html_file|
    puts "HTML file encoding #{html_file.external_encoding.name}"
    html_file.read
  end
end

# Removes anchors (<a name="...">) that no link in the document points to,
# keeping their content in place.
def remove_unlinked_anchors!(html_document)
  html_document.css('a[name]').each do |anchor|
    target = "##{anchor['name']}"
    # Compare attribute values in Ruby rather than building a CSS selector
    # from document data: a name containing a quote would break the selector.
    next if html_document.css('a[href]').any? { |link| link['href'] == target }

    puts "  Anchor '#{anchor['name']}' was removed."
    anchor.replace(anchor.inner_html)
  end
end

# Applies the Word-specific clean-up steps to an already sanitized document.
def tidy_word_markup!(html_document)
  html_element = html_document.at_css('html')
  html_element['lang'] = 'en-US' if html_element

  # The generator tag is not guaranteed to be present.
  generator_meta = html_document.at_css('meta[name="Generator"]')
  generator_meta.remove if generator_meta

  # Remove page numbers from TOC.
  html_document.css('.MsoToc1 a, .MsoToc2 a').each do |item|
    # Assign as text, not markup, so "<" and "&" in a heading are escaped
    # instead of being re-parsed as HTML.
    item.content = item.inner_text.sub(/(\s+\d+)\Z/, '')
  end

  # Remove Word's "normal" classes.
  UNWANTED_CLASSES.each do |class_name|
    html_document.css(".#{class_name}").each do |node|
      node.remove_attribute('class')
    end
  end

  # Remove abandoned anchors, that are not linked to.
  remove_unlinked_anchors!(html_document)
end

# Parses +html+, sanitizes it against WHITE_LIST, tidies Word's markup and
# returns the result serialized as UTF-8 HTML.
def sanitize_word_html(html)
  puts 'Sanitizing'
  html_document = Nokogiri::HTML::Document.parse(html)
  Sanitize.new(WHITE_LIST).clean_node!(html_document)
  tidy_word_markup!(html_document)
  html_document.to_html({:encoding => 'UTF-8', :indent => 0})
end

# Converts the Word document at +word_file_name+ (backslash separated path)
# using the running Word application +word+.
#
# Writes <name>_raw.html (Word's own output) and <name>.html (sanitized).
def convert_word_document(word, word_file_name)
  puts "Converting: #{word_file_name}"
  word_document = word.documents.Open(word_file_name)
  if word_document.nil?
    puts " File not found! \nProbably due to spaces in the file path."
    return
  end

  base_name = strip_extension(word_file_name)
  html_file_name = "#{base_name}_raw.html"
  save_as_filtered_html(word_document, html_file_name)

  # Reopen html file, using the same charset Word used to save it.
  sanitized_html = sanitize_word_html(read_word_html(html_file_name))

  # Write output to (new) file.
  sanitized_html_file_name = "#{base_name}.html"
  puts "Writing sanitized HTML file: #{sanitized_html_file_name}"
  File.open(sanitized_html_file_name, 'w:UTF-8') do |f|
    f.write sanitized_html
  end
  puts 'Done.'
end

word = nil
begin
  word = WIN32OLE.new('Word.Application')
  word.visible = false

  selected_file = Tk::getOpenFile(:filetypes => [['Word documents', '*.doc?'], ['All files', '*.*']])
  # A cancelled dialog yields an empty string, which is truthy in Ruby.
  if selected_file.nil? || selected_file.empty?
    puts 'No file selected.'
  else
    # Forward slashes in file names with spaces cause:
    # "OLE error code:800A1436 in Microsoft Word"
    convert_word_document(word, selected_file.tr('/', '\\'))
  end
rescue WIN32OLERuntimeError => rte
  puts "Error: #{rte.message}"
ensure
  word.quit() unless word.nil?
end