dgreen · October 11, 2012 04:43 · dgreen · Oct 11, 2012
diff --git a/rtftomarkdown.rb b/rtftomarkdown.rb
 #!/usr/bin/ruby
 # Uses textutil, available on Mac only (installed by default)
 # Usage: rtftomarkdown.rb FILENAME.rtf
 # Outputs to STDOUT

 # Without at least one command-line argument there is nothing to convert:
 # print a usage hint naming this script and stop.
 if ARGV.empty?
   puts "#{__FILE__} expects an RTF input file as an argument"
   exit
 end

 # Removes empty HTML elements: an opening tag (optionally carrying a class
 # attribute) followed by nothing but whitespace and its matching closing tag.
 # Removal repeats until no such element remains, so a wrapper that only
 # becomes empty once its children were removed is dropped as well.
 #
 # input - String of HTML. It is not modified, so frozen strings are accepted.
 #
 # Returns a new String with the empty elements removed and leading/trailing
 # whitespace stripped.
 def remove_empty(input)
   empty_element = /<(\w+)( class=".*?")?>\s*<\/\1>/
   cleaned = input.dup
   # gsub! returns nil as soon as a pass removes nothing, which ends the loop.
   loop { break unless cleaned.gsub!(empty_element, '') }
   cleaned.strip
 end

 # Builds the leading whitespace for a list item: two spaces for every
 # level given in +size+.
 def indent_list(size)
   one_level = " " * 2
   one_level * size
 end

 # Converts the HTML document produced by textutil into Markdown text.
 #
 # html - String containing the complete HTML output.
 #
 # Returns a String holding the contents of the <body> element with
 # span/br/paragraph tags removed, bold-only paragraphs of class p1..p5
 # rewritten as "#"-style headers, <b>/<i> rewritten as ** / *, and list
 # items rewritten as indented "*" or "1." entries.
 def html_to_markdown(html)
   # Keep only what sits between <body> and </body>.
   text = html.strip.gsub(/.*?<body>(.*?)<\/body>.*/m, "\\1")

   # remove span/br tags, unnecessary
   text = text.gsub(/<br>/, '')
   text = text.gsub(/<\/?span( class=".*?")?>/, '')

   # substitute headers: a paragraph whose whole content is bold becomes a
   # header; the paragraph class selects the header level.
   header_markers = { 'p1' => '#', 'p2' => '##', 'p3' => '##', 'p4' => '###', 'p5' => '###' }
   header_markers.each do |css_class, marker|
     text = text.gsub(/<p class="#{css_class}"><b>(.+?)<\/b><\/p>/, "#{marker} \\1")
   end

   # input = input.split("\n").map { |line|
   #   remove_empty(line)
   # }.join("\n")

   # remove paragraph tags
   text = text.gsub(/<p class="p\d">(.*?)<\/p>/, '\\1')
   # emphasis
   text = text.gsub(/<\/?b>/, '**')
   text = text.gsub(/<\/?i>/, '*')

   text = text.split("\n").map(&:strip).join("\n")

   # Handle lists: the stack holds the marker ("*" or "1.") of every list that
   # is currently open, so its size is the nesting depth of the current item.
   list_stack = []

   converted = text.split("\n").map do |line|
     # note textutil seems to output an unnumbered list even for numbered (ol) lists if fed .docx, use .rtf
     # not clear that ol or ul come across if .doc (sigh)
     case line
     when /<ul class="ul\d">/
       list_stack.push("*")
       ""
     when /<ol class="ol\d">/
       list_stack.push("1.")
       ""
     when /<\/ul>/, /<\/ol>/
       list_stack.pop
       ""
     when /<li class="li\d">/
       # \S+\s* drops the first whitespace-delimited token of the item --
       # presumably the bullet/number textutil writes itself; TODO confirm.
       line.gsub(/<li class="li\d">\S+\s*(.+?)<\/li>/,
                 "#{indent_list(list_stack.size)}#{list_stack.fetch(-1)} \\1")
     else
       line
     end
   end

   converted.join("\n")
 end

 # Convert every file named on the command line and print the Markdown to
 # STDOUT; names that do not exist are reported instead.
 ARGV.each do |infile|
   file = infile.sub(/\/$/, '')
   path = File.expand_path(file)
   if File.exist?(path)
     # The argument-array form starts textutil without a shell, so spaces or
     # shell metacharacters in the file name cannot break or alter the command.
     html = IO.popen(['/usr/bin/textutil', '-convert', 'html', '-stdout', path]) { |io| io.read }
     puts html_to_markdown(html)
   else
     puts "File not found: #{file}"
   end
 end
	#!/usr/bin/ruby
	# Uses textutil, available on Mac only (installed by default)
	# Usage: rtftomarkdown.rb FILENAME.rtf
	# Outputs to STDOUT

	# Without at least one command-line argument there is nothing to convert:
	# print a usage hint naming this script and stop.
	if ARGV.empty?
	  puts "#{__FILE__} expects an RTF input file as an argument"
	  exit
	end

	# Removes empty HTML elements: an opening tag (optionally carrying a class
	# attribute) followed by nothing but whitespace and its matching closing tag.
	# Removal repeats until no such element remains, so a wrapper that only
	# becomes empty once its children were removed is dropped as well.
	#
	# input - String of HTML. It is not modified, so frozen strings are accepted.
	#
	# Returns a new String with the empty elements removed and leading/trailing
	# whitespace stripped.
	def remove_empty(input)
	  empty_element = /<(\w+)( class=".*?")?>\s*<\/\1>/
	  cleaned = input.dup
	  # gsub! returns nil as soon as a pass removes nothing, which ends the loop.
	  loop { break unless cleaned.gsub!(empty_element, '') }
	  cleaned.strip
	end

	# Builds the leading whitespace for a list item: two spaces for every
	# level given in +size+.
	def indent_list(size)
	  one_level = " " * 2
	  one_level * size
	end

	# Converts the HTML document produced by textutil into Markdown text.
	#
	# html - String containing the complete HTML output.
	#
	# Returns a String holding the contents of the <body> element with
	# span/br/paragraph tags removed, bold-only paragraphs of class p1..p5
	# rewritten as "#"-style headers, <b>/<i> rewritten as ** / *, and list
	# items rewritten as indented "*" or "1." entries.
	def html_to_markdown(html)
	  # Keep only what sits between <body> and </body>.
	  text = html.strip.gsub(/.*?<body>(.*?)<\/body>.*/m, "\\1")

	  # remove span/br tags, unnecessary
	  text = text.gsub(/<br>/, '')
	  text = text.gsub(/<\/?span( class=".*?")?>/, '')

	  # substitute headers: a paragraph whose whole content is bold becomes a
	  # header; the paragraph class selects the header level.
	  header_markers = { 'p1' => '#', 'p2' => '##', 'p3' => '##', 'p4' => '###', 'p5' => '###' }
	  header_markers.each do |css_class, marker|
	    text = text.gsub(/<p class="#{css_class}"><b>(.+?)<\/b><\/p>/, "#{marker} \\1")
	  end

	  # input = input.split("\n").map { |line|
	  #   remove_empty(line)
	  # }.join("\n")

	  # remove paragraph tags
	  text = text.gsub(/<p class="p\d">(.*?)<\/p>/, '\\1')
	  # emphasis
	  text = text.gsub(/<\/?b>/, '**')
	  text = text.gsub(/<\/?i>/, '*')

	  text = text.split("\n").map(&:strip).join("\n")

	  # Handle lists: the stack holds the marker ("*" or "1.") of every list that
	  # is currently open, so its size is the nesting depth of the current item.
	  list_stack = []

	  converted = text.split("\n").map do |line|
	    # note textutil seems to output an unnumbered list even for numbered (ol) lists if fed .docx, use .rtf
	    # not clear that ol or ul come across if .doc (sigh)
	    case line
	    when /<ul class="ul\d">/
	      list_stack.push("*")
	      ""
	    when /<ol class="ol\d">/
	      list_stack.push("1.")
	      ""
	    when /<\/ul>/, /<\/ol>/
	      list_stack.pop
	      ""
	    when /<li class="li\d">/
	      # \S+\s* drops the first whitespace-delimited token of the item --
	      # presumably the bullet/number textutil writes itself; TODO confirm.
	      line.gsub(/<li class="li\d">\S+\s*(.+?)<\/li>/,
	                "#{indent_list(list_stack.size)}#{list_stack.fetch(-1)} \\1")
	    else
	      line
	    end
	  end

	  converted.join("\n")
	end

	# Convert every file named on the command line and print the Markdown to
	# STDOUT; names that do not exist are reported instead.
	ARGV.each do |infile|
	  file = infile.sub(/\/$/, '')
	  path = File.expand_path(file)
	  if File.exist?(path)
	    # The argument-array form starts textutil without a shell, so spaces or
	    # shell metacharacters in the file name cannot break or alter the command.
	    html = IO.popen(['/usr/bin/textutil', '-convert', 'html', '-stdout', path]) { |io| io.read }
	    puts html_to_markdown(html)
	  else
	    puts "File not found: #{file}"
	  end
	end