Snarp · April 23, 2020 00:56
diff --git a/ai_transcript_cleanup.rb b/ai_transcript_cleanup.rb
 require 'nokogiri'

 # (Unless you are the person I am sending this to, you do not need it.)
 # 
 # Steps to use:
 # 1. Export a Google Docs file to HTML.
 # 2. Unzip the downloaded archive and put the HTML file's name in @raw_fname below.
 # 3. Edit css_pass (further down) to correctly convert 'c{SOME NUMBER}'-class elements to <i>, <b>, or <u>.
 # 4. Run cleanup()

 # Input file: the HTML exported from Google Docs. Edit this before running.
 @raw_fname  ='FILENAME GOES HERE'
 # Derived filenames: each replaces the first '.html' in @raw_fname.
 # @work_fname is the default output of first_pass, @html_fname of second_pass,
 # and @md_fname of final_pass.
 @work_fname = @raw_fname.sub('.html', '_WORK.html')
 @html_fname = @raw_fname.sub('.html', '_EDIT.html')
 @md_fname   = @raw_fname.sub('.html', '.md')

 # NOTE(review): @fname is not read anywhere in the visible code -- confirm it is still needed.
 @fname=@work_fname


 # Runs the whole conversion pipeline: first_pass, then second_pass, then
 # final_pass, each reading the file the previous stage wrote.
 #
 # in_fname  - path of the exported HTML file to read (defaults to @raw_fname).
 # out_fname - path of the Markdown file to write (defaults to @md_fname).
 #
 # The two intermediate files are always written to @work_fname and @html_fname.
 #
 # Returns whatever final_pass returns.
 def cleanup(in_fname=@raw_fname, out_fname=@md_fname)
   # Honour the arguments: they used to be accepted but silently ignored.
   first_pass(in_fname, @work_fname)
   second_pass(@work_fname, @html_fname)
   final_pass(@html_fname, out_fname)
 end



 # Stage 1: read the raw export, run it through readability_pass and save it.
 #
 # in_fname  - path to read (defaults to @raw_fname).
 # out_fname - path to write (defaults to @work_fname).
 #
 # Returns the text that was written.
 def first_pass(in_fname=@raw_fname, out_fname=@work_fname)
   readability_pass(File.read(in_fname)).tap do |cleaned|
     File.write(out_fname, cleaned)
   end
 end

 # Stage 2: read the working HTML, apply css_pass and then line_cleaning_pass,
 # strip surrounding whitespace and save the result.
 #
 # in_fname  - path to read (defaults to @work_fname).
 # out_fname - path to write (defaults to @html_fname).
 #
 # Returns the text that was written.
 def second_pass(in_fname=@work_fname, out_fname=@html_fname)
   styled = css_pass(File.read(in_fname))
   result = line_cleaning_pass(styled).strip
   File.write(out_fname, result)
   result
 end

 # Stage 3: read the edited HTML, convert it with md_pass, strip surrounding
 # whitespace and save the result.
 #
 # in_fname  - path to read (defaults to @html_fname).
 # out_fname - path to write (defaults to @md_fname).
 #
 # Returns the text that was written.
 def final_pass(in_fname=@html_fname, out_fname=@md_fname)
   markdown = md_pass(File.read(in_fname)).strip
   File.write(out_fname, markdown)
   markdown
 end




 # Normalises exported HTML so that it is easier to read and to post-process.
 #
 # The replacements are applied in the order listed: the <body> tag loses its
 # attributes, each "}" gets a trailing newline, whitespace entities and tabs
 # become spaces, carriage returns, bare <p>/</p> and <br> tags become
 # newlines, typographic quotes/dashes/ellipses and a set of named entities
 # become plain ASCII, <style> tags are put on their own lines, and runs of
 # three or more newlines are collapsed to two.
 #
 # txt - String of HTML. It is left untouched, so frozen strings are accepted.
 #
 # Returns a new String with surrounding whitespace stripped.
 def readability_pass(txt)
   replacements = {
     /<body[^>]*>/ => "<body>\n",
     "}"           => "}\n",

     "&nbsp;"      => " ",
     "\t"          => " ",
     "&Tab;"       => " ",
     "\r"          => "\n",
     "&NewLine;"   => "\n",
     /<\/?p>/      => "\n",
     /<br\/?>/     => "\n",

     "“"           => "\"",
     "”"           => "\"",
     "&ldquo;"     => "\"",
     "&rdquo;"     => "\"",
     "&quot;"      => "\"",
     "&QUOT;"      => "\"",
     "‘"           => "'",
     "’"           => "'",
     "&rsquo;"     => "'",
     "&lsquo;"     => "'",
     "&apos;"      => "'",
     "&#39;"       => "'",
     "—"           => "--",
     "&mdash;"     => "--",
     "&horbar;"    => "--",
     "&ndash;"     => "--",
     "&dash;"      => "-",
     "…"           => "...",
     "&hellip;"    => "...",
     "&mldr;"      => "...",
     "&nldr;"      => "...",
     # "&amp;" is decoded after the other entities so "&amp;quot;" stays "&quot;".
     "&amp;"       => "&",
     "&AMP;"       => "&",
     "&copy;"      => "(c)",
     "&reg;"       => "(R)",

     '<style type="text/css">' => "\n<style type=\"text/css\">\n",
     '</style>'                => "\n</style>\n",

     /\n{3,}/      => "\n\n",
   }

   # Non-destructive gsub: the caller's string is never modified in place.
   cleaned = replacements.reduce(txt) do |acc, (pattern, replacement)|
     acc.gsub(pattern, replacement)
   end
   cleaned.strip
 end

 # Renames the generated 'cN' class names to meaningful ones, drops every
 # other 'cN' class, removes class attributes that end up empty, and unwraps
 # spans that carry no styling (via excise_unstyled_spans).
 #
 # EDIT THE FIRST ENTRIES to match the class numbers of the current export.
 #
 # The patterns are anchored with \b so that "c4" does not also hit "c40"
 # (which used to end up as "itali") and so that text such as "abc123" is not
 # truncated by the catch-all rule.
 # NOTE(review): the rules still run over the whole document, so a standalone
 # word like "c4" in the transcript text would also be rewritten.
 #
 # txt - String of HTML. It is not modified.
 #
 # Returns the rewritten HTML as serialised by Nokogiri.
 def css_pass(txt)
   class_rules = {
     /\bc4\b/       => "italic",
     /\bc6\b/       => "bold",
     # /\bc4\b/       => "underline",
     /\bc\d+\b/     => "",
     / class="\s*"/ => "",
   }

   html = class_rules.reduce(txt) do |acc, (pattern, replacement)|
     acc.gsub(pattern, replacement)
   end

   excise_unstyled_spans(Nokogiri::HTML(html)).to_html
 end

 # Classifies paragraphs by their leading indentation, converts styled spans
 # to <i>/<b> tags, merges adjacent runs and paragraphs, and brackets captions.
 #
 # txt - String of HTML. NOTE: it is modified in place by the gsub! calls
 #       below, up to the point where txt is reassigned from the parsed document.
 #
 # Returns the resulting HTML as serialised by Nokogiri.
 def line_cleaning_pass(txt)
  # Pull a closing </p> up onto the previous line and mark empty paragraphs
  # with a {{BREAK}} placeholder (consumed further down).
  {
    "\n</p>"      => "</p>", 
    /<p>\s*<\/p>/ => "{{BREAK}}", 
  }.each { |k,v| txt.gsub!(k,v) }

  # Unused sketch of the indentation rules implemented by the loops below:
  # indents = {
  #   / {0,0}/   => "caption",  # (no leading whitespace)
  #   / {30,30}/ => "speaker", 
  #   / {20,25}/ => "dialog", 
  #   / {10,10}/ => "caption", 
  # }

  # Each loop tags the remaining bare <p> paragraphs of one kind. The rest of
  # each pattern also accepts spaces, so the counts act as "at least N leading
  # spaces" and the order of the loops decides which class wins.

  # No leading whitespace => caption.
  # while m= /<p>(?<caption>\S[^<]*\s*)<\/p>/.match(txt)
  while m= /<p>(?<caption>\S[^\n]*\s*)<\/p>/.match(txt)
    txt.gsub!(m[0], "<p class=\"caption\">#{m[:caption].strip}</p>")
  end
  # 30 leading spaces => speaker name, followed by a {{BREAK}} placeholder line.
  # while m= /<p> {30,30}(?<name>[^<]*\s*)<\/p>/.match(txt)
  while m= /<p> {30,30}(?<name>[^\n]*\s*)<\/p>/.match(txt)
    txt.gsub!(m[0], "<p class=\"speaker\">#{m[:name].strip}</p>\n{{BREAK}}")
  end
  # 20-25 leading spaces => dialog.
  # while m= /<p> {20,25}(?<dialog>[^<]*\s*)<\/p>/.match(txt)
  while m= /<p> {20,25}(?<dialog>[^\n]*\s*)<\/p>/.match(txt)
    txt.gsub!(m[0], "<p class=\"dialog\">#{m[:dialog].strip}</p>")
  end
  # 10 leading spaces => caption.
  # while m= /<p> {10,10}(?<caption>[^<]*\s*)<\/p>/.match(txt)
  while m= /<p> {10,10}(?<caption>[^\n]*\s*)<\/p>/.match(txt)
    txt.gsub!(m[0], "<p class=\"caption\">#{m[:caption].strip}</p>")
  end

  doc = Nokogiri::HTML(txt)
  # Trim whitespace inside every paragraph.
  doc.css('p').each { |p| p.inner_html=p.inner_html.strip }
  # Replace styled spans with inline tags: exactly 'italic' => <i>; exactly
  # 'bold' or 'underline' => <b>; any span with more than one class =>
  # <i><b>. Spans with a single other class are left as they are.
  doc.css('span[class]').each do |span|
    classes = span['class'].strip.split(' ')
    if    classes==['italic']
      span.add_previous_sibling("<i>#{span.inner_html}</i>")
      span.remove
    elsif [['bold'], ['underline']].include?(classes)
      span.add_previous_sibling("<b>#{span.inner_html}</b>")
      span.remove
    elsif classes.count > 1
      span.add_previous_sibling("<i><b>#{span.inner_html}</b></i>")
      span.remove
    end
  end

  txt = doc.to_html
  # Merge adjacent <i>/<b> runs, join paragraphs separated by a single
  # newline with a space, drop the {{BREAK}} placeholders that follow a
  # newline, and collapse runs of three or more newlines to two.
  {
    /<\/i>\s+<i>/     => ' ', 
    /<\/b>\s+<b>/     => ' ', 
    '</i><i>'         => '', 
    '</b><b>'         => '', 
    /<\/p>\n<p[^>]*>/ => " ", 
    "\n{{BREAK}}"     => "\n", 
    /\n{3,}/          => "\n\n", 
  }.each { |k,v| txt.gsub!(k,v) }

  doc = Nokogiri::HTML(txt)
  # Wrap the content of every caption paragraph in square brackets.
  doc.css('p[class="caption"]').each do |p|
    p.inner_html="[#{p.inner_html.strip}]"
  end
  txt = doc.to_html

  return txt
 end

 # Converts the cleaned-up HTML body to Markdown-flavoured text.
 #
 # In order: literal '*' and '_' are backslash-escaped; a hyphen at the start
 # of a line (after a newline) is escaped; <i>/</i> become '*' and <b>/</b>
 # become '__'; speaker paragraphs become '#### ' headings and caption
 # paragraphs '##### ' headings; all other <p> tags and every </p> are
 # removed; parenthesised text is wrapped in underscores, except the literal
 # '(STATEMENT)'.
 #
 # txt - String of HTML; only the inner HTML of its <body> is used.
 #
 # Returns the converted String.
 def md_pass(txt)
   body = Nokogiri::HTML(txt).at_css('body').inner_html.strip

   {
     '*'                   => '\*',
     '_'                   => '\_',
     # Needs a doubled backslash: in a double-quoted string "\-" is just "-",
     # which made this rule a no-op.
     "\n-"                 => "\n\\-",
     /<\/?i>/              => '*',
     /<\/?b>/              => '__',
     '<p class="speaker">' => '#### ',
     '<p class="caption">' => '##### ',
     /<p[^>]*>/            => '',
     '</p>'                => '',
     '('                   => '_(',
     ')'                   => ')_',
     '_(STATEMENT)_'       => '(STATEMENT)',
   }.each { |pattern, replacement| body.gsub!(pattern, replacement) }

   body
 end



 # HELPERS

 # Unwraps every <span> that has no class attribute or an empty one, leaving
 # its children in its place.
 #
 # doc - a parsed Nokogiri document, or a String of HTML (parsed here).
 #
 # Returns the modified document.
 def excise_unstyled_spans(doc)
   doc = Nokogiri::HTML(doc) if doc.is_a?(String)
   doc.css('span:not([class])', 'span[class=""]').each do |span|
     # Move the real child nodes rather than re-parsing inner_html: re-parsing
     # creates copies of nested unstyled spans that are not in this node set
     # and would therefore never be unwrapped.
     span.replace(span.children)
   end
   doc
 end
	require 'nokogiri'

	# (Unless you are the person I am sending this to, you do not need it.)
	#
	# Steps to use:
	# 1. Export a Google Docs file to HTML.
	# 2. Unzip the downloaded archive and put the HTML file's name in @raw_fname below.
	# 3. Edit css_pass (further down) to correctly convert 'c{SOME NUMBER}'-class elements to <i>, <b>, or <u>.
	# 4. Run cleanup()

	# Input file: the HTML exported from Google Docs. Edit this before running.
	@raw_fname ='FILENAME GOES HERE'
	# Derived filenames: each replaces the first '.html' in @raw_fname.
	# @work_fname is the default output of first_pass, @html_fname of second_pass,
	# and @md_fname of final_pass.
	@work_fname = @raw_fname.sub('.html', '_WORK.html')
	@html_fname = @raw_fname.sub('.html', '_EDIT.html')
	@md_fname = @raw_fname.sub('.html', '.md')

	# NOTE(review): @fname is not read anywhere in the visible code -- confirm it is still needed.
	@fname=@work_fname


	# Runs the whole conversion pipeline: first_pass, then second_pass, then
	# final_pass, each reading the file the previous stage wrote.
	#
	# in_fname  - path of the exported HTML file to read (defaults to @raw_fname).
	# out_fname - path of the Markdown file to write (defaults to @md_fname).
	#
	# The two intermediate files are always written to @work_fname and @html_fname.
	#
	# Returns whatever final_pass returns.
	def cleanup(in_fname=@raw_fname, out_fname=@md_fname)
	  # Honour the arguments: they used to be accepted but silently ignored.
	  first_pass(in_fname, @work_fname)
	  second_pass(@work_fname, @html_fname)
	  final_pass(@html_fname, out_fname)
	end



	# Stage 1: read the raw export, run it through readability_pass and save it.
	#
	# in_fname  - path to read (defaults to @raw_fname).
	# out_fname - path to write (defaults to @work_fname).
	#
	# Returns the text that was written.
	def first_pass(in_fname=@raw_fname, out_fname=@work_fname)
	  readability_pass(File.read(in_fname)).tap do |cleaned|
	    File.write(out_fname, cleaned)
	  end
	end

	# Stage 2: read the working HTML, apply css_pass and then line_cleaning_pass,
	# strip surrounding whitespace and save the result.
	#
	# in_fname  - path to read (defaults to @work_fname).
	# out_fname - path to write (defaults to @html_fname).
	#
	# Returns the text that was written.
	def second_pass(in_fname=@work_fname, out_fname=@html_fname)
	  styled = css_pass(File.read(in_fname))
	  result = line_cleaning_pass(styled).strip
	  File.write(out_fname, result)
	  result
	end

	# Stage 3: read the edited HTML, convert it with md_pass, strip surrounding
	# whitespace and save the result.
	#
	# in_fname  - path to read (defaults to @html_fname).
	# out_fname - path to write (defaults to @md_fname).
	#
	# Returns the text that was written.
	def final_pass(in_fname=@html_fname, out_fname=@md_fname)
	  markdown = md_pass(File.read(in_fname)).strip
	  File.write(out_fname, markdown)
	  markdown
	end




	# Normalises exported HTML so that it is easier to read and to post-process.
	#
	# The replacements are applied in the order listed: the <body> tag loses its
	# attributes, each "}" gets a trailing newline, whitespace entities and tabs
	# become spaces, carriage returns, bare <p>/</p> and <br> tags become
	# newlines, typographic quotes/dashes/ellipses and a set of named entities
	# become plain ASCII, <style> tags are put on their own lines, and runs of
	# three or more newlines are collapsed to two.
	#
	# The entity names are spelled out ("&nbsp;", "&quot;", ...): the decoded
	# characters that had replaced them here produced a syntax error and
	# mappings that did nothing.
	#
	# txt - String of HTML. It is left untouched, so frozen strings are accepted.
	#
	# Returns a new String with surrounding whitespace stripped.
	def readability_pass(txt)
	  replacements = {
	    /<body[^>]*>/ => "<body>\n",
	    "}"           => "}\n",

	    "&nbsp;"      => " ",
	    "\t"          => " ",
	    "&Tab;"       => " ",
	    "\r"          => "\n",
	    "&NewLine;"   => "\n",
	    /<\/?p>/      => "\n",
	    /<br\/?>/     => "\n",

	    "“"           => "\"",
	    "”"           => "\"",
	    "&ldquo;"     => "\"",
	    "&rdquo;"     => "\"",
	    "&quot;"      => "\"",
	    "&QUOT;"      => "\"",
	    "‘"           => "'",
	    "’"           => "'",
	    "&rsquo;"     => "'",
	    "&lsquo;"     => "'",
	    "&apos;"      => "'",
	    "&#39;"       => "'",
	    "—"           => "--",
	    "&mdash;"     => "--",
	    "&horbar;"    => "--",
	    "&ndash;"     => "--",
	    "&dash;"      => "-",
	    "…"           => "...",
	    "&hellip;"    => "...",
	    "&mldr;"      => "...",
	    "&nldr;"      => "...",
	    # "&amp;" is decoded after the other entities so "&amp;quot;" stays "&quot;".
	    "&amp;"       => "&",
	    "&AMP;"       => "&",
	    "&copy;"      => "(c)",
	    "&reg;"       => "(R)",

	    '<style type="text/css">' => "\n<style type=\"text/css\">\n",
	    '</style>'                => "\n</style>\n",

	    /\n{3,}/      => "\n\n",
	  }

	  # Non-destructive gsub: the caller's string is never modified in place.
	  cleaned = replacements.reduce(txt) do |acc, (pattern, replacement)|
	    acc.gsub(pattern, replacement)
	  end
	  cleaned.strip
	end

	# Renames the generated 'cN' class names to meaningful ones, drops every
	# other 'cN' class, removes class attributes that end up empty, and unwraps
	# spans that carry no styling (via excise_unstyled_spans).
	#
	# EDIT THE FIRST ENTRIES to match the class numbers of the current export.
	#
	# The patterns are anchored with \b so that "c4" does not also hit "c40"
	# (which used to end up as "itali") and so that text such as "abc123" is not
	# truncated by the catch-all rule.
	# NOTE(review): the rules still run over the whole document, so a standalone
	# word like "c4" in the transcript text would also be rewritten.
	#
	# txt - String of HTML. It is not modified.
	#
	# Returns the rewritten HTML as serialised by Nokogiri.
	def css_pass(txt)
	  class_rules = {
	    /\bc4\b/       => "italic",
	    /\bc6\b/       => "bold",
	    # /\bc4\b/       => "underline",
	    /\bc\d+\b/     => "",
	    / class="\s*"/ => "",
	  }

	  html = class_rules.reduce(txt) do |acc, (pattern, replacement)|
	    acc.gsub(pattern, replacement)
	  end

	  excise_unstyled_spans(Nokogiri::HTML(html)).to_html
	end

	# Classifies paragraphs by their leading indentation, converts styled spans
	# to <i>/<b> tags, merges adjacent runs and paragraphs, and brackets captions.
	#
	# The block parameters and the '*' quantifiers in the paragraph patterns are
	# written out in full; without the quantifiers the patterns only matched
	# two-character paragraphs.
	#
	# txt - String of HTML. NOTE: it is modified in place by the gsub! calls
	#       below, up to the point where txt is reassigned from the parsed document.
	#
	# Returns the resulting HTML as serialised by Nokogiri.
	def line_cleaning_pass(txt)
	  # Pull a closing </p> up onto the previous line and mark empty paragraphs
	  # with a {{BREAK}} placeholder (consumed further down).
	  {
	    "\n</p>"      => "</p>",
	    /<p>\s*<\/p>/ => "{{BREAK}}",
	  }.each { |k,v| txt.gsub!(k,v) }

	  # Each loop tags the remaining bare <p> paragraphs of one kind. The rest of
	  # each pattern also accepts spaces, so the counts act as "at least N leading
	  # spaces" and the order of the loops decides which class wins.

	  # No leading whitespace => caption.
	  while m= /<p>(?<caption>\S[^\n]*\s*)<\/p>/.match(txt)
	    txt.gsub!(m[0], "<p class=\"caption\">#{m[:caption].strip}</p>")
	  end
	  # 30 leading spaces => speaker name, followed by a {{BREAK}} placeholder line.
	  while m= /<p> {30,30}(?<name>[^\n]*\s*)<\/p>/.match(txt)
	    txt.gsub!(m[0], "<p class=\"speaker\">#{m[:name].strip}</p>\n{{BREAK}}")
	  end
	  # 20-25 leading spaces => dialog.
	  while m= /<p> {20,25}(?<dialog>[^\n]*\s*)<\/p>/.match(txt)
	    txt.gsub!(m[0], "<p class=\"dialog\">#{m[:dialog].strip}</p>")
	  end
	  # 10 leading spaces => caption.
	  while m= /<p> {10,10}(?<caption>[^\n]*\s*)<\/p>/.match(txt)
	    txt.gsub!(m[0], "<p class=\"caption\">#{m[:caption].strip}</p>")
	  end

	  doc = Nokogiri::HTML(txt)
	  # Trim whitespace inside every paragraph.
	  doc.css('p').each { |p| p.inner_html=p.inner_html.strip }
	  # Replace styled spans with inline tags: exactly 'italic' => <i>; exactly
	  # 'bold' or 'underline' => <b>; any span with more than one class =>
	  # <i><b>. Spans with a single other class are left as they are.
	  doc.css('span[class]').each do |span|
	    classes = span['class'].strip.split(' ')
	    if    classes==['italic']
	      span.add_previous_sibling("<i>#{span.inner_html}</i>")
	      span.remove
	    elsif [['bold'], ['underline']].include?(classes)
	      span.add_previous_sibling("<b>#{span.inner_html}</b>")
	      span.remove
	    elsif classes.count > 1
	      span.add_previous_sibling("<i><b>#{span.inner_html}</b></i>")
	      span.remove
	    end
	  end

	  txt = doc.to_html
	  # Merge adjacent <i>/<b> runs, join paragraphs separated by a single
	  # newline with a space, drop the {{BREAK}} placeholders that follow a
	  # newline, and collapse runs of three or more newlines to two.
	  {
	    /<\/i>\s+<i>/     => ' ',
	    /<\/b>\s+<b>/     => ' ',
	    '</i><i>'         => '',
	    '</b><b>'         => '',
	    /<\/p>\n<p[^>]*>/ => " ",
	    "\n{{BREAK}}"     => "\n",
	    /\n{3,}/          => "\n\n",
	  }.each { |k,v| txt.gsub!(k,v) }

	  doc = Nokogiri::HTML(txt)
	  # Wrap the content of every caption paragraph in square brackets.
	  doc.css('p[class="caption"]').each do |p|
	    p.inner_html="[#{p.inner_html.strip}]"
	  end
	  txt = doc.to_html

	  return txt
	end

	# Converts the cleaned-up HTML body to Markdown-flavoured text.
	#
	# In order: literal '*' and '_' are backslash-escaped; a hyphen at the start
	# of a line (after a newline) is escaped; <i>/</i> become '*' and <b>/</b>
	# become '__'; speaker paragraphs become '#### ' headings and caption
	# paragraphs '##### ' headings; all other <p> tags and every </p> are
	# removed; parenthesised text is wrapped in underscores, except the literal
	# '(STATEMENT)'.
	#
	# txt - String of HTML; only the inner HTML of its <body> is used.
	#
	# Returns the converted String.
	def md_pass(txt)
	  body = Nokogiri::HTML(txt).at_css('body').inner_html.strip

	  {
	    # Asterisk escape written out in full; the entry had been reduced to an
	    # empty key and an unterminated string.
	    '*'                   => '\*',
	    '_'                   => '\_',
	    # Needs a doubled backslash: in a double-quoted string "\-" is just "-",
	    # which made this rule a no-op.
	    "\n-"                 => "\n\\-",
	    /<\/?i>/              => '*',
	    /<\/?b>/              => '__',
	    '<p class="speaker">' => '#### ',
	    '<p class="caption">' => '##### ',
	    /<p[^>]*>/            => '',
	    '</p>'                => '',
	    '('                   => '_(',
	    ')'                   => ')_',
	    '_(STATEMENT)_'       => '(STATEMENT)',
	  }.each { |pattern, replacement| body.gsub!(pattern, replacement) }

	  body
	end



	# HELPERS

	# Unwraps every <span> that has no class attribute or an empty one, leaving
	# its children in its place.
	#
	# doc - a parsed Nokogiri document, or a String of HTML (parsed here).
	#
	# Returns the modified document.
	def excise_unstyled_spans(doc)
	  doc = Nokogiri::HTML(doc) if doc.is_a?(String)
	  doc.css('span:not([class])', 'span[class=""]').each do |span|
	    # Move the real child nodes rather than re-parsing inner_html: re-parsing
	    # creates copies of nested unstyled spans that are not in this node set
	    # and would therefore never be unwrapped.
	    span.replace(span.children)
	  end
	  doc
	end