Skip to content

Instantly share code, notes, and snippets.

@Snarp
Created April 23, 2020 00:56
Show Gist options
  • Save Snarp/794575bf3e85ad0b37f9feb83f145223 to your computer and use it in GitHub Desktop.
Save Snarp/794575bf3e85ad0b37f9feb83f145223 to your computer and use it in GitHub Desktop.
Converts GDocs files in a specific format to MD
require 'nokogiri'
# (Unless you are the person I am sending this to, you do not need it.)
#
# Steps to use:
# 1. Export a Google Docs file to HTML.
# 2. Unzip the exported archive and set @raw_fname (below) to the name of the extracted .html file.
# 3. Edit css_pass (further down) to correctly convert 'c{SOME NUMBER}'-class elements to <i>, <b>, or <u>.
# 4. Run cleanup()
# Name of the exported HTML file to convert. The working, edited-HTML and
# Markdown filenames are derived from it by replacing the first '.html'
# found in the name.
@raw_fname = 'FILENAME GOES HERE'
@work_fname, @html_fname, @md_fname =
  %w[_WORK.html _EDIT.html .md].map { |suffix| @raw_fname.sub('.html', suffix) }
@fname = @work_fname
# Runs the whole conversion pipeline: exported HTML -> working HTML ->
# edited HTML -> Markdown.
#
# The two intermediate files are always written to @work_fname and
# @html_fname; only the first input and the final output are configurable.
#
# @param in_fname [String] path of the exported HTML file to read
# @param out_fname [String] path of the Markdown file to write
# @return whatever final_pass returns (the Markdown text)
def cleanup(in_fname = @raw_fname, out_fname = @md_fname)
  # Use the arguments: they used to be accepted but silently ignored in
  # favour of @raw_fname / @md_fname.
  first_pass(in_fname, @work_fname)
  second_pass(@work_fname, @html_fname)
  final_pass(@html_fname, out_fname)
end
# Pass 1: reads in_fname, runs the text through readability_pass and saves
# the result to out_fname.
#
# @param in_fname [String] file to read
# @param out_fname [String] file to write
# @return the text that was written
def first_pass(in_fname=@raw_fname, out_fname=@work_fname)
  readability_pass(File.read(in_fname)).tap do |readable|
    File.write(out_fname, readable)
  end
end
# Pass 2: reads in_fname, applies css_pass and then line_cleaning_pass,
# strips surrounding whitespace and saves the result to out_fname.
#
# @param in_fname [String] file to read
# @param out_fname [String] file to write
# @return the text that was written
def second_pass(in_fname=@work_fname, out_fname=@html_fname)
  styled = css_pass(File.read(in_fname))
  cleaned = line_cleaning_pass(styled).strip
  File.write(out_fname, cleaned)
  cleaned
end
# Pass 3: reads in_fname, converts it with md_pass, strips surrounding
# whitespace and saves the result to out_fname.
#
# @param in_fname [String] file to read
# @param out_fname [String] file to write
# @return the text that was written
def final_pass(in_fname=@html_fname, out_fname=@md_fname)
  md_pass(File.read(in_fname)).strip.tap do |markdown|
    File.write(out_fname, markdown)
  end
end
# Normalises exported HTML so it is easier to read and post-process:
# simplifies the <body> tag, breaks lines after "}" and around the <style>
# element, turns bare <p>, </p> and <br> tags into newlines, and replaces
# typographic characters and common HTML entities with plain-ASCII text.
#
# The substitutions run in the order listed, each over the whole text.
# The input string is never modified, so frozen strings are accepted.
#
# @param txt [String] HTML text
# @return [String] a new, stripped string with all substitutions applied
def readability_pass(txt)
  substitutions = {
    /<body[^>]*>/ => "<body>\n",
    "}" => "}\n",
    "&nbsp;" => " ",
    "\t" => " ",
    "&Tab;" => " ",
    "\r" => "\n",
    "&NewLine;" => "\n",
    /<\/?p>/ => "\n",
    /<br\/?>/ => "\n",
    "“" => "\"",
    "”" => "\"",
    "&ldquo;" => "\"",
    "&rdquo;" => "\"",
    "&quot;" => "\"",
    "&QUOT;" => "\"",
    "‘" => "'",
    "’" => "'",
    "&rsquo;" => "'",
    "&lsquo;" => "'",
    "&apos;" => "'",
    "&#39;" => "'",
    "—" => "--",
    "&mdash;" => "--",
    "&horbar;" => "--",
    "&ndash;" => "--",
    "&dash;" => "-",
    "…" => "...",
    "&hellip;" => "...",
    "&mldr;" => "...",
    "&nldr;" => "...",
    "&amp;" => "&",
    "&AMP;" => "&",
    "&copy;" => "(c)",
    "&reg;" => "(R)",
    '<style type="text/css">' => "\n<style type=\"text/css\">\n",
    '</style>' => "\n</style>\n",
    # Runs last so blank lines introduced by the rules above are collapsed.
    /\n{3,}/ => "\n\n",
  }
  # Non-destructive gsub: the caller's string is left untouched.
  cleaned = substitutions.reduce(txt) do |acc, (pattern, replacement)|
    acc.gsub(pattern, replacement)
  end
  cleaned.strip
end
# Renames the generated "c<number>" class names to semantic ones
# ("italic", "bold", ...), drops every other "c<number>" class, removes
# class attributes that end up empty, and finally unwraps spans that carry
# no class via excise_unstyled_spans.
#
# Edit the first entries of the table to match the class numbers used by
# the document being converted.
#
# The renaming is a plain text substitution over the whole document, so it
# also applies inside the <style> block. The patterns are anchored with
# word boundaries so that "c4" does not match the start of "c40" and
# "c<number>" is not stripped out of the middle of a longer word.
# NOTE(review): a stand-alone "c4"-like token in body text is still rewritten.
#
# @param txt [String] HTML text (not modified)
# @return [String] the re-serialised HTML
def css_pass(txt)
  class_rules = {
    /\bc4\b/ => "italic",
    /\bc6\b/ => "bold",
    # /\bcN\b/ => "underline",
    /\bc\d+\b/ => "",
    / class="\s*"/ => "",
  }
  renamed = class_rules.reduce(txt) do |acc, (pattern, replacement)|
    acc.gsub(pattern, replacement)
  end
  doc = excise_unstyled_spans(Nokogiri::HTML(renamed))
  doc.to_html
end
# Classifies paragraphs by their leading whitespace, converts styled spans
# to <i>/<b> tags, merges neighbouring paragraphs and wraps captions in
# square brackets.
#
# txt is modified in place by the gsub! calls before being re-parsed.
#
# @param txt [String] HTML text
# @return [String] the re-serialised HTML document
def line_cleaning_pass(txt)
# Pull a closing </p> up onto the previous line and mark empty paragraphs
# with a {{BREAK}} placeholder.
{
"\n</p>" => "</p>",
/<p>\s*<\/p>/ => "{{BREAK}}",
}.each { |k,v| txt.gsub!(k,v) }
# indents = {
# / {0,0}/ => "caption", # (no leading whitespace)
# / {30,30}/ => "speaker",
# / {20,25}/ => "dialog",
# / {10,10}/ => "caption",
# }
# Each loop below takes the first remaining match and replaces every
# occurrence of that exact text with a class-tagged paragraph. The tagged
# paragraph no longer starts with a bare "<p>", so the loop ends once no
# untagged candidates are left.
# NOTE(review): the captured text is used inside a gsub! replacement string,
# so backslash sequences in it would presumably be interpreted -- confirm.
#
# No leading whitespace -> caption.
# while m= /<p>(?<caption>\S[^<]*\s*)<\/p>/.match(txt)
while m= /<p>(?<caption>\S[^\n]*\s*)<\/p>/.match(txt)
txt.gsub!(m[0], "<p class=\"caption\">#{m[:caption].strip}</p>")
end
# 30 leading spaces -> speaker, followed by a {{BREAK}} placeholder line.
# (Any further spaces are absorbed by the capture and stripped.)
# while m= /<p> {30,30}(?<name>[^<]*\s*)<\/p>/.match(txt)
while m= /<p> {30,30}(?<name>[^\n]*\s*)<\/p>/.match(txt)
txt.gsub!(m[0], "<p class=\"speaker\">#{m[:name].strip}</p>\n{{BREAK}}")
end
# 20-25 leading spaces -> dialog.
# while m= /<p> {20,25}(?<dialog>[^<]*\s*)<\/p>/.match(txt)
while m= /<p> {20,25}(?<dialog>[^\n]*\s*)<\/p>/.match(txt)
txt.gsub!(m[0], "<p class=\"dialog\">#{m[:dialog].strip}</p>")
end
# 10 leading spaces -> caption.
# while m= /<p> {10,10}(?<caption>[^<]*\s*)<\/p>/.match(txt)
while m= /<p> {10,10}(?<caption>[^\n]*\s*)<\/p>/.match(txt)
txt.gsub!(m[0], "<p class=\"caption\">#{m[:caption].strip}</p>")
end
doc = Nokogiri::HTML(txt)
# Trim whitespace inside every paragraph.
doc.css('p').each { |p| p.inner_html=p.inner_html.strip }
# Replace styled spans with inline tags: exactly "italic" -> <i>; exactly
# "bold" or "underline" -> <b>; more than one class -> <i><b>. A span with
# a single class other than these is left as it is.
doc.css('span[class]').each do |span|
classes = span['class'].strip.split(' ')
if classes==['italic']
span.add_previous_sibling("<i>#{span.inner_html}</i>")
span.remove
elsif [['bold'], ['underline']].include?(classes)
span.add_previous_sibling("<b>#{span.inner_html}</b>")
span.remove
elsif classes.count > 1
span.add_previous_sibling("<i><b>#{span.inner_html}</b></i>")
span.remove
end
end
txt = doc.to_html
# Fuse adjacent <i>/<b> runs, join paragraphs separated by a single newline
# into one (with a space between them), drop {{BREAK}} placeholders that
# follow a newline, and collapse runs of three or more newlines to two.
{
/<\/i>\s+<i>/ => ' ',
/<\/b>\s+<b>/ => ' ',
'</i><i>' => '',
'</b><b>' => '',
/<\/p>\n<p[^>]*>/ => " ",
"\n{{BREAK}}" => "\n",
/\n{3,}/ => "\n\n",
}.each { |k,v| txt.gsub!(k,v) }
doc = Nokogiri::HTML(txt)
# Wrap the content of each caption paragraph in square brackets.
doc.css('p[class="caption"]').each do |p|
p.inner_html="[#{p.inner_html.strip}]"
end
txt = doc.to_html
return txt
end
# Converts the cleaned-up HTML body to Markdown-flavoured text.
#
# Steps, applied in order over the body's inner HTML:
# 1. escape literal "*" and "_", and a "-" at the start of a line, so they
#    are not read as Markdown markup;
# 2. turn <i>/<b> tags into "*" / "__";
# 3. turn speaker and caption paragraphs into "#### " / "##### " headings
#    and drop all remaining <p> tags;
# 4. wrap parentheses in underscores, except for the literal "(STATEMENT)".
#
# @param txt [String] HTML document text
# @return [String] the converted text
def md_pass(txt)
  body = Nokogiri::HTML(txt).at_css('body').inner_html.strip
  {
    '*' => '\*',
    '_' => '\_',
    # The replacement must contain a real backslash. The previous "\n\-" was
    # identical to "\n-" in a double-quoted string, so the rule did nothing.
    # "\\\\" is two backslash characters, which gsub emits as one.
    "\n-" => "\n\\\\-",
    /<\/?i>/ => '*',
    /<\/?b>/ => '__',
    '<p class="speaker">' => '#### ',
    '<p class="caption">' => '##### ',
    /<p[^>]*>/ => '',
    '</p>' => '',
    '(' => '_(',
    ')' => ')_',
    '_(STATEMENT)_' => '(STATEMENT)',
  }.reduce(body) { |acc, (pattern, replacement)| acc.gsub(pattern, replacement) }
end
# HELPERS
# Unwraps every <span> that has no class attribute, or an empty one: the
# span's inner HTML is inserted just before it and the span itself is
# then removed.
#
# @param doc [String, Object] HTML text (parsed with Nokogiri::HTML first)
#   or an already-parsed document
# @return the document, modified in place
def excise_unstyled_spans(doc)
  tree = doc.is_a?(String) ? Nokogiri::HTML(doc) : doc
  unstyled = tree.css('span:not([class])', 'span[class=""]')
  unstyled.each do |node|
    node.add_previous_sibling(node.inner_html)
    node.remove
  end
  tree
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment