Created
April 23, 2020 00:56
-
-
Save Snarp/794575bf3e85ad0b37f9feb83f145223 to your computer and use it in GitHub Desktop.
Converts GDocs files in a specific format to MD
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
# (Unless you are the person I am sending this to, you do not need it.)
#
# Steps to use:
# 1. Export a Google Docs file to HTML.
# 2. Unzip the export and set @raw_fname below to the HTML file's name.
# 3. Edit css_pass (further down) so that the right 'c{SOME NUMBER}' classes are mapped to italic, bold, or underline.
# 4. Run cleanup()
# Input file: the HTML exported from Google Docs. Edit this before running.
@raw_fname = 'FILENAME GOES HERE'

# Derive every intermediate/output name from the input name with its trailing
# .html/.htm extension removed. Anchoring on the extension (rather than
# replacing the first ".html" found anywhere in the name) guarantees that the
# derived names always differ from the input, so no pass can overwrite the
# file it is reading from, even when the input has no ".html" in it at all.
fname_stem = @raw_fname.sub(/\.html?\z/i, '')
@work_fname = "#{fname_stem}_WORK.html" # default output of first_pass
@html_fname = "#{fname_stem}_EDIT.html" # default output of second_pass
@md_fname = "#{fname_stem}.md" # default output of final_pass
# Alias of the work file name; not read by any method visible in this file.
@fname = @work_fname
# Runs the whole conversion pipeline: raw HTML -> work HTML -> edited HTML ->
# Markdown.
#
# in_fname  - path of the raw Google Docs HTML export (default: @raw_fname).
# out_fname - path the final Markdown is written to (default: @md_fname).
#
# The two intermediate files always go to @work_fname and @html_fname.
# Returns whatever final_pass returns (the Markdown text).
def cleanup(in_fname = @raw_fname, out_fname = @md_fname)
  # Honour the arguments: previously they were accepted but ignored in favour
  # of the instance-variable defaults.
  first_pass(in_fname, @work_fname)
  second_pass(@work_fname, @html_fname)
  final_pass(@html_fname, out_fname)
end
# Stage 1: make the raw export readable.
#
# Reads in_fname, runs its contents through readability_pass, writes the
# result to out_fname and returns that same text.
def first_pass(in_fname=@raw_fname, out_fname=@work_fname)
  readability_pass(File.read(in_fname)).tap do |readable|
    File.write(out_fname, readable)
  end
end
# Stage 2: turn the work file into the hand-editable HTML file.
#
# Reads in_fname, applies css_pass and then line_cleaning_pass, strips the
# surrounding whitespace, writes the result to out_fname and returns it.
def second_pass(in_fname=@work_fname, out_fname=@html_fname)
  styled = css_pass(File.read(in_fname))
  cleaned = line_cleaning_pass(styled).strip
  File.write(out_fname, cleaned)
  cleaned
end
# Stage 3: convert the edited HTML file to Markdown.
#
# Reads in_fname, runs it through md_pass, strips the surrounding whitespace,
# writes the Markdown to out_fname and returns it.
def final_pass(in_fname=@html_fname, out_fname=@md_fname)
  md_pass(File.read(in_fname)).strip.tap do |markdown|
    File.write(out_fname, markdown)
  end
end
# Normalises a raw Google Docs HTML export into something readable/diffable:
# puts <body>, CSS rules and the <style> tags on their own lines, turns bare
# <p>/<br> tags and carriage returns into newlines, flattens typographic
# quotes, dashes and ellipses to plain ASCII, and collapses runs of blank
# lines.
#
# txt - the HTML text (String). It is NOT modified; frozen strings are fine.
#
# Returns a new, stripped String.
#
# NOTE(review): in the copy of this script that was reviewed, the HTML entity
# names in this table had been decoded (leaving duplicate keys, no-op pairs
# such as "&" => "&", and an unparseable `"""` literal). The entity spellings
# below (&nbsp;, &ldquo;, &#39;, &amp;, ...) are a best-effort reconstruction
# and should be confirmed against the original script. Literal Unicode
# characters are written as \u escapes so the file does not depend on editor
# encoding. Two entries whose keys were a literal tab and a literal newline
# were dropped: the tab duplicates "\t", and newline => "\n" changes nothing.
def readability_pass(txt)
  # Order matters: rules are applied top to bottom, one full pass each.
  replacements = [
    [/<body[^>]*>/, "<body>\n"],
    ['}', "}\n"],
    # whitespace
    ['&nbsp;', ' '],
    ["\u00A0", ' '],
    ["\t", ' '],
    ["\r", "\n"],
    # bare paragraph / line-break tags (tags with attributes are left alone)
    [/<\/?p>/, "\n"],
    [/<br\/?>/, "\n"],
    # double quotes
    ["\u201C", '"'],
    ["\u201D", '"'],
    ['&ldquo;', '"'],
    ['&rdquo;', '"'],
    ['&quot;', '"'],
    ['&#34;', '"'],
    # single quotes / apostrophes
    ["\u2018", "'"],
    ["\u2019", "'"],
    ['&rsquo;', "'"],
    ['&lsquo;', "'"],
    ['&#39;', "'"],
    ['&apos;', "'"],
    # dashes
    ["\u2014", '--'],
    ['&mdash;', '--'],
    ["\u2015", '--'],
    ["\u2013", '--'],
    ["\u2010", '-'],
    # ellipses
    ["\u2026", '...'],
    ['&hellip;', '...'],
    ['&#8230;', '...'],
    ["\u2025", '...'],
    # ampersands come after the other entities so "&amp;nbsp;" is not
    # double-decoded into a space
    ['&amp;', '&'],
    ['&#38;', '&'],
    # symbols
    ["\u00A9", '(c)'],
    ['&copy;', '(c)'],
    ["\u00AE", '(R)'],
    ['&reg;', '(R)'],
    # put the stylesheet tags on their own lines
    ['<style type="text/css">', "\n<style type=\"text/css\">\n"],
    ['</style>', "\n</style>\n"],
    # finally, squeeze 3+ newlines down to one blank line
    [/\n{3,}/, "\n\n"]
  ]

  # Non-bang gsub builds a new string at each step, leaving the caller's
  # string untouched.
  cleaned = replacements.reduce(txt) do |acc, (pattern, replacement)|
    acc.gsub(pattern, replacement)
  end
  cleaned.strip
end
# Renames Google Docs' generated 'cN' classes to meaningful ones, drops every
# other 'cN' class, removes class attributes that end up empty, and finally
# unwraps spans that carry no class (via excise_unstyled_spans).
#
# EDIT THE TABLE BELOW for each document: look in the exported <style> block
# to see which 'cN' class is italic / bold / underline in that export.
#
# txt - HTML text (String). It is not modified.
#
# Returns the resulting HTML as a String (Nokogiri's to_html output).
def css_pass(txt)
  # \b anchors keep 'c4' from matching inside 'c41' (which used to become
  # 'italic1' and then 'itali') and keep the catch-all from eating text such
  # as 'abc123'. Order matters: specific renames first, catch-all after.
  class_renames = [
    [/\bc4\b/, 'italic'],
    [/\bc6\b/, 'bold'],
    # [/\bcN\b/, 'underline'],
    [/\bc\d+\b/, ''],
    [/ class="\s*"/, '']
  ]
  renamed = class_renames.reduce(txt) do |html, (pattern, replacement)|
    html.gsub(pattern, replacement)
  end

  excise_unstyled_spans(Nokogiri::HTML(renamed)).to_html
end
# Classifies paragraphs by their leading indentation, converts styled spans
# to <i>/<b>, merges adjacent paragraphs and brackets captions.
#
# Steps, in order:
#   1. Pull "</p>" up onto the previous line and mark empty paragraphs with
#      a {{BREAK}} placeholder.
#   2. Tag each bare <p> by indentation: no indent or 10 spaces => caption,
#      30 spaces => speaker (followed by a {{BREAK}}), 20-25 => dialog.
#      (An earlier variant of these patterns used [^<]* instead of [^\n]*.)
#   3. Replace span.italic with <i>, span.bold / span.underline with <b>, and
#      any span with more than one class with <i><b>.
#   4. Join neighbouring <i>/<b> runs and consecutive paragraphs, drop the
#      {{BREAK}} placeholders, squeeze blank lines.
#   5. Wrap caption paragraphs' contents in [ ].
#
# txt - HTML text (String). Steps 1 and 2 modify this string in place.
#
# Returns the resulting HTML as a String (Nokogiri's to_html output).
def line_cleaning_pass(txt)
  txt.gsub!("\n</p>", "</p>")
  txt.gsub!(/<p>\s*<\/p>/, "{{BREAK}}")

  # [pattern, class to assign, text appended after the paragraph]
  paragraph_rules = [
    [/<p>(?<caption>\S[^\n]*\s*)<\/p>/, 'caption', ''],
    [/<p> {30,30}(?<name>[^\n]*\s*)<\/p>/, 'speaker', "\n{{BREAK}}"],
    [/<p> {20,25}(?<dialog>[^\n]*\s*)<\/p>/, 'dialog', ''],
    [/<p> {10,10}(?<caption>[^\n]*\s*)<\/p>/, 'caption', '']
  ]
  paragraph_rules.each do |pattern, css_class, trailer|
    # Re-match from the start after every replacement; each round swaps all
    # literal occurrences of the matched paragraph for its tagged form.
    while (hit = pattern.match(txt))
      txt.gsub!(hit[0], "<p class=\"#{css_class}\">#{hit[1].strip}</p>#{trailer}")
    end
  end

  doc = Nokogiri::HTML(txt)
  doc.css('p').each { |para| para.inner_html = para.inner_html.strip }

  doc.css('span[class]').each do |span|
    class_list = span['class'].strip.split(' ')
    wrapped =
      if class_list == ['italic']
        "<i>#{span.inner_html}</i>"
      elsif class_list == ['bold'] || class_list == ['underline']
        "<b>#{span.inner_html}</b>"
      elsif class_list.count > 1
        "<i><b>#{span.inner_html}</b></i>"
      end
    # Spans with a single, unrecognised class are left as they are.
    next unless wrapped

    span.add_previous_sibling(wrapped)
    span.remove
  end

  merged = doc.to_html
              .gsub(/<\/i>\s+<i>/, ' ')
              .gsub(/<\/b>\s+<b>/, ' ')
              .gsub('</i><i>', '')
              .gsub('</b><b>', '')
              .gsub(/<\/p>\n<p[^>]*>/, " ")
              .gsub("\n{{BREAK}}", "\n")
              .gsub(/\n{3,}/, "\n\n")

  doc = Nokogiri::HTML(merged)
  doc.css('p[class="caption"]').each do |caption|
    caption.inner_html = "[#{caption.inner_html.strip}]"
  end
  doc.to_html
end
# Converts the cleaned-up HTML body into Markdown text.
#
# Escapes characters Markdown would interpret (*, _, and a hyphen at the
# start of a line), maps <i>/<b> to * and __, turns speaker/caption
# paragraphs into #### / ##### headings, drops the remaining <p> tags, and
# italicises parenthesised text (except the literal "(STATEMENT)").
#
# txt - HTML text (String).
#
# Returns the Markdown as a String ("" if the parsed document has no <body>).
def md_pass(txt)
  body = Nokogiri::HTML(txt).at_css('body')
  # at_css returns nil when nothing matches; avoid a NoMethodError on input
  # that yields no <body>.
  return '' unless body

  # Order matters: escaping runs first so that the *, _ and __ inserted by
  # the later rules are not themselves escaped.
  rules = [
    ['*', '\*'],
    ['_', '\_'],
    # "\n\\-" is newline + backslash + hyphen. The previous "\n\-" was just
    # "\n-", so hyphens at line start were never escaped.
    ["\n-", "\n\\-"],
    [/<\/?i>/, '*'],
    [/<\/?b>/, '__'],
    ['<p class="speaker">', '#### '],
    ['<p class="caption">', '##### '],
    [/<p[^>]*>/, ''],
    ['</p>', ''],
    ['(', '_('],
    [')', ')_'],
    ['_(STATEMENT)_', '(STATEMENT)']
  ]

  # Block form of gsub inserts the replacement verbatim, so the backslashes
  # above are never treated as back-reference escapes.
  rules.reduce(body.inner_html.strip) do |markdown, (pattern, replacement)|
    markdown.gsub(pattern) { replacement }
  end
end
# HELPERS | |
# Unwraps every <span> that has no class attribute (or an empty one),
# leaving its contents in place.
#
# doc - a Nokogiri document, or a String of HTML (which is parsed first).
#
# Returns the (modified) Nokogiri document.
def excise_unstyled_spans(doc)
  doc = Nokogiri::HTML(doc) if doc.is_a?(String)

  # Walk the matches in reverse document order so nested spans are unwrapped
  # innermost-first. Going outermost-first re-parses the outer span's
  # inner_html into brand-new inner span nodes that are not in the node set,
  # so nested unstyled spans would survive the pass.
  doc.css('span:not([class])', 'span[class=""]').reverse_each do |span|
    span.add_previous_sibling(span.inner_html)
    span.remove
  end
  doc
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment