-
-
Save pgbovine/7208714 to your computer and use it in GitHub Desktop.
Download a list of LaTeX files from Google Docs in parallel, and compile locally.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from HTMLParser import HTMLParser, HTMLParseError | |
from htmlentitydefs import name2codepoint | |
import re, json, sys, urllib2 | |
def main():
    """
    Command-line entry point: fetch one Google Doc and print it as LaTeX.

    Takes the document URL or .gdoc filename from sys.argv[1], downloads the
    doc, converts it to plain text, and writes the LaTeX-ready bytes to
    stdout.  When no argument is given, prints a usage message to stderr and
    exits with status 1.
    """
    if len(sys.argv) < 2:
        # The extra "\n" reproduces the newline the print statement appended.
        sys.stderr.write("""
usage: python gdoc2latex.py <URL or .gdoc filename>
example: python gdoc2latex.py https://docs.google.com/document/d/1yEyXxtEeQ5_E7PibjYpofPC6kP4jMG-EieKhwkK7oQE/edit
example: python gdoc2latex.py test.gdoc
""" + "\n")
        # Stop here: without an argument, sys.argv[1] below raises IndexError.
        sys.exit(1)

    html = fetchGoogleDoc(sys.argv[1])
    text = html_to_text(html)
    latex = unicode_to_latex(text)
    sys.stdout.write(latex)
def fetchGoogleDoc(urlOrGdocFile):
    """
    Downloads a Google Doc and returns its exported HTML as a unicode string.

    urlOrGdocFile is either an https:// URL of the doc, or the name of a local
    Google Drive .gdoc file (a JSON file whose "url" entry points at the doc).
    Requires the Google Doc to be readable by anyone with the link
    (Share, Anyone who has the link can view).

    Raises Exception when the argument is neither an https:// URL nor a .gdoc
    filename, when no document ID can be found in the URL, or when the export
    request is redirected to a login page because the doc is not public.
    """
    # find the doc url
    if urlOrGdocFile.startswith("https://"):
        url = urlOrGdocFile
    elif urlOrGdocFile.endswith(".gdoc"):
        # 'with' closes the file even when the JSON is malformed
        with open(urlOrGdocFile, "r") as f:
            content = json.load(f)
        url = content["url"]
    else:
        raise Exception(str(urlOrGdocFile) + " not a google doc URL or .gdoc filename")

    # pull out the document id; it may be the last path component, so a
    # trailing slash (or /edit suffix) is not required
    match = re.search(r"/document/d/([^/?#]+)", url)
    if match is None:
        raise Exception("can't find a google document ID in " + str(urlOrGdocFile))
    docId = match.group(1)

    # construct an export URL
    exportUrl = "https://docs.google.com/document/d/" + docId + "/export?format=html"

    # open a connection to it; the finally clause closes it on every path
    conn = urllib2.urlopen(exportUrl)
    try:
        if "ServiceLogin" in conn.geturl():  # we were redirected to a login -- doc isn't publicly viewable
            raise Exception("""
The google doc
{url}
is not publicly readable. It needs to be publicly
readable in order for this script to work.
To fix this, visit the doc in your web browser,
and use Share >> Change... >> Anyone with Link >> can view.
""".format(url=urlOrGdocFile))

        # download the html
        raw = conn.read()

        # Take the charset from the content-type header when one is declared;
        # otherwise fall back to UTF-8 instead of treating the whole header
        # value as a codec name.
        contentType = conn.headers.get('content-type') or ''
        encoding = ''
        if 'charset=' in contentType:
            encoding = contentType.split('charset=')[-1].split(';')[0].strip(' "\'')
        if not encoding:
            encoding = 'utf-8'
    finally:
        conn.close()

    return raw.decode(encoding)
def html_to_text(html):
    """
    Extract the plain text contained in a piece of HTML, as a unicode string.

    Discarded along the way:
    - everything inside the <head> element
    - the contents of <style> and <script> elements
    - Google Doc sidebar comments
    Entities and character references are decoded into unicode characters.
    """
    extractor = _HTMLToText()
    try:
        extractor.feed(html)
        extractor.close()
    except HTMLParseError:
        # Best effort: keep whatever text was collected before the parser
        # gave up on the markup.
        pass
    collected = extractor.get_text()
    return collected
class _HTMLToText(HTMLParser): | |
""" | |
HTMLParser subclass that finds all the text in an html doc. | |
Used by html_to_text. | |
""" | |
def __init__(self): | |
HTMLParser.__init__(self) | |
self._buf = [] | |
self.hide_output_nesting_level = 0 | |
def handle_starttag(self, tag, attrs): | |
attrsDict = self.to_dict(attrs) | |
if tag in ['script', 'style', 'head']: | |
self.hide_output_nesting_level = 1 | |
elif tag == "a" and "name" in attrsDict and attrsDict["name"].startswith("cmnt_"): | |
# found a Google Doc comment -- remove it | |
self.hide_output_nesting_level = 1 | |
elif self.hide_output_nesting_level > 0: | |
self.hide_output_nesting_level += 1 | |
if tag in ('p', 'br') and not self.at_start_of_line(): | |
self.append('\n') | |
def handle_startendtag(self, tag, attrs): | |
if tag == 'br': | |
self.append('\n') | |
def handle_endtag(self, tag): | |
if tag == 'p': | |
self.append('\n') | |
if self.hide_output_nesting_level > 0: | |
self.hide_output_nesting_level -= 1 | |
def handle_data(self, text): | |
if text: | |
self.append(re.sub(r'\s+', ' ', text)) | |
def handle_entityref(self, name): | |
if name in name2codepoint: | |
c = unichr(name2codepoint[name]) | |
self.append(c) | |
def handle_charref(self, name): | |
n = int(name[1:], 16) if name.startswith('x') else int(name) | |
self.append(unichr(n)) | |
def append(self, str): | |
if self.hide_output_nesting_level == 0: | |
self._buf.append(str) | |
def at_start_of_line(self): | |
return len(self._buf) == 0 or self._buf[-1][-1] == '\n' | |
def to_dict(self,attrs): | |
dict = {} | |
for (name,val) in attrs: | |
dict[name] = val | |
return dict | |
def get_text(self): | |
return re.sub(r' +', ' ', ''.join(self._buf)) | |
def unicode_to_latex(text):
    """
    Turn a unicode string into LaTeX-ready UTF-8 bytes.

    Typographic dashes, curly quotes and no-break spaces are rewritten to
    their plain LaTeX spellings; every other character is kept as-is and the
    result is encoded as UTF-8.
    """
    substitutions = (
        (u'\u2013', "--"),    # en dash
        (u'\u2014', "---"),   # em dash
        (u'\u2018', "`"),     # left single quote
        (u'\u2019', "'"),     # right single quote
        (u'\u201c', "``"),    # left double quote
        (u'\u201d', "''"),    # right double quote
        (u'\xa0', ' '),       # no-break space
    )
    latex = text
    for fancy, plain in substitutions:
        latex = latex.replace(fancy, plain)
    return latex.encode("utf8")
def download_to_file(gdoc_url, out_filename):
    """
    Fetch the Google Doc at gdoc_url, convert it to LaTeX-ready text, and
    write the result to out_filename.  Prints a confirmation line when done.
    """
    latex = unicode_to_latex(html_to_text(fetchGoogleDoc(gdoc_url)))
    with open(out_filename, 'w') as out:
        out.write(latex)
    # Single parenthesised string: prints the same line under the print statement.
    print('Wrote %s to %s' % (gdoc_url, out_filename))
if __name__ == "__main__": | |
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 'all' and 'bib' name actions, not files. Declaring them phony keeps make
# running them even if a file called 'all' or 'bib' exists in this directory.
.PHONY: all bib

# Re-download the Google Docs, then run bibtex followed by one pdflatex pass.
all:
	python parallel_download_gdocs.py
	bibtex paper
	pdflatex paper.tex

# Re-download the Google Docs, then run pdflatex, bibtex, and pdflatex twice more.
bib:
	python parallel_download_gdocs.py
	pdflatex paper.tex
	bibtex paper
	pdflatex paper.tex
	pdflatex paper.tex
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from gdoc2latex import download_to_file | |
from multiprocessing import Process | |
# downloads N GDocs files in parallel and then terminates only when all | |
# files have been downloaded | |
if __name__ == '__main__': | |
# TODO: fill this in with your own strings: | |
args = [ | |
(<GOOGLE DOCS URL>, <OUTPUT FILENAME>), | |
(<GOOGLE DOCS URL>, <OUTPUT FILENAME>), | |
(<GOOGLE DOCS URL>, <OUTPUT FILENAME>), | |
(<GOOGLE DOCS URL>, <OUTPUT FILENAME>), | |
(<GOOGLE DOCS URL>, <OUTPUT FILENAME>), | |
(<GOOGLE DOCS URL>, <OUTPUT FILENAME>), | |
(<GOOGLE DOCS URL>, <OUTPUT FILENAME>), | |
] | |
for tup in args: | |
Process(target=download_to_file, args=tup).start() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment