Skip to content

Instantly share code, notes, and snippets.

@sergeyf
Last active January 17, 2016 22:52
Show Gist options
  • Save sergeyf/6b0b52f79f057b746f9a to your computer and use it in GitHub Desktop.
Save sergeyf/6b0b52f79f057b746f9a to your computer and use it in GitHub Desktop.
'''
Based on: https://gist.github.com/bshillingford/6259986edca707ca58dd
Modified to work on Windows by: Sergey Feldman
Jan 17, 2016
Requirements: pdflatex, bibtex
'''
import getpass
import glob
import os, os.path
import re
import shutil
import smtplib
import subprocess
import tarfile
import urllib
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve

import lxml.html as html
import requests
# Fill in with your own info
# arXiv abstract URL of the paper to convert; a bare identifier such as
# '1506.05908' is accepted as well.
query = 'http://arxiv.org/abs/1506.05908'
# address the finished PDF is mailed to
kindle_email = '[email protected]'
# Gmail account used to log in to the SMTP server and send the message
your_gmail = '[email protected]'
# prompts for the Gmail password on the terminal instead of hard-coding it
gmailpass = getpass.getpass()
# a per-paper subdirectory is created under this path and removed at the end
temp_dir = '\\temp' # where the intermediate files are stored
# Page layout. Smaller width/height values make the text look bigger on
# the device because the page is scaled to fit the screen.
landscape = True
width, height, margin = "6in", "4in", "0.1in"

# Options handed to the LaTeX geometry package; portrait mode simply swaps
# the two page dimensions.
page_w, page_h = (width, height) if landscape else (height, width)
geom_settings = {'paperwidth': page_w, 'paperheight': page_h, 'margin': margin}
# Pull the arXiv identifier (e.g. 1506.05908 or 1506.05908v2) out of the
# query. Both http:// and https:// URLs are accepted, as is a bare id.
id_match = re.match(r'(https?://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', query)
if id_match is None:
    raise ValueError('could not find an arXiv id in query: ' + query)
arxiv_id = id_match.group('id')
arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id
arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id

# The abstract page's <title> looks like "[1506.05908] Paper title".
abs_page = requests.get(arxiv_abs)
abs_page.raise_for_status()  # fail here rather than on a missing <title>
arxiv_pgtitle = html.fromstring(abs_page.text.encode('utf8')).xpath('/html/head/title/text()')[0]

# Drop the leading "[id]" tag and collapse runs of whitespace.
# NOTE: the fourth positional argument of re.sub is `count`, not `flags`;
# passing re.DOTALL there silently capped the number of substitutions at 16.
arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle))
# File-system-safe version of the title, used as the PDF file name.
arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)
# Create a fresh per-paper working directory, download the source archive
# into it and unpack it there. The script stays chdir'ed into the working
# directory; `cwd` remembers where to return to afterwards.
d = os.path.join(temp_dir, arxiv_id)
# A directory left behind by an earlier, aborted run would make os.mkdir
# (and later renames) fail, so start from a clean slate.
if os.path.isdir(d):
    shutil.rmtree(d)
os.makedirs(d)  # also creates temp_dir itself when it does not exist yet
cwd = os.getcwd()
os.chdir(d)

tar_fn = arxiv_id + '.tar.gz'
url = 'http://arxiv.org/e-print/' + arxiv_id
# `urlretrieve` is the Python 2/3 compatible name bound in the import block
# (urllib.urlretrieve does not exist on Python 3).
urlretrieve(url, tar_fn)

# The archive is untrusted input: only unpack regular files and directories
# whose resolved path stays inside the working directory, so a crafted
# member name such as "../../x" or a link cannot write elsewhere.
extract_root = os.path.realpath(os.getcwd())
with tarfile.open(tar_fn, 'r:gz') as tar:
    for member in tar:
        target = os.path.realpath(os.path.join(extract_root, member.name))
        inside_root = (target == extract_root
                       or target.startswith(extract_root + os.sep))
        if not inside_root or not (member.isfile() or member.isdir()):
            print('skipping unsafe archive member: ' + member.name)
            continue
        tar.extract(member)
# Find the main .tex file: the one whose first non-comment, non-blank line
# carries the \documentclass declaration. On success `texfile` is its path
# and `src` holds its raw lines.
texfiles = glob.glob(os.path.join(d, '*.tex'))
for texfile in texfiles:
    with open(texfile, 'r') as f:
        src = f.readlines()
    # Skip leading comments and blank lines (many papers start with a
    # comment header); an empty file yields '' instead of an IndexError.
    first_code_line = next(
        (line for line in src if line.strip() and not line.startswith('%')),
        '')
    if 'documentclass' in first_code_line:
        print('correct file: ' + texfile)
        break
else:
    # Without this the script would silently carry on with whichever file
    # happened to be read last (or crash on an undefined name).
    raise RuntimeError('no .tex file with a \\documentclass line found in ' + d)
# Rewrite the LaTeX source held in `src` for a small e-reader page.

# filter comments/newlines for easier debugging; afterwards src[0] is the
# \documentclass line
src = [line for line in src if line.strip() and not line.startswith('%')]

# strip font size, column stuff, and paper size stuff in documentclass line:
src[0] = re.sub(r'\b\d+pt\b', '', src[0])
src[0] = re.sub(r'\b\w+column\b', '', src[0])
src[0] = re.sub(r'\b\w+paper\b', '', src[0])
src[0] = re.sub(r'(?<=\[),', '', src[0])  # remove extraneous starting commas
src[0] = re.sub(r',(?=[\],])', '', src[0])  # remove extraneous middle/ending commas

# find begin{document}; an explicit check is used because `assert` is
# stripped when Python runs with -O
begindocs = [i for i, line in enumerate(src) if line.startswith(r'\begin{document}')]
if len(begindocs) != 1:
    raise ValueError('expected exactly one \\begin{document} line, found %d'
                     % len(begindocs))

# Extra preamble lines, in the order they appear right before
# \begin{document}.
geometry_opts = ','.join(k + '=' + v for k, v in geom_settings.items())
preamble = [
    '\\usepackage{epstopdf}\n',  # so eps will work with pdflatex
    '\\pagestyle{empty}\n',
    '\\usepackage{times}\n',
    '\\usepackage[' + geometry_opts + ']{geometry}\n',
]
if landscape:
    preamble.insert(0, '\\usepackage{pdflscape}\n')
src[begindocs[0]:begindocs[0]] = preamble

# shrink figures to be at most the size of the page
graphics_re = re.compile(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]')


def _fit_to_page(match):
    """Return an \\includegraphics header capped in both width and height."""
    mul = match.group(1)
    return ('\\includegraphics[width={mul}\\textwidth,'
            'height={mul}\\textheight,keepaspectratio]'.format(mul=mul))


# A callable replacement is inserted literally, so the backslashes need no
# template escaping (a "\i" in a replacement *string* is an error on
# Python 3.7+), and each match keeps its own width multiplier.
src = [graphics_re.sub(_fit_to_page, line) for line in src]
# write updated tex, keeping the untouched original next to it as .bak
os.rename(texfile, texfile + '.bak')
with open(texfile, 'w') as f:
    f.writelines(src)

# pdflatex and bibtex write their output into the current directory under
# the job name, i.e. the .tex file name without directory or extension.
job_name = os.path.splitext(os.path.basename(texfile))[0]

# Argument lists (no shell) keep odd characters in the file name from being
# interpreted; nonstopmode stops pdflatex from waiting for keyboard input
# when the source has errors.
pdflatex_cmd = ['pdflatex', '-interaction=nonstopmode', texfile]
subprocess.call(pdflatex_cmd)
# bibtex expects the job name; given "x.tex" it would look for "x.tex.aux"
subprocess.call(['bibtex', job_name])
# two more passes so citations and cross-references are resolved
subprocess.call(pdflatex_cmd)
subprocess.call(pdflatex_cmd)

file_name = arxiv_title_scrubbed + ".pdf"
os.rename(job_name + '.pdf', file_name)
# send the email: a multipart message whose only part is the PDF attachment
msg = MIMEMultipart()
msg['From'] = your_gmail
msg['To'] = kindle_email
msg['Subject'] = file_name
# close the file right away so the directory can be removed afterwards
with open(file_name, 'rb') as pdf_file:
    pdf_part = MIMEApplication(pdf_file.read(), _subtype='pdf')
pdf_part.add_header('Content-Disposition', 'attachment', filename=file_name)
msg.attach(pdf_part)

server = smtplib.SMTP('smtp.gmail.com:587')
try:
    server.starttls()
    server.login(your_gmail, gmailpass)
    server.sendmail(your_gmail, kindle_email, msg.as_string())
    server.quit()  # polite SMTP goodbye on success
finally:
    # always release the socket, even when login or sending fails
    server.close()

# clean up - delete the directory and its files (leave it first, since a
# directory that is the current one cannot be removed on Windows)
os.chdir(cwd)
shutil.rmtree(d)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment