Last active
January 17, 2016 22:52
-
-
Save sergeyf/6b0b52f79f057b746f9a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Based on: https://gist.github.com/bshillingford/6259986edca707ca58dd
Modified to work on Windows by: Sergey Feldman
Jan 17, 2016
Requirements: pdflatex, bibtex
'''
import getpass
import glob
import os
import os.path
import re
import shutil
import smtplib
import subprocess
import tarfile
import urllib
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart

import lxml.html as html
import requests

try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve
# ---------------------------------------------------------------------------
# User configuration -- replace these values with your own details.
# ---------------------------------------------------------------------------
# Paper to convert: an arXiv abs URL (a bare identifier also matches the
# ID pattern used further down).
query = 'http://arxiv.org/abs/1506.05908'
# Address the finished PDF is mailed to.
kindle_email = '[email protected]'
# Gmail account used both as the sender and as the SMTP login.
your_gmail = '[email protected]'
# Directory under which a per-paper working folder is created.
temp_dir = '\\temp'  # where the intermediate files are stored
# Prompt for the Gmail password on the console instead of hard-coding it.
gmailpass = getpass.getpass()
# Paper settings (decrease width/height to increase the apparent font size).
# `width` is the long edge and `height` the short edge of the target page.
landscape = True
width = "6in"
height = "4in"
margin = "0.1in"

# Options handed to the LaTeX `geometry` package.  In landscape the long
# edge becomes the paper width; in portrait the two dimensions are swapped.
if landscape:
    geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)
else:
    geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)
# Pull the new-style arXiv identifier (YYMM.NNNNN with an optional vN suffix)
# out of `query`.  The URL prefix is optional and may be http or https.
id_match = re.match(r'(https?://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', query)
if id_match is None:
    raise ValueError('could not find an arXiv identifier in: ' + query)
arxiv_id = id_match.group('id')
arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id
arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id

# Fetch the abstract page and read its <title>; fail early on an HTTP error
# rather than scraping the title of an error page.
abs_response = requests.get(arxiv_abs)
abs_response.raise_for_status()
arxiv_pgtitle = html.fromstring(abs_response.text.encode('utf8')).xpath('/html/head/title/text()')[0]

# Drop the leading "[id]" tag and collapse every whitespace run to one space.
# NOTE: no flag is passed positionally here -- the 4th positional argument of
# re.sub is `count`, so a flag in that slot would silently cap the number of
# substitutions instead of acting as a flag.
arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle))
# File-system-safe variant of the title, used for the output PDF name.
arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)
# Make a per-paper working directory, download the source archive into it
# and unpack it there.
# The path is made absolute because the process changes into it below and the
# directory is referenced again afterwards (source lookup, final cleanup).
d = os.path.abspath(os.path.join(temp_dir, arxiv_id))
if not os.path.isdir(d):
    os.makedirs(d)  # also creates temp_dir itself when it is missing
cwd = os.getcwd()  # remembered so the cleanup step can change back
os.chdir(d)

tar_fn = arxiv_id + '.tar.gz'
url = 'http://arxiv.org/e-print/' + arxiv_id
urlretrieve(url, tar_fn)

# The archive comes from the network, so refuse members whose resolved path
# would land outside the working directory (e.g. names containing "..").
extract_root = os.path.realpath(os.getcwd())
extract_prefix = os.path.join(extract_root, '')  # root with a trailing separator
with tarfile.open(tar_fn, 'r:gz') as tar:
    for item in tar:
        target = os.path.realpath(os.path.join(extract_root, item.name))
        if target != extract_root and not target.startswith(extract_prefix):
            print('skipping archive member outside the work directory: ' + item.name)
            continue
        tar.extract(item)
# Find the main .tex file among the extracted sources and load its lines.
# A file qualifies when one of its non-comment, non-blank lines contains
# \documentclass.  The file where that line appears earliest wins, so a
# document that starts with e.g. a comment header or \pdfoutput=1 is still
# recognised.  Leaves `texfile` (path) and `src` (raw lines) for later steps.
texfiles = glob.glob(os.path.join(d, '*.tex'))
best = None  # (position of \documentclass among code lines, path, raw lines)
for candidate in texfiles:
    with open(candidate, 'r') as f:
        candidate_lines = f.readlines()
    code_lines = [ln for ln in candidate_lines
                  if ln.strip() and not ln.startswith('%')]
    position = next((i for i, ln in enumerate(code_lines)
                     if '\\documentclass' in ln), None)
    if position is not None and (best is None or position < best[0]):
        best = (position, candidate, candidate_lines)

if best is None:
    # Without a \documentclass there is nothing pdflatex could compile.
    raise RuntimeError('no .tex file containing \\documentclass found in ' + d)

_, texfile, src = best
print('correct file: ' + texfile)
# Rewrite the LaTeX source in `src` for a small e-reader page.

# Filter full-line comments and blank lines for easier debugging.
src = [line for line in src if line.strip() and not line.startswith('%')]
if not src:
    raise ValueError('the selected .tex file contains only comments/blank lines')

# Strip font size, column and paper-size options from the \documentclass
# line.  Falls back to the first line when no \documentclass is found.
class_idx = next((i for i, line in enumerate(src) if '\\documentclass' in line), 0)
class_line = src[class_idx]
class_line = re.sub(r'\b\d+pt\b', '', class_line)
class_line = re.sub(r'\b\w+column\b', '', class_line)
class_line = re.sub(r'\b\w+paper\b', '', class_line)
class_line = re.sub(r'(?<=\[),', '', class_line)  # remove extraneous starting commas
class_line = re.sub(r',(?=[\],])', '', class_line)  # remove extraneous middle/ending commas
src[class_idx] = class_line

# Find \begin{document} (leading whitespace tolerated); the extra preamble
# lines are placed immediately before it.
begindocs = [i for i, line in enumerate(src)
             if line.lstrip().startswith(r'\begin{document}')]
if len(begindocs) != 1:
    raise ValueError('expected exactly one \\begin{document}, found %d' % len(begindocs))

geometry_opts = ','.join(k + '=' + v for k, v in geom_settings.items())
preamble = [
    '\\usepackage{epstopdf}\n',  # so eps will work with pdflatex
    '\\pagestyle{empty}\n',
    '\\usepackage{times}\n',
    '\\usepackage[' + geometry_opts + ']{geometry}\n',
]
if landscape:
    preamble.insert(0, '\\usepackage{pdflscape}\n')
src[begindocs[0]:begindocs[0]] = preamble

# Shrink figures to be at most the size of the page.
# NOTE(review): applied for both orientations; the original comment mentions
# landscape -- confirm this loop was not meant to sit under `if landscape:`.
figure_width = re.compile(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]')


def _fit_figure_to_page(match):
    """Return an \\includegraphics option list bounded by the text width and height.

    A callable replacement is used so the backslashes in the result are taken
    literally; as a template string, ``\\i`` is a bad escape on Python 3.7+.
    """
    mul = match.group(1)
    return ('\\includegraphics[width={mul}\\textwidth,'
            'height={mul}\\textheight,keepaspectratio]').format(mul=mul)


src = [figure_width.sub(_fit_figure_to_page, line) for line in src]
# Write the updated tex, keeping the original next to it as <name>.tex.bak.
os.rename(texfile, texfile + '.bak')
with open(texfile, 'w') as f:
    f.writelines(src)

# Compile: pdflatex, bibtex, then pdflatex twice more to settle references.
# The tools are run from the source's own directory on the bare file name,
# with an argument list rather than a shell string, so spaces or backslashes
# in the path cannot be misparsed.
workdir, tex_name = os.path.split(texfile)
workdir = workdir or os.curdir
jobname = os.path.splitext(tex_name)[0]
# nonstopmode keeps pdflatex from stopping at an interactive prompt when the
# source has errors.
latex_cmd = ['pdflatex', '-interaction=nonstopmode', tex_name]
subprocess.call(latex_cmd, cwd=workdir)
# bibtex takes the job name (it reads <jobname>.aux), not the .tex file name.
subprocess.call(['bibtex', jobname], cwd=workdir)
subprocess.call(latex_cmd, cwd=workdir)
subprocess.call(latex_cmd, cwd=workdir)

built_pdf = os.path.join(workdir, jobname + '.pdf')
if not os.path.isfile(built_pdf):
    raise RuntimeError('pdflatex did not produce ' + built_pdf)

# Give the PDF a readable name derived from the paper title.
file_name = arxiv_title_scrubbed + ".pdf"
os.rename(built_pdf, file_name)
# Send the email: the compiled PDF as the only attachment, relayed through
# Gmail's SMTP server from `your_gmail` to `kindle_email`.
msg = MIMEMultipart()
# Explicit address headers so the message carries a sender and recipient.
msg['From'] = your_gmail
msg['To'] = kindle_email

with open(file_name, 'rb') as pdf_file:  # closed as soon as the bytes are read
    pdf = pdf_file.read()
pdf_part = MIMEApplication(pdf, _subtype='pdf')
pdf_part.add_header('Content-Disposition', 'attachment', filename=file_name)
msg.attach(pdf_part)

server = smtplib.SMTP('smtp.gmail.com:587')
try:
    server.starttls()  # upgrade to TLS before sending credentials
    server.login(your_gmail, gmailpass)
    server.sendmail(your_gmail, kindle_email, msg.as_string())
    server.quit()  # end the SMTP session cleanly
finally:
    # Always release the socket, including when login or sending raised.
    server.close()
# Clean up: step back to the directory the script started in, then delete
# the working directory together with everything inside it.  The chdir has
# to come first so the tree being removed is not the current directory.
os.chdir(cwd)
shutil.rmtree(d)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment