sergeyf · January 17, 2016 22:52
diff --git a/arxiv2kindle.py b/arxiv2kindle.py
 '''
 Based on: https://gist.github.com/bshillingford/6259986edca707ca58dd
 Modified to work on Windows by: Sergey Feldman
 Jan 17, 2016

 Requirements: pdflatex, bibtex
 '''

 import requests
 import lxml.html as html
 import re
 import os, os.path
 import glob
 import getpass
 import urllib
 import tarfile
 from email.mime.application import MIMEApplication
 from email.mime.multipart import MIMEMultipart
 import smtplib
 import shutil

 # --- user configuration: edit these values before running ---
 query = 'http://arxiv.org/abs/1506.05908'  # arXiv abs URL (a bare id also parses)
 kindle_email = '[email protected]'  # recipient of the generated PDF
 your_gmail = '[email protected]'  # account used to log in and send
 gmailpass = getpass.getpass()  # asked for interactively when the script starts
 temp_dir = '\\temp'  # where the intermediate files are stored

 # --- page layout (decrease width/height to increase the apparent font size) ---
 landscape = True
 width, height, margin = "6in", "4in", "0.1in"

 # Options handed to the LaTeX geometry package; width and height trade places
 # when landscape is switched off.
 if not landscape:
    geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)
 else:
    geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)
    
 # Pull the arXiv identifier (e.g. 1506.05908 or 1506.05908v2) out of the query,
 # which may be a bare id or an http(s) URL ending in one.
 id_match = re.match(r'(https?://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', query)
 if id_match is None:
    raise ValueError('could not find an arXiv id in: ' + query)
 arxiv_id = id_match.group('id')
 arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id
 arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id

 # The paper title is taken from the <title> element of the abstract page.
 arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]
 # Drop a leading "[...]" tag and collapse whitespace runs to single spaces.
 # No flags are passed: the fourth positional argument of re.sub is `count`,
 # so the former re.DOTALL there silently capped the number of replacements.
 arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle))
 # File-name-safe variant: every run of other characters becomes one underscore.
 arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)

 # make a temporary directory to store the tex files and download
 # tar.gz of the source
 d = os.path.join(temp_dir, arxiv_id)
 if os.path.isdir(d):
    # leftovers from an earlier run that stopped before its clean-up step
    shutil.rmtree(d)
 os.makedirs(d)  # also creates temp_dir itself when it does not exist yet
 cwd = os.getcwd()  # remembered so the script can step back out at the end
 os.chdir(d)

 tar_fn = arxiv_id + '.tar.gz'
 url = 'http://arxiv.org/e-print/' + arxiv_id
 try:
    from urllib.request import urlretrieve  # Python 3 location
 except ImportError:
    urlretrieve = urllib.urlretrieve  # Python 2 location
 urlretrieve(url, tar_fn)
 # NOTE(review): members are extracted wherever their stored paths point; this
 # assumes the downloaded archive is trusted.
 with tarfile.open(tar_fn, 'r:gz') as tar:
    for item in tar:
        tar.extract(item)

 # find the files with .tex
 # and get the main one: the file whose first real (non-comment, non-blank)
 # line holds \documentclass
 texfiles = glob.glob(os.path.join(d, '*.tex'))
 for texfile in texfiles:
    with open(texfile, 'r') as f:
        src = f.readlines()
    # filter full-line comments for easier debugging
    src = [line for line in src if not line.startswith('%')]
    # drop leading blank lines so that src[0] is the first real line; later
    # blank lines are kept because LaTeX treats them as paragraph breaks
    while src and not src[0].strip():
        del src[0]
    if src and 'documentclass' in src[0]:
        print('correct file: ' + texfile)
        break
 else:
    # no candidate matched (or there were no .tex files at all)
    raise RuntimeError('no .tex file starting with \\documentclass found in ' + d)

 # Clean up the \documentclass line (src[0]) by deleting, in this order:
 # point sizes (NNpt), *column options, *paper options, a comma left right
 # after "[", and any comma left directly before "]" or another comma.
 for pattern in (r'\b\d+pt\b',
                 r'\b\w+column\b',
                 r'\b\w+paper\b',
                 r'(?<=\[),',
                 r',(?=[\],])'):
    src[0] = re.sub(pattern, '', src[0])

 # find begin{document} (leading whitespace allowed); there must be exactly one
 begindocs = [i for i, line in enumerate(src) if line.lstrip().startswith(r'\begin{document}')]
 if len(begindocs) != 1:
    # explicit check instead of assert, which is skipped when Python runs with -O
    raise RuntimeError('expected exactly one \\begin{document} line, found %d' % len(begindocs))

 # Extra preamble lines, spliced in directly above \begin{document}.
 geometry_opts = ','.join(k + '=' + v for k, v in geom_settings.items())
 preamble = [
    '\\usepackage{epstopdf}\n',  # so eps will work with pdflatex
    '\\pagestyle{empty}\n',
    '\\usepackage{times}\n',
    '\\usepackage[' + geometry_opts + ']{geometry}\n',
 ]
 if landscape:
    preamble.insert(0, '\\usepackage{pdflscape}\n')
 src[begindocs[0]:begindocs[0]] = preamble

 # shrink figures to be at most the size of the page, now that it's landscape:
 # every \includegraphics[width=X\linewidth] or [width=X\textwidth] becomes
 # [width=X\textwidth,height=X\textheight,keepaspectratio].
 graphics_re = re.compile(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]')
 # Raw template with escaped backslashes so re emits literal "\includegraphics"
 # and "\textwidth" (a bare "\i" in a template is an error on Python 3.7+).
 # \g<1> is the multiplier of the match being replaced, so several figures on
 # one line each keep their own value.
 graphics_repl = r'\\includegraphics[width=\g<1>\\textwidth,height=\g<1>\\textheight,keepaspectratio]'
 src[:] = [graphics_re.sub(graphics_repl, line) for line in src]

 # write updated tex; the extracted original is kept next to it as .bak
 os.rename(texfile, texfile + '.bak')
 with open(texfile, 'w') as f:
    f.writelines(src)

 # compile: pdflatex, bibtex, then pdflatex twice more.
 # Commands are passed as argument lists (no shell), so unusual characters in
 # the file name cannot be interpreted by a shell.
 import subprocess  # local import: only this step needs it
 # nonstopmode keeps pdflatex from waiting for keyboard input on errors
 latex_cmd = ['pdflatex', '-interaction=nonstopmode', texfile]
 # bibtex takes the job name (it looks for <name>.aux), not the .tex file;
 # the working directory is the one holding texfile, so the base name suffices
 tex_base = os.path.splitext(os.path.basename(texfile))[0]
 subprocess.call(latex_cmd)
 subprocess.call(['bibtex', tex_base])  # exit status ignored, as before
 subprocess.call(latex_cmd)
 subprocess.call(latex_cmd)
 file_name = arxiv_title_scrubbed + ".pdf"
 os.rename(os.path.splitext(texfile)[0] + '.pdf', file_name)

 # send the email: a multipart message whose only part is the PDF attachment
 msg = MIMEMultipart()
 with open(file_name, 'rb') as pdf_file:  # closed as soon as it has been read
    pdf = pdf_file.read()
 pdf_part = MIMEApplication(pdf, _subtype='pdf')
 pdf_part.add_header('Content-Disposition', 'attachment', filename=file_name)
 msg.attach(pdf_part)

 server = smtplib.SMTP('smtp.gmail.com:587')
 try:
    server.starttls()
    server.login(your_gmail, gmailpass)
    server.sendmail(your_gmail, kindle_email, msg.as_string())
 finally:
    # release the connection even when login or sending raised
    server.close()

 # clean up - delete the directory and its files.
 # Step back to the directory the script started in first, so that the
 # directory being removed is no longer the current working directory.
 os.chdir(cwd)
 shutil.rmtree(d)
	'''
	Based on: https://gist.github.com/bshillingford/6259986edca707ca58dd
	Modified to work on Windows by: Sergey Feldman
	Jan 17, 2016

	Requirements: pdflatex, bibtex
	'''

	import requests
	import lxml.html as html
	import re
	import os, os.path
	import glob
	import getpass
	import urllib
	import tarfile
	from email.mime.application import MIMEApplication
	from email.mime.multipart import MIMEMultipart
	import smtplib
	import shutil

	# --- user configuration: edit these values before running ---
	query = 'http://arxiv.org/abs/1506.05908'  # arXiv abs URL (a bare id also parses)
	kindle_email = '[email protected]'  # recipient of the generated PDF
	your_gmail = '[email protected]'  # account used to log in and send
	gmailpass = getpass.getpass()  # asked for interactively when the script starts
	temp_dir = '\\temp'  # where the intermediate files are stored

	# --- page layout (decrease width/height to increase the apparent font size) ---
	landscape = True
	width = "6in"
	height = "4in"
	margin = "0.1in"
	# settings for latex geometry package; width and height trade places when
	# landscape is switched off (branch bodies re-indented so the if/else parses)
	if landscape:
	    geom_settings = dict(paperwidth=width, paperheight=height, margin=margin)
	else:
	    geom_settings = dict(paperwidth=height, paperheight=width, margin=margin)

	# Pull the arXiv identifier (e.g. 1506.05908 or 1506.05908v2) out of the query,
	# which may be a bare id or an http(s) URL ending in one.
	id_match = re.match(r'(https?://.*?/)?(?P<id>\d{4}\.\d{4,5}(v\d{1,2})?)', query)
	if id_match is None:
	    raise ValueError('could not find an arXiv id in: ' + query)
	arxiv_id = id_match.group('id')
	arxiv_abs = 'http://arxiv.org/abs/' + arxiv_id
	arxiv_pdf = 'http://arxiv.org/pdf/' + arxiv_id

	# The paper title is taken from the <title> element of the abstract page.
	arxiv_pgtitle = html.fromstring(requests.get(arxiv_abs).text.encode('utf8')).xpath('/html/head/title/text()')[0]
	# Drop a leading "[...]" tag and collapse whitespace runs to single spaces.
	# No flags are passed: the fourth positional argument of re.sub is `count`,
	# so the former re.DOTALL there silently capped the number of replacements.
	arxiv_title = re.sub(r'\s+', ' ', re.sub(r'^\[[^]]+\]\s*', '', arxiv_pgtitle))
	# File-name-safe variant: every run of other characters becomes one underscore.
	arxiv_title_scrubbed = re.sub('[^-_A-Za-z0-9]+', '_', arxiv_title)

	# make a temporary directory to store the tex files and download
	# tar.gz of the source
	d = os.path.join(temp_dir, arxiv_id)
	if os.path.isdir(d):
	    # leftovers from an earlier run that stopped before its clean-up step
	    shutil.rmtree(d)
	os.makedirs(d)  # also creates temp_dir itself when it does not exist yet
	cwd = os.getcwd()  # remembered so the script can step back out at the end
	os.chdir(d)

	tar_fn = arxiv_id + '.tar.gz'
	url = 'http://arxiv.org/e-print/' + arxiv_id
	try:
	    from urllib.request import urlretrieve  # Python 3 location
	except ImportError:
	    urlretrieve = urllib.urlretrieve  # Python 2 location
	urlretrieve(url, tar_fn)
	# NOTE(review): members are extracted wherever their stored paths point; this
	# assumes the downloaded archive is trusted.
	with tarfile.open(tar_fn, 'r:gz') as tar:
	    for item in tar:
	        tar.extract(item)

	# find the files with .tex
	# and get the main one: the file whose first real (non-comment, non-blank)
	# line holds \documentclass
	texfiles = glob.glob(os.path.join(d, '*.tex'))
	for texfile in texfiles:
	    with open(texfile, 'r') as f:
	        src = f.readlines()
	    # filter full-line comments for easier debugging
	    src = [line for line in src if not line.startswith('%')]
	    # drop leading blank lines so that src[0] is the first real line; later
	    # blank lines are kept because LaTeX treats them as paragraph breaks
	    while src and not src[0].strip():
	        del src[0]
	    if src and 'documentclass' in src[0]:
	        print('correct file: ' + texfile)
	        break
	else:
	    # no candidate matched (or there were no .tex files at all)
	    raise RuntimeError('no .tex file starting with \\documentclass found in ' + d)

	# Clean up the \documentclass line (src[0]) by deleting, in this order:
	# point sizes (NNpt), *column options, *paper options, a comma left right
	# after "[", and any comma left directly before "]" or another comma.
	for pattern in (r'\b\d+pt\b',
	                r'\b\w+column\b',
	                r'\b\w+paper\b',
	                r'(?<=\[),',
	                r',(?=[\],])'):
	    src[0] = re.sub(pattern, '', src[0])

	# find begin{document} (leading whitespace allowed); there must be exactly one
	begindocs = [i for i, line in enumerate(src) if line.lstrip().startswith(r'\begin{document}')]
	if len(begindocs) != 1:
	    # explicit check instead of assert, which is skipped when Python runs with -O
	    raise RuntimeError('expected exactly one \\begin{document} line, found %d' % len(begindocs))

	# Extra preamble lines, spliced in directly above \begin{document}.
	geometry_opts = ','.join(k + '=' + v for k, v in geom_settings.items())
	preamble = [
	    '\\usepackage{epstopdf}\n',  # so eps will work with pdflatex
	    '\\pagestyle{empty}\n',
	    '\\usepackage{times}\n',
	    '\\usepackage[' + geometry_opts + ']{geometry}\n',
	]
	if landscape:
	    preamble.insert(0, '\\usepackage{pdflscape}\n')
	src[begindocs[0]:begindocs[0]] = preamble

	# shrink figures to be at most the size of the page, now that it's landscape:
	# every \includegraphics[width=X\linewidth] or [width=X\textwidth] becomes
	# [width=X\textwidth,height=X\textheight,keepaspectratio].
	# The alternation is a plain "|" (an escaped "\|" would only match a literal
	# pipe character and never a real \linewidth / \textwidth).
	graphics_re = re.compile(r'\\includegraphics\[width=([.\d]+)\\(line|text)width\]')
	# Raw template with escaped backslashes so re emits literal "\includegraphics"
	# and "\textwidth" (a bare "\i" in a template is an error on Python 3.7+).
	# \g<1> is the multiplier of the match being replaced, so several figures on
	# one line each keep their own value.
	graphics_repl = r'\\includegraphics[width=\g<1>\\textwidth,height=\g<1>\\textheight,keepaspectratio]'
	src[:] = [graphics_re.sub(graphics_repl, line) for line in src]

	# write updated tex; the extracted original is kept next to it as .bak
	os.rename(texfile, texfile + '.bak')
	with open(texfile, 'w') as f:
	    f.writelines(src)

	# compile: pdflatex, bibtex, then pdflatex twice more.
	# Commands are passed as argument lists (no shell), so unusual characters in
	# the file name cannot be interpreted by a shell.
	import subprocess  # local import: only this step needs it
	# nonstopmode keeps pdflatex from waiting for keyboard input on errors
	latex_cmd = ['pdflatex', '-interaction=nonstopmode', texfile]
	# bibtex takes the job name (it looks for <name>.aux), not the .tex file;
	# the working directory is the one holding texfile, so the base name suffices
	tex_base = os.path.splitext(os.path.basename(texfile))[0]
	subprocess.call(latex_cmd)
	subprocess.call(['bibtex', tex_base])  # exit status ignored, as before
	subprocess.call(latex_cmd)
	subprocess.call(latex_cmd)
	file_name = arxiv_title_scrubbed + ".pdf"
	os.rename(os.path.splitext(texfile)[0] + '.pdf', file_name)

	# send the email: a multipart message whose only part is the PDF attachment
	msg = MIMEMultipart()
	with open(file_name, 'rb') as pdf_file:  # closed as soon as it has been read
	    pdf = pdf_file.read()
	pdf_part = MIMEApplication(pdf, _subtype='pdf')
	pdf_part.add_header('Content-Disposition', 'attachment', filename=file_name)
	msg.attach(pdf_part)

	server = smtplib.SMTP('smtp.gmail.com:587')
	try:
	    server.starttls()
	    server.login(your_gmail, gmailpass)
	    server.sendmail(your_gmail, kindle_email, msg.as_string())
	finally:
	    # release the connection even when login or sending raised
	    server.close()

	# clean up - delete the directory and its files.
	# Step back to the directory the script started in first, so that the
	# directory being removed is no longer the current working directory.
	os.chdir(cwd)
	shutil.rmtree(d)
No results found