Last active
July 15, 2017 02:02
-
-
Save z-a-f/7817a97393a19a0e2329c7d899fb7a72 to your computer and use it in GitHub Desktop.
RSS 2017 -- download all papers and presentations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
import os
import re
import sys
import urllib

try:  # Python 3
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve

import requests
from lxml import html, etree
from tqdm import tqdm
# Page that is scraped for paper and slide links.
PROGRAM_URL = 'http://www.roboticsconference.org/program/detailed/index.html'
# An <a> whose href starts with this prefix begins a new entry.
GROUP_HREF_PREFIX = "javascript:void($('#group"
# Slice of that href used as the entry key and as the download sub-directory.
GROUP_ID_SLICE = (20, 35)
# Root directory for everything that is downloaded.
OUTPUT_DIR = './RSS_assets/'


def collect_assets(events):
    """Build the ``{group_id: {'title', 'paper', 'slides'}}`` table.

    ``events`` is an iterable of ``(action, element)`` pairs such as the one
    produced by ``lxml.etree.iterwalk``.  Only ``'start'`` events on ``<a>``
    elements are considered.  A link whose href starts with
    ``GROUP_HREF_PREFIX`` opens an entry; while an entry is open, a link
    with the text "Full Paper" or "Slides" stores its href on that entry,
    and the "Slides" link also closes the entry.

    ``'paper'`` and ``'slides'`` are only present when the matching link was
    seen.
    """
    results = {}
    current = None
    start, stop = GROUP_ID_SLICE
    for action, el in events:
        if action != 'start' or el.tag != 'a':
            continue
        href = el.get('href')  # None for anchors without an href
        if current is None:
            if href and href.startswith(GROUP_HREF_PREFIX):
                current = href[start:stop]
                # Keep only word characters and apostrophes so the title can
                # be used as a file name; a missing link text gives ''.
                words = re.findall(r"[\w']+", el.text or '')
                results[current] = {'title': '_'.join(words)}
            continue
        # NOTE(review): an entry is only closed by its "Slides" link.  If a
        # paper has no slides, later header links are ignored until the next
        # "Slides" link is seen -- confirm against the page whether that is
        # intended before changing it.
        if el.text == "Full Paper":
            results[current]['paper'] = href
        elif el.text == "Slides":
            results[current]['slides'] = href
            current = None
    return results


def asset_targets(dirname, key, asset):
    """Return ``[(url, local_path), ...]`` for the assets that have a URL.

    The paper is always stored as ``<title>.pdf``; the slides keep the
    extension found in their URL.  An empty title falls back to ``key`` so
    the file name is never just an extension.
    """
    stem = os.path.join(dirname, key, asset.get('title') or key)
    targets = []
    paper_url = asset.get('paper')
    if paper_url:
        targets.append((paper_url, stem + '.pdf'))
    slides_url = asset.get('slides')
    if slides_url:
        # splitext keeps the dot for extensions of any length (".pptx"),
        # which a fixed [-4:] slice does not.
        extension = os.path.splitext(slides_url.split('?', 1)[0])[1]
        targets.append((slides_url, stem + extension))
    return targets


def _ensure_dir(path):
    """Create ``path`` if needed; re-raise unless it already is a directory."""
    try:
        os.mkdir(path)
    except OSError:
        if not os.path.isdir(path):
            raise


def download_assets(results, dirname=OUTPUT_DIR):
    """Download every paper and slide deck listed in ``results``.

    Files that already exist are skipped, so the script can be re-run to
    resume.  A failed download is reported and does not stop the run.
    """
    _ensure_dir(dirname)
    for key, asset in tqdm(results.items()):
        _ensure_dir(os.path.join(dirname, key))
        for url, target in asset_targets(dirname, key, asset):
            if os.path.isfile(target):
                continue
            try:
                urlretrieve(url, target)
            except IOError:
                print("could not load %s" % target)


def main():
    """Fetch the program page, collect the links and download the assets."""
    page = requests.get(PROGRAM_URL)
    # Fail loudly instead of parsing an HTTP error page.
    page.raise_for_status()
    tree = html.fromstring(page.content)
    results = collect_assets(etree.iterwalk(tree, events=("start", "end")))
    download_assets(results)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment