Skip to content

Instantly share code, notes, and snippets.

@thomasweng15
Created August 25, 2018 21:57
Show Gist options
  • Save thomasweng15/3bc8f0c978ccfe7295fd3d1079c26267 to your computer and use it in GitHub Desktop.
Save thomasweng15/3bc8f0c978ccfe7295fd3d1079c26267 to your computer and use it in GitHub Desktop.
R:SS web scrape
from bs4 import BeautifulSoup
import requests
def save_pdf(base_url, year, href, out_dir="./data", chunk_size=2000, timeout=30):
    """Download one PDF and save it as ``<out_dir>/<year>_<file name>``.

    Args:
        base_url: URL prefix that ``href`` is appended to.
        year: String placed in front of the saved file name.
        href: Link to the PDF, relative to ``base_url``.
        out_dir: Directory the file is written to; created if missing.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait on the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os  # local import so the file's import block stays untouched

    pdf_url = base_url + href
    # Keep only the last path component so an href such as
    # "papers/p01.pdf" does not point into a non-existent subdirectory.
    fname = year + "_" + os.path.basename(href)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    pdf_req = requests.get(pdf_url, stream=True, timeout=timeout)
    try:
        # Fail before opening the file so an error page is never stored
        # on disk under a .pdf name.
        pdf_req.raise_for_status()
        with open(os.path.join(out_dir, fname), 'wb') as fd:
            for chunk in pdf_req.iter_content(chunk_size):
                fd.write(chunk)
    finally:
        # A streamed response holds its connection until it is closed.
        pdf_req.close()
# Two-digit suffixes used to build each proceedings URL (rss01 ... rss14).
years = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14"]
for year in years:
    base_url = "http://www.roboticsproceedings.org/rss%s/" % year
    r = requests.get(base_url + "index.html", timeout=30)
    if not r.ok:
        # Parsing an error page would silently yield no PDFs; report it
        # and carry on with the remaining years instead.
        print("Skipping %s: HTTP %s" % (base_url, r.status_code))
        continue
    data = r.text
    soup = BeautifulSoup(data, features="html.parser")
    # href=True leaves out anchors that have no href attribute; for those
    # link.get("href") returns None and the ".pdf" test raises TypeError.
    for link in soup.find_all('a', href=True):
        href = link.get("href")
        if ".pdf" in href:
            save_pdf(base_url, year, href)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment