@GrayXu
Created January 12, 2021 18:14
asplos '21 abstracts pdf crawler
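
The script below fetches the ASPLOS '21 abstracts index page, collects every link ending in ".pdf", and downloads the PDFs concurrently with a 10-worker thread pool, skipping any file that already exists locally.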
# pip install requests beautifulsoup4 lxml
# python3 asplos21.py
import os
from multiprocessing.dummy import Pool as ThreadPool  # thread-based pool

import requests
from bs4 import BeautifulSoup

BASE = "https://asplos-conference.org"

# Fetch the abstracts index page and collect every site-relative link
# that points at a PDF.
r = requests.get(BASE + "/abstracts/", timeout=5)
soup = BeautifulSoup(r.text, features="lxml")
links = []
for a in soup.find_all(name='a'):
    href = a.get('href', '')  # some <a> tags have no href attribute
    if href.endswith(".pdf"):
        links.append(href)

def download(link):
    """Download one abstract PDF, skipping files that already exist."""
    url = BASE + link
    fname = link.split("/")[-1]
    if not os.path.exists(fname):
        r = requests.get(url, timeout=5)
        if r.status_code != 200:
            print("download error:", url, r.status_code)
        else:
            with open(fname, 'wb') as f:
                f.write(r.content)
            print(fname)

pool = ThreadPool(10)  # 10 concurrent downloads
try:
    pool.map(download, links)
except Exception as e:
    print(">" * 50, e)
    raise
finally:
    pool.close()
    pool.join()
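
multiprocessing.dummy.Pool provides the multiprocessing Pool API backed by threads rather than processes; threads are a good fit here because the work is I/O-bound (HTTP downloads), and capping the pool at 10 workers keeps the load on the conference server modest.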