Created
January 12, 2021 18:14
-
-
Save GrayXu/7498f2406384772bee909cfb24554c99 to your computer and use it in GitHub Desktop.
ASPLOS '21 abstracts PDF crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install bs4 requests lxml
# python3 asplos21.py
import requests | |
from bs4 import BeautifulSoup | |
import os | |
from multiprocessing.dummy import Pool as ThreadPool | |
import sys | |
# Thread pool used below to fetch PDFs in parallel (10 concurrent downloads).
pool = ThreadPool(10)

# Collect every PDF link from the ASPLOS '21 abstracts listing page.
links = []
r = requests.get("https://asplos-conference.org/abstracts/", timeout=5)
r.raise_for_status()  # fail fast with a clear error if the listing page is down
soup = BeautifulSoup(r.text, features="lxml")
for anchor in soup.find_all(name='a'):
    # Not every <a> tag carries an href attribute; Tag.get avoids the
    # KeyError that anchor['href'] would raise on such anchors.
    href = anchor.get('href', '')
    if href.endswith(".pdf"):
        links.append(href)
def download(i):
    """Download the i-th PDF from the module-level `links` list.

    Saves the file under its basename in the current directory.
    Skips files that already exist; prints the filename on success,
    or a diagnostic (URL + HTTP status) on failure.
    """
    # Links are site-relative paths, so prepend the host.
    url = "https://asplos-conference.org" + links[i]
    fname = links[i].split("/")[-1]
    if os.path.exists(fname):
        return  # already downloaded; don't refetch

    r = requests.get(url, timeout=5)
    if r.status_code != 200:
        # Name the failing URL and status so a partial run is debuggable.
        print("download error! %s (HTTP %d)" % (url, r.status_code))
    else:
        # 'wb' suffices: we only write, never read back.
        with open(fname, 'wb') as f:
            f.write(r.content)
        print(fname)  # progress indicator: one line per fetched PDF
try:
    # Fan the indices out across the thread pool; download() resolves
    # each index against the module-level `links` list. A range is
    # passed directly -- no need to materialize it into a list first.
    results = pool.map(download, range(len(links)))
except Exception as e:
    # Make the failure visually obvious in the console, then re-raise
    # so the process exits non-zero.
    print(">" * 50, e)
    raise
finally:
    # Release the worker threads whether or not the run succeeded.
    pool.close()
    pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment