Scrape best papers site

docker build --rm --force-rm -t r.j3ss.co/scrape .
docker run --rm -it -v $(pwd)/results:/root/papers r.j3ss.co/scrape

The container's working directory is /root and the script creates its papers directory there, so the bind mount above surfaces it as ./results on the host.
Dockerfile:

FROM python:2-alpine

RUN pip install \
	beautifulsoup4 \
	requests

COPY papers.py /usr/local/bin/
RUN chmod +x /usr/local/bin/papers.py

WORKDIR /root

CMD ["papers.py"]
papers.py:

#!/usr/local/bin/python

# Import python libraries.
import hashlib
import os
import random
import signal
import sys
from time import sleep

# Import external deps.
from bs4 import BeautifulSoup
import requests


# Handle control+C so the scrape can be stopped cleanly.
def signal_handler(sig, frame):
    print('You pressed Ctrl+C! Exiting...')
    sys.exit(0)
def get_google_scholar_pdf(page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0',
    }
    page = requests.get(page, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the first link that points at a PDF and return its href.
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.endswith(".pdf"):
            return href
    return ""
def main(page):
    # Make the parent directory the papers will be saved into.
    parentDir = "papers"
    if not os.path.exists(parentDir):
        os.makedirs(parentDir)

    page = requests.get(page)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Walk the rows of the awards table, skipping the header row.
    for row in soup.find_all('table')[0].find_all('tr')[1:]:
        links = row.select('td a')
        if links:
            link = links[0]
            text = link.text
            href = link['href']

            # Follow the href to get the PDF.
            pdf = get_google_scholar_pdf(href)
            print("pdf: " + pdf)

            # Sleep so the site does not think we are a bot.
            sleep(1)


signal.signal(signal.SIGINT, signal_handler)
main("https://jeffhuang.com/best_paper_awards.html")