Skip to content

Instantly share code, notes, and snippets.

@hpcdisrespecter
Created February 3, 2025 23:17
Show Gist options
  • Select an option

  • Save hpcdisrespecter/9d5c3b25c2919ec507435684a8429caa to your computer and use it in GitHub Desktop.

Select an option

Save hpcdisrespecter/9d5c3b25c2919ec507435684a8429caa to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
def download_req(url, name):
    """Download *url* with requests and save the response body to *name*.

    If *name* already exists, a numeric suffix (``_1``, ``_2``, ...) is
    inserted before the file extension so existing files are never
    overwritten.  Prints the final filename on success.

    Note: the (possibly renamed) filename is returned so callers can
    locate the file that was actually written.
    """
    with requests.get(url) as r:
        # Make sure we do not clobber an existing file: probe
        # name_1.ext, name_2.ext, ... until a free slot is found.
        if os.path.exists(name):
            # os.path.splitext handles filenames with zero or multiple
            # dots correctly, unlike the naive name.split(".") approach
            # (which raises IndexError on "README" and mis-splits
            # "paper.v2.pdf").  suffix keeps its leading dot.
            prefix, suffix = os.path.splitext(name)
            n = 1
            while os.path.exists(f"{prefix}_{n}{suffix}"):
                n += 1
            name = f"{prefix}_{n}{suffix}"
        with open(name, "wb") as f:
            f.write(r.content)
    print(f"Downloaded {name}")
    return name
# Function to download the PDF
def download_pdf(doi):
    """Resolve *doi* via sci-hub and download the paper's PDF.

    Renders the sci-hub page for the DOI in headless Chrome, scrapes the
    "save" button's inline ``onclick`` handler for the PDF URL, then
    downloads it with :func:`download_req`.  Progress and errors are
    printed; the function returns ``None``.
    """
    # Construct the URL for the DOI.  Strip whitespace so DOIs read from
    # a file (with trailing newlines) do not corrupt the URL.
    base_url = "https://sci-hub.se/"  # Change to the actual site URL
    full_url = f"{base_url}{doi.strip()}"

    # Set up headless Selenium Chrome.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        # Open the page and give it a moment to render.
        driver.get(full_url)
        time.sleep(3)

        # A redirect away from sci-hub means the DOI was not resolved.
        if "sci-hub" not in driver.current_url:
            print(f"Invalid DOI: {doi}")
            return

        # Parse the rendered page and collect buttons with inline
        # onclick handlers (the save button embeds the PDF URL there).
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        buttons = soup.find_all('button', onclick=True)

        for button in buttons:
            button_text = button.text.strip()
            # Modify this condition based on the button text you're targeting.
            if 'save' not in button_text.lower():
                continue

            # The onclick looks like location.href='<url>'; the URL is
            # the first single-quoted token.
            onclick = button['onclick']
            pdf_url = onclick.split("'")[1]

            # Normalize: drop the download flag, then make the URL
            # absolute (protocol-relative "//host/..." vs site-relative).
            pdf_url = pdf_url.replace("?download=true", "")
            if pdf_url.startswith("//"):
                pdf_url = f"https:{pdf_url}"
            else:
                pdf_url = f"{base_url}{pdf_url}"
            print(f"Downloading from {pdf_url}")

            # Use the last portion of the URL as the filename.
            pdf_name = pdf_url.split("/")[-1]
            print(f"Saving as {pdf_name}")

            # Retry the download a bounded number of times rather than
            # looping forever on a permanently failing URL.
            for _attempt in range(5):
                try:
                    download_req(pdf_url, pdf_name)
                except Exception as e:
                    print(f"An error occurred: {e}")
                    time.sleep(3)
                    continue
                # A tiny file is almost certainly an error page, not a
                # PDF.  NOTE(review): download_req may rename the file on
                # a name collision, in which case pdf_name here refers to
                # the pre-existing file — confirm against download_req.
                if os.path.getsize(pdf_name) < 1000:
                    print(f"Failed to download {pdf_name}")
                    os.remove(pdf_name)
                else:
                    print(f"Downloaded {pdf_name}")
                break

            # Only the first matching save button is acted on.
            break
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser, even on failure.
        driver.quit()
# Example usage: read one DOI per line from dois.txt and fetch each paper.
if __name__ == "__main__":
    with open("dois.txt", "r") as doi_list:
        dois = doi_list.readlines()
    for doi in dois:
        # Strip the trailing newline readlines() leaves on each entry so
        # it does not end up embedded in the request URL; skip blanks.
        doi = doi.strip()
        if doi:
            download_pdf(doi)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment