Skip to content

Instantly share code, notes, and snippets.

@hpcdisrespecter
Created February 3, 2025 23:17
Show Gist options
  • Select an option

  • Save hpcdisrespecter/9d5c3b25c2919ec507435684a8429caa to your computer and use it in GitHub Desktop.

Select an option

Save hpcdisrespecter/9d5c3b25c2919ec507435684a8429caa to your computer and use it in GitHub Desktop.
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
def download_req(url, name):
    """Download *url* with requests and save the response body to *name*.

    If *name* already exists, a numeric suffix (``_1``, ``_2``, ...) is
    inserted before the file extension so existing files are never
    overwritten.  Prints the final filename on success.

    Note: the (possibly renamed) filename is returned so callers can
    locate the file that was actually written.
    """
    with requests.get(url) as r:
        # Make sure we do not clobber an existing file: probe
        # name_1.ext, name_2.ext, ... until a free slot is found.
        if os.path.exists(name):
            # os.path.splitext handles filenames with zero or multiple
            # dots correctly, unlike the naive name.split(".") approach
            # (which raises IndexError on "README" and mis-splits
            # "paper.v2.pdf").  suffix keeps its leading dot.
            prefix, suffix = os.path.splitext(name)
            n = 1
            while os.path.exists(f"{prefix}_{n}{suffix}"):
                n += 1
            name = f"{prefix}_{n}{suffix}"
        with open(name, "wb") as f:
            f.write(r.content)
    print(f"Downloaded {name}")
    return name
# Function to download the PDF
def download_pdf(doi):
    """Resolve *doi* via sci-hub and download the paper's PDF.

    Renders the sci-hub page for the DOI in headless Chrome, scrapes the
    "save" button's inline ``onclick`` handler for the PDF URL, then
    downloads it with :func:`download_req`.  Progress and errors are
    printed; the function returns ``None``.
    """
    # Construct the URL for the DOI.  Strip whitespace so DOIs read from
    # a file (with trailing newlines) do not corrupt the URL.
    base_url = "https://sci-hub.se/"  # Change to the actual site URL
    full_url = f"{base_url}{doi.strip()}"

    # Set up headless Selenium Chrome.
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        # Open the page and give it a moment to render.
        driver.get(full_url)
        time.sleep(3)

        # A redirect away from sci-hub means the DOI was not resolved.
        if "sci-hub" not in driver.current_url:
            print(f"Invalid DOI: {doi}")
            return

        # Parse the rendered page and collect buttons with inline
        # onclick handlers (the save button embeds the PDF URL there).
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        buttons = soup.find_all('button', onclick=True)

        for button in buttons:
            button_text = button.text.strip()
            # Modify this condition based on the button text you're targeting.
            if 'save' not in button_text.lower():
                continue

            # The onclick looks like location.href='<url>'; the URL is
            # the first single-quoted token.
            onclick = button['onclick']
            pdf_url = onclick.split("'")[1]

            # Normalize: drop the download flag, then make the URL
            # absolute (protocol-relative "//host/..." vs site-relative).
            pdf_url = pdf_url.replace("?download=true", "")
            if pdf_url.startswith("//"):
                pdf_url = f"https:{pdf_url}"
            else:
                pdf_url = f"{base_url}{pdf_url}"
            print(f"Downloading from {pdf_url}")

            # Use the last portion of the URL as the filename.
            pdf_name = pdf_url.split("/")[-1]
            print(f"Saving as {pdf_name}")

            # Retry the download a bounded number of times rather than
            # looping forever on a permanently failing URL.
            for _attempt in range(5):
                try:
                    download_req(pdf_url, pdf_name)
                except Exception as e:
                    print(f"An error occurred: {e}")
                    time.sleep(3)
                    continue
                # A tiny file is almost certainly an error page, not a
                # PDF.  NOTE(review): download_req may rename the file on
                # a name collision, in which case pdf_name here refers to
                # the pre-existing file — confirm against download_req.
                if os.path.getsize(pdf_name) < 1000:
                    print(f"Failed to download {pdf_name}")
                    os.remove(pdf_name)
                else:
                    print(f"Downloaded {pdf_name}")
                break

            # Only the first matching save button is acted on.
            break
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser, even on failure.
        driver.quit()
# Example usage: read one DOI per line from dois.txt and fetch each paper.
if __name__ == "__main__":
    with open("dois.txt", "r") as doi_list:
        dois = doi_list.readlines()
    for doi in dois:
        # Strip the trailing newline readlines() leaves on each entry so
        # it does not end up embedded in the request URL; skip blanks.
        doi = doi.strip()
        if doi:
            download_pdf(doi)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment