Created
July 6, 2024 19:28
-
-
Save fronbasal/d2e3688016b4b07fd27647aaf9c4f142 to your computer and use it in GitHub Desktop.
SciHub downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import requests | |
from bs4 import BeautifulSoup | |
# Inspired by https://github.com/stormsuresh92/Bulk-download-Sci-Hub-papers
class SciHub:
    """Download the PDF for a single DOI from a Sci-Hub mirror.

    The DOI is POSTed to the mirror at ``url``; the returned HTML is searched
    for the element with ``id="pdf"`` and the document its ``src`` attribute
    points at is saved into ``path``.

    Args:
        doi: Identifier sent as the ``request`` form field.
        path: Directory the PDF is written to (created on demand).
        url: Base URL of the mirror; also used to resolve relative PDF links.
        timeout: Timeout in seconds applied to every HTTP request.
    """

    def __init__(self, doi: str, path: Path, url='https://sci-hub.se/', timeout=60):
        self.url = url
        self.timeout = timeout
        # Accept plain strings as well as Path objects.
        self.path = Path(path)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
        }
        self.payload = {
            'sci-hub-plugin-check': '',
            'request': str(doi),
        }

    @staticmethod
    def _check_response(res, url):
        """Raise ``AssertionError`` when ``res`` is not a successful response.

        An explicit raise is used instead of an ``assert`` statement so the
        check is still performed under ``python -O``; the exception type is
        kept as ``AssertionError`` for callers that already catch it.
        """
        if not res.ok:
            raise AssertionError(
                'Failed to fetch %s, status code: %d' % (url, res.status_code)
            )

    def _send_request(self):
        """POST the DOI lookup form to the mirror and return the response.

        Raises:
            AssertionError: if the mirror answers with an error status.
        """
        res = requests.post(self.url, headers=self.headers, data=self.payload, timeout=self.timeout)
        self._check_response(res, self.url)
        return res

    def _resolve_pdf_url(self, src):
        """Turn the viewer's ``src`` value into an absolute PDF URL.

        Returns ``None`` when ``src`` is missing or, once its ``#fragment`` is
        removed, does not end in ``.pdf``.
        """
        from urllib.parse import urldefrag, urljoin

        if not isinstance(src, str) or not src.strip():
            return None
        # Drop viewer hints such as '#navpanes=0&view=FitH', whatever they are.
        target = urldefrag(src.strip()).url
        if not target.endswith('.pdf'):
            return None
        # urljoin handles protocol-relative ('//host/x.pdf'), root-relative
        # ('/downloads/x.pdf') and absolute links alike, and resolves relative
        # ones against the configured mirror instead of a hard-coded host.
        return urljoin(self.url, target)

    def _extract_url(self, response):
        """Return the absolute PDF URL found in ``response``, or ``None``.

        A "Failed to find" message is printed when no PDF link is present.
        """
        soup = BeautifulSoup(response.content, 'html.parser')
        viewer = soup.find(id='pdf')
        src = viewer.get('src') if viewer is not None else None
        pdf_url = self._resolve_pdf_url(src)
        if pdf_url is None:
            print("Failed to find ", self.payload['request'])
        return pdf_url

    def fetch(self):
        """Look up the DOI and save its PDF into ``self.path``.

        Returns ``None`` in every case; when the result page has no PDF link a
        message is printed and nothing is written.

        Raises:
            AssertionError: if the lookup or the PDF download answers with an
                error status.
        """
        response = self._send_request()
        pdf_url = self._extract_url(response)
        if pdf_url is None:
            return
        pdf_name = pdf_url.rsplit('/', 1)[-1]
        # Same headers/timeout as the lookup so the download cannot hang
        # forever, and a status check so an error page is never saved as a PDF.
        download = requests.get(pdf_url, headers=self.headers, timeout=self.timeout)
        self._check_response(download, pdf_url)
        self.path.mkdir(parents=True, exist_ok=True)
        self.path.joinpath(pdf_name).write_bytes(download.content)
        print("Downloaded %s" % pdf_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment