Sci-hub Helper Script
# -*- coding: utf-8 -*-
# for Python 2
# todo
# better commenting for each step of the process
# complete handling of captchas, a csv database of them
# possibility of getting citation from google scholar instead
# searching using author and title from RawCitation split parts all lower()
# getting HTML with format: https://scholar.google.com/scholar?q=QUERY&hl=en&as_sdt=0,39#d=gs_cit&u=%2Fscholar%3Fq%3Dinfo%3AYTlLL1HHpUwJ%3Ascholar.google.com%2F%26output%3Dcite%26scirp%3D0%26hl%3Den
# detect, get, and solve captcha
# reload if APA citation element is small
# grab the APA citation element: <div tabindex="0" class="gs_citr">
# which is inside the element: <th scope="row" class="gs_cith">APA</th>
# done
# renaming the downloaded file
# getting the citation from google scholar (or fixing the one given)
# have a list of links completed, store in another text file
# put all the printable items into a file, comma delimited
# wrap inside a for loop
# making a list from a file with many links
# make python check for requests, bs4, and html5lib, and install them if not present
# make changes to prevent sci-hub captcha and maintain session cookies
# interface to handle captchas
# logic to detect when captcha appears, then to wait for human to solve
# possibility of generating the APA citation from wiley
# simply taking the pieces from wiley: authors, title, journal, issue, volume, DOI
# HTML for citation example: https://onlinelibrary.wiley.com/action/showCitFormats?doi=10.1901%2Fjaba.2009.42-741
# element for citation on screen:
# OR
# element for button to download txt file: <input onclick="onCitMgrSubmit()" class="btn" type="submit" name="submit" value="Download">
# from meta tags instead of divs
from time import sleep
from io import BytesIO
import random
try:
    from PIL import Image
except ImportError:
    print 'installing Pillow module'
    import pip
    # note: newer pip releases (10+) no longer expose pip.main(); install the modules manually if this fails
    pip.main(['install', 'pillow'])
    from PIL import Image
try:
    import requests
except ImportError:
    print 'installing requests module'
    import pip
    pip.main(['install', 'requests'])
    import requests
# bs4 specifically is required: the old BeautifulSoup 3 module does not accept
# the features="html5lib" keyword used throughout this script
try:
    from bs4 import BeautifulSoup
except ImportError:
    print 'installing bs4 and html5lib'
    import pip
    pip.main(['install', 'bs4'])
    pip.main(['install', 'html5lib'])
    from bs4 import BeautifulSoup
# initial toggle for captcha, assume we start with no captcha
captchaElement = None
# get links from file
try:
    URLlist = open('links.txt').readlines()
# catch lack of file and give instructions
except IOError:
    print('You need to make a file called "links.txt" in the same folder as the script\n\
and fill it with links from "https://onlinelibrary.wiley.com/doi/abs"\n\
one for each line (aka separated by "\\n")')
    exit(1)
#print URLlist
# maintain all session data from sci-hub for all requests to sci-hub
SCIHUBsesh = requests.Session()
# for loop to do this for all URL in URLlist
for URL in URLlist:
    # catch blank URL
    # which might cause it to error when actually done
    if len(URL) < 6:
        print 'DONE!'
        exit(0)
    # given links from wiley, line delimited; strip the trailing newline
    # alternatively, use links from the DOI.csv and skip parts of script
    # for redoing certain sections
    URL = URL.strip()
    # override list with example for testing
    #URL = 'https://onlinelibrary.wiley.com/doi/abs/10.1901/jaba.2009.42-741'
    # split URL into list of parts by /
    SplitURL = URL.split('/')
    DOIHTML = requests.get(URL).text
    #print DOIHTML
    DOIHTMLparsed = BeautifulSoup(DOIHTML, features="html5lib")
    # text of the DOI link shown on the Wiley page
    DOIURL = DOIHTMLparsed.body.find('a', attrs={'class':'epub-doi'}).text
    #print DOIURL
    URLparts = ['https://sci-hub.tw/', DOIURL]
    SCIHUBURL = ''.join(URLparts)
    #print SCIHUBURL
    SCIHUBHTML = SCIHUBsesh.get(SCIHUBURL).text
    SCIHUBHTMLparsed = BeautifulSoup(SCIHUBHTML, features="html5lib")
    # the sci-hub download button is an anchor with both href and onclick attributes
    DLelement = SCIHUBHTMLparsed.body.find('a', href=True, onclick=True)
    #print DLelement
    # the onclick value wraps the PDF URL in location.href='...';
    # slice off the 15-character prefix and the trailing quote
    DLURL = str(DLelement.attrs['onclick'])[15:-1]
    #print DLURL
    # split up the URL into parts by /
    SplitDLURL = DLURL.split('/')
    fulldate = DOIHTMLparsed.body.find('span', attrs={'class':'epub-date'}).text
    namedate = fulldate.split(' ', 2)[-1]
    try:
        author = DOIHTMLparsed.body.find('a', attrs={'data-id':'a1'}).text
        # strip any digits from the author name
        nameauthor = ''.join(i for i in author if not i.isdigit())
        # handle (remove) first name and middle initial
        # else: handle first name
        authorlast = nameauthor.split(' ')[-1]
        namelist = [authorlast, ' ', '(', namedate, ')', '.pdf']
        filename = ''.join(namelist)
    except AttributeError:
        # handle lack of author info, name file with original name from URL
        namelist = [fulldate, ' ', DLURL[:-14].split('/', 5)[-1]]
        filename = ''.join(namelist)
        print '\nThis document has no author: \n', URL, '\nusing original filename...', filename, '\n'
    #print filename
    # get webpage for direct PDF link
    DLHTML = SCIHUBsesh.get(DLURL)
    # handle HTTP 500 server error
    # try switching between list of server names
    # instead of detecting a unique one, randomly switch until success
    # this is bound to trigger a captcha, as it's without session data
    serverlist = ['https://cyber.sci-hub.tw/', 'https://zero.sci-hub.tw/',
                  'https://dacemirror.sci-hub.tw/', 'https://moscow.sci-hub.tw/']
    oldDLURL = DLURL
    # alternative to while True: with try, except/continue, break
    # simply check if status code is successful (200)
    while DLHTML.status_code != 200:
        currentserver = ''.join([SplitDLURL[0], '//', SplitDLURL[2], '/'])
        # pick a different mirror from the one that just failed
        newserver = random.choice(serverlist)
        while currentserver == newserver:
            newserver = random.choice(serverlist)
        DLURL = ''.join([newserver, SplitDLURL[3], '/', SplitDLURL[4], '/', SplitDLURL[5]])
        DLHTML = SCIHUBsesh.get(DLURL)
    # initial test for captcha element existence
    # if not present, returns NoneType
    DLHTMLparsed = BeautifulSoup(DLHTML.text, features="html5lib")
    captchaElement = DLHTMLparsed.body.find('img', attrs={'id':'captcha'})
    # interactive captcha handling while loop
    # captcha exists when variable has a value
    while captchaElement is not None:
        # handle captcha: get and show image, ask for input,
        # HTTP POST input, set captchaElement to None,
        # check if captcha failed or was successful, reload/continue vs break
        # all in a while captchaElement != False loop
        IMGID = str(captchaElement.attrs['src'][5:-4])
        IMGfilename = str(captchaElement.attrs['src'][5:])
        IMGsrc = str(captchaElement.attrs['src'])
        IMGURLlist = [SplitDLURL[0], '//', SplitDLURL[1], SplitDLURL[2], IMGsrc]
        IMGURL = ''.join(IMGURLlist)
        IMGHTML = SCIHUBsesh.get(IMGURL)
        origIMGHTML = IMGHTML
        # Catch server problems with delivering captcha
        # switch servers until we get a good one
        while IMGHTML.status_code != 200:
            currentserver = ''.join([SplitDLURL[0], '//', SplitDLURL[2], '/'])
            newserver = random.choice(serverlist)
            while currentserver == newserver:
                newserver = random.choice(serverlist)
            newIMGURL = ''.join([newserver, IMGsrc])
            DLURL = ''.join([newserver, SplitDLURL[3], '/', SplitDLURL[4], '/', SplitDLURL[5]])
            DLHTML = SCIHUBsesh.get(DLURL)
            IMGHTML = SCIHUBsesh.get(newIMGURL)
            sleep(5)
        IMGdata = IMGHTML.content
        sleep(2)
        # show the captcha
        # does not work with a 'with __ as __:' statement
        Image.open(BytesIO(IMGdata)).show()
        # ask for input of captcha answer
        CaptchaAns = str(raw_input('type captcha solution and hit Enter: '))
        # attempt to send captcha answer
        # not sure if only the answer is posted, and then reload
        # or if there is a way to trigger the button
        POST = SCIHUBsesh.post(DLURL, data={'id': IMGID, 'answer': CaptchaAns})
        # reload and check if captcha completed
        # if it is passed, captchaElement will be NoneType
        # and then loop will break
        # if not, variable will have value, loop repeats
        DLHTML = SCIHUBsesh.get(DLURL)
        DLHTMLparsed = BeautifulSoup(DLHTML.text, features="html5lib")
        captchaElement = DLHTMLparsed.body.find('img', attrs={'id':'captcha'})
    # end of captcha loops
    # render binary data from HTML grab into a variable
    pdfdata = DLHTML.content
    sleep(5)
    # write binary PDF data to file in binary mode
    with open(filename, 'w+b') as pdf:
        pdf.write(pdfdata)
    print '\ndownloaded pdf ', filename
    # get citation from Wiley instead of sci-hub
    RawCitation = DOIHTMLparsed.head.find('meta', attrs={'name':'article_references'}).attrs['content']
    IssueNum = DOIHTMLparsed.head.find('meta', attrs={'name':'citation_issue'}).attrs['content']
    #RawCitationparts = [RawCitationElement.contents[0], RawCitationElement.contents[1].contents[0]]
    #RawCitation = ''.join(RawCitationparts)
    #RawCitation = RawCitationHTMLparsed.body.find('div', attrs={'class': None}, text=re.compile('.*doi:.*'))
    #RawCitation = RawCitationHTML.text
    #print RawCitation
    # pattern: after ').', first word capitalized, rest uncap until '.'
    # optionally: then, italicized until ','
    # catch situation where citation is not given or invalid size
    # Handle incorrect citation order and format, and missing Issue number
    # solves all format issues in one go with string splitting and joining
    if len(RawCitation) > 10:
        # part1: everything before the first ')' (normally the authors and year)
        part1 = RawCitation.split(')')[0]
        firstword = RawCitation.split(')')[1].split(' ', 2)[1]
        part2 = firstword.capitalize()
        restoftitle = RawCitation.split(')')[1].split(' ', 2)[2].split('.', 1)[0]
        part3 = restoftitle.lower()
        part4 = RawCitation.split(')')[1].split(' ', 2)[-1].split('.', 1)[1]
        # insert Issue Number with parentheses
        part4Split = part4.split(':')
        fixedpart4parts = [part4Split[0], '(', IssueNum, '),', part4Split[1], ':', part4Split[2]]
        fixedpart4 = ''.join(fixedpart4parts)
        # catch situation where, if no author, title goes first
        if len(part1) > 8:
            Citationparts = [part1, '). ', part2, ' ', part3, '.', fixedpart4]
        else:
            Citationparts = [part2, ' ', part3, '.', part1, '). ', fixedpart4]
        # rejoin after joining list to ensure no extra whitespace chars
        fixedCitation = ' '.join(''.join(Citationparts).split())
    else:
        fixedCitation = ''
    #print fixedCitation
    # append one semicolon-delimited row per article to the csv
    with open('DOI.csv', 'a') as csv:
        csv.write(filename.encode('utf8'))
        csv.write(';')
        csv.write(DOIURL.encode('utf8'))
        csv.write(';')
        csv.write(fixedCitation.encode('utf8'))
        csv.write(';')
        csv.write(SCIHUBURL.encode('utf8'))
        csv.write(';')
        csv.write(DLURL.encode('utf8'))
        csv.write(';')
        csv.write(URL.encode('utf8'))
        csv.write('\n')
    print 'links and citation written for ', filename, '\n'
# end of main loop
print 'DONE'
Takes links from "https://onlinelibrary.wiley.com/doi/abs" and passes each DOI through Sci-hub.tw.
Gets the PDF from Sci-hub and the citation from Wiley.
Adaptable to start with DOI links instead.
Prerequisites:
the latest version of Python 2
install the following modules (see the example command below):
pillow, requests, bs4, html5lib
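If you install the modules yourself, one pip command usually covers all four (assuming pip points at your Python 2 installation):
pip install pillow requests bs4 html5lib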
Instructions:
Get the latest Python 2
Install the prerequisite modules (this script might do it for you)
Make a folder where you want everything to end up
Download this script into that folder
Make a file called "links.txt" in that folder
Fill the text file with links, one per line, then save and close (see the example below)
Right-click the script, open it in IDLE, and run it (F5)
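For reference, a minimal links.txt with a single entry (this is the same example Wiley link used inside the script; add one such link per line) would look like:
https://onlinelibrary.wiley.com/doi/abs/10.1901/jaba.2009.42-741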