Sci-hub Helper Script
# -*- coding: utf-8 -*-
# for Python 2
# todo
# better commenting for each step of the process
# complete handling of captchas, a csv database of them
# possibility of getting citation from google scholar instead
# searching using author and title from RawCitation split parts all lower()
# getting HTML with format: https://scholar.google.com/scholar?q=QUERY&hl=en&as_sdt=0,39#d=gs_cit&u=%2Fscholar%3Fq%3Dinfo%3AYTlLL1HHpUwJ%3Ascholar.google.com%2F%26output%3Dcite%26scirp%3D0%26hl%3Den
# detect, get, and solve captcha
# reload if APA citation element is small
# grab the APA citation element: <div tabindex="0" class="gs_citr">
# which is inside the element: <th scope="row" class="gs_cith">APA</th>
# done
# renaming the downloaded file
# getting the citation from google scholar (or fixing the one given)
# have a list of links completed, store in another text file
# put all the printable items into a file, comma delimited
# wrap inside a for loop
# making a list from a file with many links
# make python check for requests, bs4, and html5lib, and install them if not present
# make changes to prevent sci-hub captcha and maintain session cookies
# interface to handle captchas
# logic to detect when captcha appears, then to wait for human to solve
# possibility of generating the APA citation from wiley
# simply taking the pieces from wiley, authors, title, journal, issue, volume, DOI
# HTML for citation example: https://onlinelibrary.wiley.com/action/showCitFormats?doi=10.1901%2Fjaba.2009.42-741
# element for citation on screen:
# OR
# element for button to download txt file: <input onclick="onCitMgrSubmit()" class="btn" type="submit" name="submit" value="Download">
# from meta tags instead of divs (see the sketch after the imports below)
from time import sleep
from io import BytesIO
import random
# note: pip.main() was removed in pip 10, so on a newer pip these
# modules may need to be installed manually instead
try:
    from PIL import Image
except ImportError:
    print 'installing Pillow module'
    import pip
    pip.main(['install', 'pillow'])
    from PIL import Image
try:
    import requests
except ImportError:
    print 'installing requests module'
    import pip
    pip.main(['install', 'requests'])
    import requests
# only bs4 is tried here: the old BeautifulSoup (bs3) module does not
# accept the features argument used below
try:
    from bs4 import BeautifulSoup
except ImportError:
    print 'installing bs4 and html5lib'
    import pip
    pip.main(['install', 'bs4'])
    pip.main(['install', 'html5lib'])
    from bs4 import BeautifulSoup
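# a minimal, untested sketch of the "from meta tags instead of divs" idea
# from the todo list above; apart from citation_issue (used further down),
# the citation_* tag names here are assumptions, not confirmed against Wiley
def metaCitationPieces(parsedhead):
    # collect whichever citation_* meta tags the page head exposes
    pieces = {}
    for name in ['citation_title', 'citation_journal_title',
                 'citation_volume', 'citation_issue', 'citation_doi']:
        tag = parsedhead.find('meta', attrs={'name': name})
        if tag is not None:
            pieces[name] = tag.attrs['content']
    return pieces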
# initial toggle for captcha, assume we start with no captcha
captchaElement = None
# get links from file
try:
    URLlist = open('links.txt').readlines()
# catch lack of file and give instructions
except IOError:
    print('You need to make a file called "links.txt" in the same folder as the script\n\
and fill it with links from "https://onlinelibrary.wiley.com/doi/abs"\n\
one for each line (aka separated by "\\n")')
    exit(1)
#print URLlist
# maintain all session data from sci-hub for all requests to sci-hub
SCIHUBsesh = requests.Session()
# for loop to do this for all URL in URLlist
for URL in URLlist:
    # catch a blank URL at the end of the list,
    # which would otherwise cause an error when actually done
    if len(URL) < 6:
        print 'DONE!'
        exit(0)
    # given links from wiley, line delimited, sliced to remove '\n'
    # alternatively, use links from the DOI.csv and skip parts of script
    # for redoing certain sections
    URL = URL[:-1]
    # override list with example for testing
    #URL = 'https://onlinelibrary.wiley.com/doi/abs/10.1901/jaba.2009.42-741'
    # split URL into list of parts by /
    SplitURL = URL.split('/')
    DOIHTML = requests.get(URL).text
    #print DOIHTML
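    # parse the Wiley page and pull the DOI link (the a.epub-doi element) from it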
    DOIHTMLparsed = BeautifulSoup(DOIHTML, features="html5lib")
    DOIURL = DOIHTMLparsed.body.find('a', attrs={'class':'epub-doi'}).text
    #print DOIURL
    URLparts = ['https://sci-hub.tw/', DOIURL]
    SCIHUBURL = ''.join(URLparts)
    #print SCIHUBURL
    SCIHUBHTML = SCIHUBsesh.get(SCIHUBURL).text
    SCIHUBHTMLparsed = BeautifulSoup(SCIHUBHTML, features="html5lib")
    # the download button is the anchor that carries both href and onclick
    DLelement = SCIHUBHTMLparsed.body.find('a', href=True, onclick=True)
    #print DLelement
    # onclick is assumed to hold location.href='...': slice off that
    # 15-char prefix and the trailing quote to leave the bare PDF URL
    DLURL = str(DLelement.attrs['onclick'])[15:-1]
    #print DLURL
    # split up the URL into parts by /
    SplitDLURL = DLURL.split('/')
    # publication date text from Wiley; the last piece (the year) names the file
    fulldate = DOIHTMLparsed.body.find('span', attrs={'class':'epub-date'}).text
    namedate = fulldate.split(' ', 2)[-1]
    try:
        author = DOIHTMLparsed.body.find('a', attrs={'data-id':'a1'}).text
        # strip any footnote digits from the author name
        nameauthor = ''.join(i for i in author if not i.isdigit())
        # handle (remove) first name and middle initial
        # else: handle first name
        authorlast = nameauthor.split(' ')[-1]
        namelist = [authorlast, ' ', '(', namedate, ')', '.pdf']
        filename = ''.join(namelist)
    except AttributeError:
        # handle lack of author info: name the file with its original name from the URL
        namelist = [fulldate, ' ', DLURL[:-14].split('/', 5)[-1]]
        filename = ''.join(namelist)
        print '\nThis document has no author: \n', URL, '\nusing original filename...', filename, '\n'
    #print filename
    # get webpage for direct PDF link
    DLHTML = SCIHUBsesh.get(DLURL)
    # handle HTTP 500 server errors:
    # try switching between a list of server names;
    # instead of detecting a unique one, randomly switch until success
    # (this is bound to trigger a captcha, as it's without session data)
    serverlist = ['https://cyber.sci-hub.tw/', 'https://zero.sci-hub.tw/',
                  'https://dacemirror.sci-hub.tw/', 'https://moscow.sci-hub.tw/']
    oldDLURL = DLURL
    # alternative to while True: with try, except/continue, break;
    # simply check whether the status code is successful (200)
    while DLHTML.status_code != 200:
        currentserver = ''.join([SplitDLURL[0], '//', SplitDLURL[2], '/'])
        newserver = random.choice(serverlist)
        while currentserver == newserver:
            newserver = random.choice(serverlist)
        DLURL = ''.join([newserver, SplitDLURL[3], '/', SplitDLURL[4], '/', SplitDLURL[5]])
        # keep the split parts in sync so the captcha code below uses the new server
        SplitDLURL = DLURL.split('/')
        DLHTML = SCIHUBsesh.get(DLURL)
    # initial test for captcha element existence;
    # if not present, find() returns None
    DLHTMLparsed = BeautifulSoup(DLHTML.text, features="html5lib")
    captchaElement = DLHTMLparsed.body.find('img', attrs={'id':'captcha'})
    # interactive captcha handling while loop;
    # a captcha exists when the variable has a value
    while captchaElement is not None:
        # handle captcha: get and show the image, ask for input,
        # HTTP POST the input, then reload and re-check;
        # if the captcha failed, the loop repeats, otherwise it breaks
        # the src is assumed to look like /img/<id>.jpg, so [5:-4] keeps just the id
        IMGID = str(captchaElement.attrs['src'][5:-4])
        IMGfilename = str(captchaElement.attrs['src'][5:])
        IMGsrc = str(captchaElement.attrs['src'])
        IMGURLlist = [SplitDLURL[0], '//', SplitDLURL[1], SplitDLURL[2], IMGsrc]
        IMGURL = ''.join(IMGURLlist)
        IMGHTML = SCIHUBsesh.get(IMGURL)
        origIMGHTML = IMGHTML
        # catch server problems with delivering the captcha:
        # switch servers until we get a good one
        while IMGHTML.status_code != 200:
            currentserver = ''.join([SplitDLURL[0], '//', SplitDLURL[2], '/'])
            newserver = random.choice(serverlist)
            while currentserver == newserver:
                newserver = random.choice(serverlist)
            newIMGURL = ''.join([newserver, IMGsrc])
            DLURL = ''.join([newserver, SplitDLURL[3], '/', SplitDLURL[4], '/', SplitDLURL[5]])
            SplitDLURL = DLURL.split('/')
            DLHTML = SCIHUBsesh.get(DLURL)
            IMGHTML = SCIHUBsesh.get(newIMGURL)
            sleep(5)
        IMGdata = IMGHTML.content
        sleep(2)
        # show the captcha
        # (does not work with a 'with __ as __:' statement)
        Image.open(BytesIO(IMGdata)).show()
        # ask for input of the captcha answer
        CaptchaAns = str(raw_input('type captcha solution and hit Enter: '))
        # attempt to send the captcha answer;
        # not sure if only the answer is posted and then reloaded,
        # or if there is a way to trigger the button
        POST = SCIHUBsesh.post(DLURL, data={'id': IMGID, 'answer': CaptchaAns})
        # reload and check whether the captcha is completed:
        # if it passed, captchaElement will be None and the loop will break;
        # if not, the variable will have a value and the loop repeats
        DLHTML = SCIHUBsesh.get(DLURL)
        DLHTMLparsed = BeautifulSoup(DLHTML.text, features="html5lib")
        captchaElement = DLHTMLparsed.body.find('img', attrs={'id':'captcha'})
    # end of captcha loops
    # render binary data from the HTML grab into a variable
    pdfdata = DLHTML.content
    sleep(5)
    # write binary PDF data to file in binary mode
    # (the with statement closes the file on its own)
    with open(filename, 'w+b') as pdf:
        pdf.write(pdfdata)
    print '\ndownloaded pdf ', filename
    # get the citation from Wiley instead of sci-hub
    RawCitation = DOIHTMLparsed.head.find('meta', attrs={'name':'article_references'}).attrs['content']
    IssueNum = DOIHTMLparsed.head.find('meta', attrs={'name':'citation_issue'}).attrs['content']
    #RawCitationparts = [RawCitationElement.contents[0], RawCitationElement.contents[1].contents[0]]
    #RawCitation = ''.join(RawCitationparts)
    #RawCitation = RawCitationHTMLparsed.body.find('div', attrs={'class': None}, text=re.compile('.*doi:.*'))
    #RawCitation = RawCitationHTML.text
    #print RawCitation
    # pattern: after ').', the first word is capitalized, the rest uncapitalized until '.';
    # optionally: then italicized until ','
    # catch the situation where the citation is not given or has an invalid size;
    # handle incorrect citation order and format, and a missing issue number:
    # solve all format issues in one go with string splitting and joining
    if len(RawCitation) > 10:
        part1 = RawCitation.split(')')[0]
        firstword = RawCitation.split(')')[1].split(' ', 2)[1]
        part2 = firstword.capitalize()
        restoftitle = RawCitation.split(')')[1].split(' ', 2)[2].split('.', 1)[0]
        part3 = restoftitle.lower()
        part4 = RawCitation.split(')')[1].split(' ', 2)[-1].split('.', 1)[1]
        # insert the issue number with parentheses
        part4Split = part4.split(':')
        fixedpart4parts = [part4Split[0], '(', IssueNum, '),', part4Split[1], ':', part4Split[2]]
        fixedpart4 = ''.join(fixedpart4parts)
        # catch the situation where, if there is no author, the title goes first
        if len(part1) > 8:
            Citationparts = [part1, '). ', part2, ' ', part3, '.', fixedpart4]
        else:
            Citationparts = [part2, ' ', part3, '.', part1, '). ', fixedpart4]
        # split and rejoin after joining the list to ensure no extra whitespace chars
        fixedCitation = ' '.join(''.join(Citationparts).split())
    else:
        fixedCitation = ''
    #print fixedCitation
    # append one semicolon-delimited record per link
    with open('DOI.csv', 'a') as csv:
        csv.write(filename.encode('utf8'))
        csv.write(';')
        csv.write(DOIURL.encode('utf8'))
        csv.write(';')
        csv.write(fixedCitation.encode('utf8'))
        csv.write(';')
        csv.write(SCIHUBURL.encode('utf8'))
        csv.write(';')
        csv.write(DLURL.encode('utf8'))
        csv.write(';')
        csv.write(URL.encode('utf8'))
        csv.write('\n')
    print 'links and citation written for ', filename, '\n'
# end of main loop
print 'DONE'
mcprat commented May 18, 2019

Takes links from "https://onlinelibrary.wiley.com/doi/abs" and passes each DOI through Sci-hub.tw.
Gets the PDF from Sci-hub and the citation from Wiley.
Adaptable to start with DOI links instead.

Prerequisites:
the latest version of Python 2
the following modules:
pillow, requests, bs4, html5lib

Instructions:
Get the latest Python 2.
Install the prerequisite modules (this script might do it for you).

Make a folder where you want everything to end up.
Download this script into that folder.
Make a file "links.txt" in that folder.
Fill the text file with links, one per line (see the example below), then save and close.
Right-click the script, open it in IDLE, and run (F5).