Sci-hub Helper Script
# -*- coding: utf-8 -*-
# for Python 2
# todo
# better commenting for each step of the process
# complete handling of captchas, a csv database of them
# possibility of getting citation from google scholar instead
# searching using author and title from RawCitation split parts all lower()
# getting HTML with format: https://scholar.google.com/scholar?q=QUERY&hl=en&as_sdt=0,39#d=gs_cit&u=%2Fscholar%3Fq%3Dinfo%3AYTlLL1HHpUwJ%3Ascholar.google.com%2F%26output%3Dcite%26scirp%3D0%26hl%3Den
# detect, get, and solve captcha
# reload if APA citation element is small
# grab the APA citation element: <div tabindex="0" class="gs_citr">
# which is inside the element: <th scope="row" class="gs_cith">APA</th>
# done
# renaming the downloaded file
# getting the citation from google scholar (or fixing the one given)
# have a list of links completed, store in another text file
# put all the printable items into a file, comma delimited
# wrap inside a for loop
# making a list from a file with many links
# make python check for requests, bs4, and html5lib, and install them if not present
# make changes to prevent sci-hub captcha and maintain session cookies
# interface to handle captchas
# logic to detect when captcha appears, then to wait for human to solve
# possibility of generating the APA citation from wiley
# simply taking the pieces from wiley: authors, title, journal, issue, volume, DOI
# HTML for citation example: https://onlinelibrary.wiley.com/action/showCitFormats?doi=10.1901%2Fjaba.2009.42-741
# element for citation on screen:
# OR
# element for button to download txt file: <input onclick="onCitMgrSubmit()" class="btn" type="submit" name="submit" value="Download">
# from meta tags instead of divs
from time import sleep
from io import BytesIO
import random
try:
    from PIL import Image
except ImportError:
    print 'installing Pillow module'
    import pip
    # note: newer pip releases (10+) no longer expose pip.main(); install the modules manually if this fails
    pip.main(['install', 'pillow'])
    from PIL import Image
try:
    import requests
except ImportError:
    print 'installing requests module'
    import pip
    pip.main(['install', 'requests'])
    import requests
# bs4 specifically is required: the old BeautifulSoup 3 module does not accept
# the features="html5lib" keyword used throughout this script
try:
    from bs4 import BeautifulSoup
except ImportError:
    print 'installing bs4 and html5lib'
    import pip
    pip.main(['install', 'bs4'])
    pip.main(['install', 'html5lib'])
    from bs4 import BeautifulSoup
# initial toggle for captcha, assume we start with no captcha
captchaElement = None
# get links from file
try:
    URLlist = open('links.txt').readlines()
# catch lack of file and give instructions
except IOError:
    print('You need to make a file called "links.txt" in the same folder as the script\n\
and fill it with links from "https://onlinelibrary.wiley.com/doi/abs"\n\
one for each line (aka separated by "\\n")')
    exit(1)
#print URLlist
# maintain all session data from sci-hub for all requests to sci-hub
SCIHUBsesh = requests.Session()
# for loop to do this for all URL in URLlist
for URL in URLlist:
    # catch blank URL
    # which might cause it to error when actually done
    if len(URL) < 6:
        print 'DONE!'
        exit(0)
    # given links from wiley, line delimited; strip the trailing newline
    # alternatively, use links from the DOI.csv and skip parts of script
    # for redoing certain sections
    URL = URL.strip()
    # override list with example for testing
    #URL = 'https://onlinelibrary.wiley.com/doi/abs/10.1901/jaba.2009.42-741'
    # split URL into list of parts by /
    SplitURL = URL.split('/')
    DOIHTML = requests.get(URL).text
    #print DOIHTML
    DOIHTMLparsed = BeautifulSoup(DOIHTML, features="html5lib")
    # text of the DOI link shown on the Wiley page
    DOIURL = DOIHTMLparsed.body.find('a', attrs={'class':'epub-doi'}).text
    #print DOIURL
    URLparts = ['https://sci-hub.tw/', DOIURL]
    SCIHUBURL = ''.join(URLparts)
    #print SCIHUBURL
    SCIHUBHTML = SCIHUBsesh.get(SCIHUBURL).text
    SCIHUBHTMLparsed = BeautifulSoup(SCIHUBHTML, features="html5lib")
    # the sci-hub download button is an anchor with both href and onclick attributes
    DLelement = SCIHUBHTMLparsed.body.find('a', href=True, onclick=True)
    #print DLelement
    # the onclick value wraps the PDF URL in location.href='...';
    # slice off the 15-character prefix and the trailing quote
    DLURL = str(DLelement.attrs['onclick'])[15:-1]
    #print DLURL
    # split up the URL into parts by /
    SplitDLURL = DLURL.split('/')
    fulldate = DOIHTMLparsed.body.find('span', attrs={'class':'epub-date'}).text
    namedate = fulldate.split(' ', 2)[-1]
    try:
        author = DOIHTMLparsed.body.find('a', attrs={'data-id':'a1'}).text
        # strip any digits from the author name
        nameauthor = ''.join(i for i in author if not i.isdigit())
        # handle (remove) first name and middle initial
        # else: handle first name
        authorlast = nameauthor.split(' ')[-1]
        namelist = [authorlast, ' ', '(', namedate, ')', '.pdf']
        filename = ''.join(namelist)
    except AttributeError:
        # handle lack of author info, name file with original name from URL
        namelist = [fulldate, ' ', DLURL[:-14].split('/', 5)[-1]]
        filename = ''.join(namelist)
        print '\nThis document has no author: \n', URL, '\nusing original filename...', filename, '\n'
    #print filename
    # get webpage for direct PDF link
    DLHTML = SCIHUBsesh.get(DLURL)
    # handle HTTP 500 server error
    # try switching between list of server names
    # instead of detecting a unique one, randomly switch until success
    # this is bound to trigger a captcha, as it's without session data
    serverlist = ['https://cyber.sci-hub.tw/', 'https://zero.sci-hub.tw/',
                  'https://dacemirror.sci-hub.tw/', 'https://moscow.sci-hub.tw/']
    oldDLURL = DLURL
    # alternative to while True: with try, except/continue, break
    # simply check if status code is successful (200)
    while DLHTML.status_code != 200:
        currentserver = ''.join([SplitDLURL[0], '//', SplitDLURL[2], '/'])
        # pick a different mirror from the one that just failed
        newserver = random.choice(serverlist)
        while currentserver == newserver:
            newserver = random.choice(serverlist)
        DLURL = ''.join([newserver, SplitDLURL[3], '/', SplitDLURL[4], '/', SplitDLURL[5]])
        DLHTML = SCIHUBsesh.get(DLURL)
    # initial test for captcha element existence
    # if not present, returns NoneType
    DLHTMLparsed = BeautifulSoup(DLHTML.text, features="html5lib")
    captchaElement = DLHTMLparsed.body.find('img', attrs={'id':'captcha'})
    # interactive captcha handling while loop
    # captcha exists when variable has a value
    while captchaElement is not None:
        # handle captcha: get and show image, ask for input,
        # HTTP POST input, set captchaElement to None,
        # check if captcha failed or was successful, reload/continue vs break
        # all in a while captchaElement != False loop
        IMGID = str(captchaElement.attrs['src'][5:-4])
        IMGfilename = str(captchaElement.attrs['src'][5:])
        IMGsrc = str(captchaElement.attrs['src'])
        IMGURLlist = [SplitDLURL[0], '//', SplitDLURL[1], SplitDLURL[2], IMGsrc]
        IMGURL = ''.join(IMGURLlist)
        IMGHTML = SCIHUBsesh.get(IMGURL)
        origIMGHTML = IMGHTML
        # Catch server problems with delivering captcha
        # switch servers until we get a good one
        while IMGHTML.status_code != 200:
            currentserver = ''.join([SplitDLURL[0], '//', SplitDLURL[2], '/'])
            newserver = random.choice(serverlist)
            while currentserver == newserver:
                newserver = random.choice(serverlist)
            newIMGURL = ''.join([newserver, IMGsrc])
            DLURL = ''.join([newserver, SplitDLURL[3], '/', SplitDLURL[4], '/', SplitDLURL[5]])
            DLHTML = SCIHUBsesh.get(DLURL)
            IMGHTML = SCIHUBsesh.get(newIMGURL)
            sleep(5)
        IMGdata = IMGHTML.content
        sleep(2)
        # show the captcha
        # does not work with a 'with __ as __:' statement
        Image.open(BytesIO(IMGdata)).show()
        # ask for input of captcha answer
        CaptchaAns = str(raw_input('type captcha solution and hit Enter: '))
        # attempt to send captcha answer
        # not sure if only the answer is posted, and then reload
        # or if there is a way to trigger the button
        POST = SCIHUBsesh.post(DLURL, data={'id': IMGID, 'answer': CaptchaAns})
        # reload and check if captcha completed
        # if it is passed, captchaElement will be NoneType
        # and then loop will break
        # if not, variable will have value, loop repeats
        DLHTML = SCIHUBsesh.get(DLURL)
        DLHTMLparsed = BeautifulSoup(DLHTML.text, features="html5lib")
        captchaElement = DLHTMLparsed.body.find('img', attrs={'id':'captcha'})
    # end of captcha loops
    # render binary data from HTML grab into a variable
    pdfdata = DLHTML.content
    sleep(5)
    # write binary PDF data to file in binary mode
    with open(filename, 'w+b') as pdf:
        pdf.write(pdfdata)
    print '\ndownloaded pdf ', filename
    # get citation from Wiley instead of sci-hub
    RawCitation = DOIHTMLparsed.head.find('meta', attrs={'name':'article_references'}).attrs['content']
    IssueNum = DOIHTMLparsed.head.find('meta', attrs={'name':'citation_issue'}).attrs['content']
    #RawCitationparts = [RawCitationElement.contents[0], RawCitationElement.contents[1].contents[0]]
    #RawCitation = ''.join(RawCitationparts)
    #RawCitation = RawCitationHTMLparsed.body.find('div', attrs={'class': None}, text=re.compile('.*doi:.*'))
    #RawCitation = RawCitationHTML.text
    #print RawCitation
    # pattern: after ').', first word capitalized, rest uncap until '.'
    # optionally: then, italicized until ','
    # catch situation where citation is not given or invalid size
    # Handle incorrect citation order and format, and missing Issue number
    # solves all format issues in one go with string splitting and joining
    if len(RawCitation) > 10:
        # part1: everything before the first ')' (normally the authors and year)
        part1 = RawCitation.split(')')[0]
        firstword = RawCitation.split(')')[1].split(' ', 2)[1]
        part2 = firstword.capitalize()
        restoftitle = RawCitation.split(')')[1].split(' ', 2)[2].split('.', 1)[0]
        part3 = restoftitle.lower()
        part4 = RawCitation.split(')')[1].split(' ', 2)[-1].split('.', 1)[1]
        # insert Issue Number with parentheses
        part4Split = part4.split(':')
        fixedpart4parts = [part4Split[0], '(', IssueNum, '),', part4Split[1], ':', part4Split[2]]
        fixedpart4 = ''.join(fixedpart4parts)
        # catch situation where, if no author, title goes first
        if len(part1) > 8:
            Citationparts = [part1, '). ', part2, ' ', part3, '.', fixedpart4]
        else:
            Citationparts = [part2, ' ', part3, '.', part1, '). ', fixedpart4]
        # rejoin after joining list to ensure no extra whitespace chars
        fixedCitation = ' '.join(''.join(Citationparts).split())
    else:
        fixedCitation = ''
    #print fixedCitation
    # append one semicolon-delimited row per article to the csv
    with open('DOI.csv', 'a') as csv:
        csv.write(filename.encode('utf8'))
        csv.write(';')
        csv.write(DOIURL.encode('utf8'))
        csv.write(';')
        csv.write(fixedCitation.encode('utf8'))
        csv.write(';')
        csv.write(SCIHUBURL.encode('utf8'))
        csv.write(';')
        csv.write(DLURL.encode('utf8'))
        csv.write(';')
        csv.write(URL.encode('utf8'))
        csv.write('\n')
    print 'links and citation written for ', filename, '\n'
# end of main loop
print 'DONE'
Takes links from "https://onlinelibrary.wiley.com/doi/abs" and passes each DOI through Sci-hub.tw.
Gets the PDF from Sci-hub and the citation from Wiley.
Adaptable to start with DOI links instead.
Prerequisites:
the latest version of Python 2
install the following modules (see the example command below):
pillow, requests, bs4, html5lib
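If you install the modules yourself, one pip command usually covers all four (assuming pip points at your Python 2 installation):
pip install pillow requests bs4 html5lib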
Instructions:
Get the latest Python 2
Install the prerequisite modules (this script might do it for you)
Make a folder where you want everything to end up
Download this script into that folder
Make a file called "links.txt" in that folder
Fill the text file with links, one per line, then save and close (see the example below)
Right-click the script, open it in IDLE, and run it (F5)
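For reference, a minimal links.txt with a single entry (this is the same example Wiley link used inside the script; add one such link per line) would look like:
https://onlinelibrary.wiley.com/doi/abs/10.1901/jaba.2009.42-741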