Download books from Springer (rd.springer.com) by passing the script only the URL of a search result or of a book page
#!/usr/bin/python
""" @copyleft Eliezer Silva ([email protected])
This script was developed mostly for recreational and educational purposes, so use it at your own risk, as is.
With this script you can crawl Springer book search results and download all the books listed on the page. This version also
checks whether the result set spans multiple pages and navigates through them.
There are two ways of using it:
1) Link to a single book URL on a Springer page:
python download-books-from-rss-springer.py 'http://rd.springer.com/book/10.1007/978-3-662-07003-1'
2) Link to a search result with many books:
python download-books-from-rss-springer.py -s 'http://rd.springer.com/search?facet-series=%223214%22&facet-content-type=%22Book%22&showAll=false'
"""
import urllib
from HTMLParser import HTMLParser
from urlparse import urlparse
import urllib2
import sys
def downloader(url, filename=None):
    """Download url to filename (defaults to the last URL path segment),
    printing progress in roughly 30 steps."""
    file_name = filename
    if file_name is None:
        file_name = url.split('/')[-1]
    u = urllib2.urlopen(url)
    f = open(file_name, 'wb')
    meta = u.info()
    file_size = int(meta.getheaders("Content-Length")[0])
    print "Downloading: %s Bytes to %s" % (file_size, file_name)
    file_size_dl = 0
    block_sz = 8192
    parcels = file_size / 30  # emit a progress mark about every 1/30th of the file
    disp = False
    c = 1
    while True:
        buffer = u.read(block_sz)
        if not buffer:
            break
        file_size_dl += len(buffer)
        f.write(buffer)
        if file_size_dl > c * parcels:
            c += 1
            disp = True
        if disp:
            print "_" + r"%3.2f%%" % (file_size_dl * 100. / file_size),
            disp = False
    print ""
    f.close()
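# A quick sketch of calling downloader() on its own (hypothetical URL, shown
# purely for illustration):
#
#   downloader('http://rd.springer.com/content/pdf/some-book.pdf', 'some-book.pdf')
#
# If filename is omitted, the last segment of the URL path is used, so the
# call above could also be written without the second argument.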
class myhtmlparser(HTMLParser):
    """Parses a single Springer book page, extracting the PDF link, the
    copyright year, and the author/editor last names."""
    def __init__(self, link):
        self.reset()  # HTMLParser.__init__ only calls reset(), so this suffices
        self.inLink = False
        self.domain = self.get_domain(link)
        self.pdflink = ""
        self.bookname = None
        self.is_searching = True
        self.inyear = False
        self.year = ""
        self.author_list = []
        self.inauthorlist = False
        self.inauth = False

    def handle_starttag(self, tag, attrs):
        if tag == 'span' and ('class', 'copyright-year') in attrs:
            self.inyear = True
        if tag == 'a' and self.find_link(attrs) and self.is_searching:
            self.inLink = True
        if tag == 'div' and (('class', 'author-list') in attrs or ('class', 'editor-list') in attrs):
            self.inauthorlist = True
        if tag == 'a' and ('itemprop', 'name') in attrs and self.inauthorlist:
            self.inauth = True

    def handle_endtag(self, tag):
        if tag == 'div' and self.inauthorlist:
            self.inauthorlist = False

    def handle_data(self, data):
        if self.inLink:
            # data is the link text of the PDF anchor; the output filename
            # itself is assembled later, in doDownload()
            print data
            print "pdf link " + self.pdflink
            self.is_searching = False
            self.inLink = False
        if self.inyear:
            self.year = data.replace("\n", "").replace(" ", "")
            self.inyear = False
        if self.inauth:
            # keep only the author's last name; split() handles any whitespace
            self.author_list.append(data.replace("\n", "").split()[-1])
            self.inauth = False
    def doDownload(self):
        if self.pdflink == "" or self.bookname is None:
            print "no pdf link found on this page"
            return
        year = ""
        if self.year != "":
            year = "[" + self.year + "]"
        authn = ",".join(self.author_list)
        if authn != "":
            authn = "[" + authn + "]"
        downloader(self.pdflink, year + authn + self.bookname + ".pdf")

    def find_link(self, attrs):
        if not attrs:
            return False
        attrnames = zip(*attrs)[0]
        ret = False
        bookn = ""
        if 'doi' in attrnames and ('contenttype', 'Book') in attrs and 'href' in attrnames:
            for pair_val in attrs:
                if pair_val[0] == 'href' and pair_val[1].endswith('pdf'):
                    self.pdflink = self.domain + pair_val[1]
                    ret = True
                if pair_val[0] == 'publication':
                    # the publication attribute appears to be "|"-separated;
                    # build "<second part>.<first part>" with '/' replaced by
                    # '.' so the result is usable as a filename
                    bookn = pair_val[1].replace('/', '.').split("|")[1] + "." + pair_val[1].replace('/', '.').split("|")[0]
            if ret:
                self.bookname = bookn
        return ret

    def get_domain(self, url):
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}'.format(uri=parsed_uri)
        return domain
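# For example (illustrative values), get_domain turns a full book URL into
# just the scheme and host, which is later prepended to relative hrefs:
#
#   get_domain('http://rd.springer.com/book/10.1007/978-3-662-07003-1')
#   # -> 'http://rd.springer.com'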
def openurlitem(item):
    # fetch the page and strip newlines so handle_data sees contiguous text
    fp = urllib.urlopen(item)
    data = fp.read().replace('\n', '')
    return data

def process_item(url_link):
    # parse a single book page and download its PDF
    parser = myhtmlparser(url_link)
    parser.feed(openurlitem(url_link))
    parser.doDownload()
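# process_item can also be driven directly in code, using the same book URL
# the docstring example passes on the command line:
#
#   process_item('http://rd.springer.com/book/10.1007/978-3-662-07003-1')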
class mylisthtmlparser(myhtmlparser):
    """Parses a Springer search-results page: downloads every book in the
    results list and records the link to the next results page, if any."""
    def __init__(self, link):
        self.reset()
        self.inLink = False
        self.inResultList = False
        self.domain = self.get_domain(link)
        self.nextlink = ""
        self.pagination = False

    def handle_starttag(self, tag, attrs):
        if self.inResultList:
            if tag == 'a' and ('class', 'title') in attrs:
                self.inLink = True
                self.pdflink = self.getHref(attrs)  # here pdflink holds the book page URL
        if tag == 'ol' and ('id', 'results-list') in attrs:
            self.inResultList = True
        if tag == 'form' and ('class', 'pagination') in attrs:
            self.pagination = True
        if self.pagination and tag == 'a' and ('class', 'next') in attrs and ('title', 'next') in attrs:
            self.nextlink = self.getHref(attrs)
            print "next link = " + self.nextlink

    def handle_endtag(self, tag):
        if tag == 'ol' and self.inResultList:
            self.inResultList = False
        if tag == 'form' and self.pagination:
            self.pagination = False

    def handle_data(self, data):
        if self.inLink:
            print "Opening " + data
            print "url link " + self.pdflink
            try:
                process_item(self.pdflink)
            except Exception as e:
                print "error : " + str(e)
            self.inLink = False

    def hasNext(self):
        return self.nextlink.startswith("http")

    def getHref(self, attrs):
        for pair_val in attrs:
            if pair_val[0] == 'href':
                return self.domain + pair_val[1]
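# getHref resolves a relative href against the domain captured at
# construction time. A sketch with hypothetical attrs:
#
#   attrs = [('class', 'next'), ('title', 'next'), ('href', '/search?page=2')]
#   # with domain 'http://rd.springer.com', getHref(attrs) returns
#   # 'http://rd.springer.com/search?page=2'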
def process_page(url):
    # process the first results page, then follow 'next' links until the
    # last page is reached
    parser = mylisthtmlparser(url)
    parser.feed(openurlitem(url))
    while parser.hasNext():
        url = parser.nextlink
        parser = mylisthtmlparser(url)
        parser.feed(openurlitem(url))
if len(sys.argv) >= 2:
    args = sys.argv[1:]
    if len(args) == 2:
        if args[0] == "-s":
            try:
                process_page(args[1])
            except Exception as e:
                print "processing error (%s). check your url" % e
                print "format: 'python download-books-from-rss-springer.py [-s] url'"
        else:
            print "format: 'python download-books-from-rss-springer.py [-s] url'"
    elif len(args) == 1:
        try:
            process_item(args[0])
        except Exception as e:
            print "processing error (%s). check your url" % e
            print "format: 'python download-books-from-rss-springer.py [-s] url'"
    else:
        print "format: 'python download-books-from-rss-springer.py [-s] url'"
        print "if argument -s is passed, the script will assume the link is a search results page with a list of books"
        print "otherwise, it will assume the link is the url of a single book"
else:
    print "python download-books-from-rss-springer.py [-s] url"
# process_page("http://rd.springer.com/search?facet-series=%223214%22&facet-content-type=%22Book%22&showAll=false")