s-celles · February 16, 2018 19:27
diff --git a/omercy.py b/omercy.py
 '''
 O'Meirrcy !!!! Download free ebooks from O'Reilly.

 Usage:
 >pip install requests
 >pip install bs4
 >mkdir omercy
 >cd omercy
 >curl  ... omercy.py
 >python omercy.py

 ... Enjoy :)
 '''

 '''
   DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 
                    Version 2, December 2004 

 Copyright (C) 2004 Sam Hocevar <[email protected]> 

 Everyone is permitted to copy and distribute verbatim or modified 
 copies of this license document, and changing it is allowed as long 
 as the name is changed. 

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE 
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 

  0. You just DO WHAT THE FUCK YOU WANT TO.
 '''

 '''
 # Todo
 * ignore unreachable files
 * update the book count for books listed under more than one topic
 * update the json description with multiple topics
 * update the book json after a 404
 '''

 import os
 import sys
 import shutil
 import json
 import mimetypes
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse

 # Landing page that lists every free-report topic.
 source="http://www.oreilly.com/free/reports.html"


 print("-"*50)
 print("Grabbing from", source)
 print("-"*50)
 print()

 # Nothing else can run without the landing page, so any network or HTTP
 # failure ends the script with exit status 1.  Only requests' own errors are
 # caught so that Ctrl-C and programming errors are not silently swallowed.
 try:
     req = requests.get(source)
     req.raise_for_status()
 except requests.exceptions.RequestException:
     print("Unable to open source url:", source)
     sys.exit(1)

 home_s = BeautifulSoup(req.content, "html.parser")

 # Get description: map each <section> id to the href of its "see more" button.
 topics = {}
 topics_s = home_s.find_all("section")
 for topic in topics_s:
     topic_id = topic.attrs.get("id")
     more_link = topic.find(class_="btn see-more")
     # A section without an id or without a "see more" link cannot be turned
     # into a topic entry; skip it instead of crashing on it.
     if topic_id is None or more_link is None or "href" not in more_link.attrs:
         continue
     topics[topic_id] = more_link.attrs["href"]
 # Extra "data" section
 topics["data1"] = "http://www.oreilly.com/data/free/archive.html"
 print("Topics:", ", ".join(topics.keys()))


 count_books=0
 # Nothing in this section fills this list (see the "ignore unreachable files" todo).
 unreachables=[]

 # For every topic page, write one <book>/<book>.json description per listed book.
 for name, url in topics.items():
     print()
     print("-"*50)
     print("Parsing topic:", name)
     print()
     try:
         req = requests.get(url)
         req.raise_for_status()
     except requests.exceptions.RequestException:
         print("Unable to open topic page:", url)
         continue

     topics_s = BeautifulSoup(req.content, "html.parser")

     # Every element carrying data-toggle="popover" is treated as one book entry.
     for book_s in topics_s.find_all(attrs={"data-toggle": "popover"}):
         b_url = book_s.attrs["href"]
         print()
         print("Parsing book:", b_url)

         # Book name = last path component of the link, without ".csp".
         filename = urlparse(b_url).path.replace(".csp", "").split('/')[-1]
         if not filename:
             # A path ending in "/" gives an empty name, and "/.json" is not a
             # usable place for the description file.
             print("Unable to derive a book name from:", b_url)
             continue
         filepath = filename+"/"+filename+".json"
         print("Checking book information:", filepath)

         # Check if the description already exists
         if os.path.isfile(filepath):
             print("Book information is already present...")
             count_books = count_books+1
             print ("-->", count_books, "Book description(s)")
             continue

         # Get book description
         print("No existing data => retrieving in file:", filepath)
         book = {}
         book["topic"] = name if name != "data1" else "data"
         book["title"] = book_s.attrs["title"]
         book["description"] = book_s.attrs["data-content"]
         # Links without a scheme get "http:" prepended; links that already
         # carry http: or https: are kept untouched (https used to be mangled
         # into "http:https://...").
         if not b_url.startswith(("http:", "https:")):
             b_url = "http:"+b_url
         book["source"] = b_url
         c_url = book_s.find("img").attrs["src"]
         if not c_url.startswith(("http:", "https:")):
             c_url = "http:"+c_url
         book["cover"] = c_url

         # Get extra information (author, author bio, isbn, ...)
         try:
             req = requests.get(b_url)
             req.raise_for_status()
         except requests.exceptions.RequestException:
             print("Unable to open book extra informations page:", b_url)
             continue

         ext_book_s = BeautifulSoup(req.content, "html.parser")
         if not req.history:  # not redirected
             author = ext_book_s.find("h3", class_="author-name")
             if author:
                 book["author"] = author.getText()
             author_bio = ext_book_s.find("div", class_="highlight")
             bio_p = author_bio.find("p") if author_bio else None
             if bio_p is not None:
                 book["author_bio"] = bio_p.getText()
         else:  # redirect on Safari ?
             author = ext_book_s.find("div", class_="t-authors")
             if author:
                 # Keep the text after the first "by "; leave the field out
                 # when the marker is absent instead of raising IndexError.
                 parts = author.getText().split("by ")
                 if len(parts) > 1:
                     book["author"] = parts[1]
             isbn = ext_book_s.find("div", class_="t-isbn")
             if isbn:
                 parts = isbn.get_text().split("ISBN: ")
                 if len(parts) > 1:
                     book["isbn"] = parts[1]

         # Download links are not displayed outside a browser, so the
         # "?download=true" page is only used to learn which formats exist.
         d_url = book["source"].split("?")[0]+"?download=true"
         try:
             req = requests.get(d_url)
             req.raise_for_status()
         except requests.exceptions.RequestException:
             print("Unable to open book download page", d_url)
             # Do not parse a stale or error response; fall through to the
             # "try all formats" branch below.
             download_s = None
         else:
             download_s = BeautifulSoup(req.content, "html.parser")

         # curious case
         # NOTE(review): the file base names a different book than the source
         # url; kept as written, confirm it is intentional.
         if book["source"]=="http://www.oreilly.com/data/free/business-models-for-the-data-economy.csp?intcmp=il-data-free-lp-lgen_free_reports_page":
             filebase = "http://www.oreilly.com/business/free/files/critical-first-10-days-as-leader"
         # without topics: the topic is read from the "?topic=" query value
         elif book["source"].startswith("http://www.oreilly.com/free"):
             t = book["source"].split("?topic=")[-1]
             filebase = "http://www.oreilly.com/"+t+"/free/files/"+filename
         # normal case: <url up to the book name>files/<book name>
         else:
             filebase = book["source"].split(filename)[0]+"files/"+filename

         c_formats = 0
         if download_s is not None:
             for fmt in ("pdf", "epub", "mobi"):
                 if download_s.find("a", class_="btn "+fmt):
                     book[fmt] = filebase+"."+fmt
                     c_formats = c_formats+1
         if not c_formats:  # redirect on safari, try all
             print("No format directly available", d_url)
             book["pdf"] = filebase+".pdf"
             book["epub"] = filebase+".epub"
             book["mobi"] = filebase+".mobi"

         # persist json data
         os.makedirs(os.path.dirname(filepath), exist_ok=True)
         with open(filepath, 'w') as fd:
             json.dump(book, fd)
         print ("Book description(s) retrieved:", b_url)

         count_books = count_books+1
         print ("-->", count_books, "Book description(s)")

 '''
    #For debug purpose
    if count_books>=5:
        break

 sys.exit(0)
 '''

 # Get covers: for every <dir>/<dir>.json found below the current directory,
 # download the url stored under "cover" to <dir>/cover<ext>.
 print()
 print("-"*50)
 print("Retrieving covers")

 for path, dirs, files in os.walk("."):
     for book_dir in dirs:
         print()
         filebase = path+"/"+book_dir+"/"+book_dir
         print(filebase)
         jsonfile = filebase+".json"
         try:
             # "with" closes the handle even when the json is invalid.
             with open(jsonfile) as fd:
                 book = json.load(fd)
         except (OSError, ValueError):
             print("Unable to get information:", jsonfile)
             continue

         cover_u = book.get("cover") if isinstance(book, dict) else None
         if not cover_u:
             print("No cover url in:", jsonfile)
             continue
         print("Retrieving", cover_u)

         try:
             req = requests.get(cover_u)
             req.raise_for_status()
         except requests.exceptions.RequestException:
             print("Unable to retrieve cover:", cover_u)
             continue

         # The header may be missing or carry parameters ("image/jpeg; ..."),
         # and guess_extension returns None for unknown types; fall back to the
         # extension found in the url path (possibly empty).
         content_type = req.headers.get('content-type', '').split(';')[0].strip()
         ext = mimetypes.guess_extension(content_type)
         if not ext:
             ext = os.path.splitext(urlparse(cover_u).path)[1]
         filename = path+"/"+book_dir+"/cover"+ext
         if os.path.isfile(filename):
             print(filename, "already exists...")
         else:
             # Write to a temporary name first so an interrupted download never
             # leaves a truncated file under the final name.
             with open(filename+".tmp", 'wb') as fd:
                 fd.write(req.content)
             shutil.move(filename+".tmp", filename)
             print(filename, "retrieved...")




 count_files=0

 # Get books: one pass over the directory tree per format; each pass downloads
 # the url stored under that format's key in <dir>/<dir>.json.
 extensions=["epub", "pdf", "mobi" ]
 #extensions=["pdf",]
 for ext in extensions:
     print()
     print("-"*50)
     print("Retrieving books:", ext)

     for path, dirs, files in os.walk("."):
         for book_dir in dirs:
             print()
             filebase = path+"/"+book_dir+"/"+book_dir
             print(filebase)

             jsonfile = filebase+".json"
             try:
                 # "with" closes the handle even when the json is invalid.
                 with open(jsonfile) as fd:
                     book = json.load(fd)
             except (OSError, ValueError):
                 print("Unable to get information:", jsonfile)
                 continue

             filename = filebase+"."+ext
             if os.path.isfile(filename):
                 print(filename, "already exists...")
                 count_files = count_files+1
                 print ("-->", count_files, "file(s)")
                 continue

             # No url recorded for this format: nothing to download.
             if not isinstance(book, dict) or ext not in book:
                 continue
             book_u = book[ext]
             print("Retrieving", book_u)
             # assuming a good content-type
             try:
                 req = requests.get(book_u)
                 req.raise_for_status()
             except requests.exceptions.HTTPError as err:
                 print("Unable to retrieve file:", book_u)
                 # .get: a description without "source" must not turn the
                 # error report itself into a crash.
                 print("source=", book.get("source"))
                 print ("Http Error:", err)
             except requests.exceptions.RequestException:
                 print("Unexpected error:", book_u)
             else:
                 # Write to a temporary name first so an interrupted download
                 # never leaves a truncated file under the final name.
                 with open(filename+".tmp", 'wb') as fd:
                     fd.write(req.content)
                 shutil.move(filename+".tmp", filename)
                 print(filename, "retrieved...")

                 count_files = count_files+1
                 print ("-->", count_files, "file(s)")

 print()
 print ("--------------> Total books count:", count_books)
 print ("--------------> Total files count:", count_files)
	'''
	O'Meirrcy !!!! Download free ebooks from O'Reilly.

	Usage:
	>pip install requests
	>pip install bs4
	>mkdir omercy
	>cd omercy
	>curl ... omercy.py
	>python omercy.py

	... Enjoy :)
	'''

	'''
	DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
	Version 2, December 2004

	Copyright (C) 2004 Sam Hocevar <[email protected]>

	Everyone is permitted to copy and distribute verbatim or modified
	copies of this license document, and changing it is allowed as long
	as the name is changed.

	DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
	TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

	0. You just DO WHAT THE FUCK YOU WANT TO.
	'''

	'''
	# Todo
	* ignore unreachable files
	* update the book count for books listed under more than one topic
	* update the json description with multiple topics
	* update the book json after a 404
	'''

	import os
	import sys
	import shutil
	import json
	import mimetypes
	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urlparse

	# Landing page that lists every free-report topic.
	source="http://www.oreilly.com/free/reports.html"


	print("-"*50)
	print("Grabbing from", source)
	print("-"*50)
	print()

	# Nothing else can run without the landing page, so any network or HTTP
	# failure ends the script with exit status 1.  Only requests' own errors are
	# caught so that Ctrl-C and programming errors are not silently swallowed.
	try:
	    req = requests.get(source)
	    req.raise_for_status()
	except requests.exceptions.RequestException:
	    print("Unable to open source url:", source)
	    sys.exit(1)

	home_s = BeautifulSoup(req.content, "html.parser")

	# Get description: map each <section> id to the href of its "see more" button.
	topics = {}
	topics_s = home_s.find_all("section")
	for topic in topics_s:
	    topic_id = topic.attrs.get("id")
	    more_link = topic.find(class_="btn see-more")
	    # A section without an id or without a "see more" link cannot be turned
	    # into a topic entry; skip it instead of crashing on it.
	    if topic_id is None or more_link is None or "href" not in more_link.attrs:
	        continue
	    topics[topic_id] = more_link.attrs["href"]
	# Extra "data" section
	topics["data1"] = "http://www.oreilly.com/data/free/archive.html"
	print("Topics:", ", ".join(topics.keys()))


	count_books=0
	# Nothing in this section fills this list (see the "ignore unreachable files" todo).
	unreachables=[]

	# For every topic page, write one <book>/<book>.json description per listed book.
	for name, url in topics.items():
	    print()
	    print("-"*50)
	    print("Parsing topic:", name)
	    print()
	    try:
	        req = requests.get(url)
	        req.raise_for_status()
	    except requests.exceptions.RequestException:
	        print("Unable to open topic page:", url)
	        continue

	    topics_s = BeautifulSoup(req.content, "html.parser")

	    # Every element carrying data-toggle="popover" is treated as one book entry.
	    for book_s in topics_s.find_all(attrs={"data-toggle": "popover"}):
	        b_url = book_s.attrs["href"]
	        print()
	        print("Parsing book:", b_url)

	        # Book name = last path component of the link, without ".csp".
	        filename = urlparse(b_url).path.replace(".csp", "").split('/')[-1]
	        if not filename:
	            # A path ending in "/" gives an empty name, and "/.json" is not a
	            # usable place for the description file.
	            print("Unable to derive a book name from:", b_url)
	            continue
	        filepath = filename+"/"+filename+".json"
	        print("Checking book information:", filepath)

	        # Check if the description already exists
	        if os.path.isfile(filepath):
	            print("Book information is already present...")
	            count_books = count_books+1
	            print ("-->", count_books, "Book description(s)")
	            continue

	        # Get book description
	        print("No existing data => retrieving in file:", filepath)
	        book = {}
	        book["topic"] = name if name != "data1" else "data"
	        book["title"] = book_s.attrs["title"]
	        book["description"] = book_s.attrs["data-content"]
	        # Links without a scheme get "http:" prepended; links that already
	        # carry http: or https: are kept untouched (https used to be mangled
	        # into "http:https://...").
	        if not b_url.startswith(("http:", "https:")):
	            b_url = "http:"+b_url
	        book["source"] = b_url
	        c_url = book_s.find("img").attrs["src"]
	        if not c_url.startswith(("http:", "https:")):
	            c_url = "http:"+c_url
	        book["cover"] = c_url

	        # Get extra information (author, author bio, isbn, ...)
	        try:
	            req = requests.get(b_url)
	            req.raise_for_status()
	        except requests.exceptions.RequestException:
	            print("Unable to open book extra informations page:", b_url)
	            continue

	        ext_book_s = BeautifulSoup(req.content, "html.parser")
	        if not req.history:  # not redirected
	            author = ext_book_s.find("h3", class_="author-name")
	            if author:
	                book["author"] = author.getText()
	            author_bio = ext_book_s.find("div", class_="highlight")
	            bio_p = author_bio.find("p") if author_bio else None
	            if bio_p is not None:
	                book["author_bio"] = bio_p.getText()
	        else:  # redirect on Safari ?
	            author = ext_book_s.find("div", class_="t-authors")
	            if author:
	                # Keep the text after the first "by "; leave the field out
	                # when the marker is absent instead of raising IndexError.
	                parts = author.getText().split("by ")
	                if len(parts) > 1:
	                    book["author"] = parts[1]
	            isbn = ext_book_s.find("div", class_="t-isbn")
	            if isbn:
	                parts = isbn.get_text().split("ISBN: ")
	                if len(parts) > 1:
	                    book["isbn"] = parts[1]

	        # Download links are not displayed outside a browser, so the
	        # "?download=true" page is only used to learn which formats exist.
	        d_url = book["source"].split("?")[0]+"?download=true"
	        try:
	            req = requests.get(d_url)
	            req.raise_for_status()
	        except requests.exceptions.RequestException:
	            print("Unable to open book download page", d_url)
	            # Do not parse a stale or error response; fall through to the
	            # "try all formats" branch below.
	            download_s = None
	        else:
	            download_s = BeautifulSoup(req.content, "html.parser")

	        # curious case
	        # NOTE(review): the file base names a different book than the source
	        # url; kept as written, confirm it is intentional.
	        if book["source"]=="http://www.oreilly.com/data/free/business-models-for-the-data-economy.csp?intcmp=il-data-free-lp-lgen_free_reports_page":
	            filebase = "http://www.oreilly.com/business/free/files/critical-first-10-days-as-leader"
	        # without topics: the topic is read from the "?topic=" query value
	        elif book["source"].startswith("http://www.oreilly.com/free"):
	            t = book["source"].split("?topic=")[-1]
	            filebase = "http://www.oreilly.com/"+t+"/free/files/"+filename
	        # normal case: <url up to the book name>files/<book name>
	        else:
	            filebase = book["source"].split(filename)[0]+"files/"+filename

	        c_formats = 0
	        if download_s is not None:
	            for fmt in ("pdf", "epub", "mobi"):
	                if download_s.find("a", class_="btn "+fmt):
	                    book[fmt] = filebase+"."+fmt
	                    c_formats = c_formats+1
	        if not c_formats:  # redirect on safari, try all
	            print("No format directly available", d_url)
	            book["pdf"] = filebase+".pdf"
	            book["epub"] = filebase+".epub"
	            book["mobi"] = filebase+".mobi"

	        # persist json data
	        os.makedirs(os.path.dirname(filepath), exist_ok=True)
	        with open(filepath, 'w') as fd:
	            json.dump(book, fd)
	        print ("Book description(s) retrieved:", b_url)

	        count_books = count_books+1
	        print ("-->", count_books, "Book description(s)")

	'''
	#For debug purpose
	if count_books>=5:
	break

	sys.exit(0)
	'''

	# Get covers: for every <dir>/<dir>.json found below the current directory,
	# download the url stored under "cover" to <dir>/cover<ext>.
	print()
	print("-"*50)
	print("Retrieving covers")

	for path, dirs, files in os.walk("."):
	    for book_dir in dirs:
	        print()
	        filebase = path+"/"+book_dir+"/"+book_dir
	        print(filebase)
	        jsonfile = filebase+".json"
	        try:
	            # "with" closes the handle even when the json is invalid.
	            with open(jsonfile) as fd:
	                book = json.load(fd)
	        except (OSError, ValueError):
	            print("Unable to get information:", jsonfile)
	            continue

	        cover_u = book.get("cover") if isinstance(book, dict) else None
	        if not cover_u:
	            print("No cover url in:", jsonfile)
	            continue
	        print("Retrieving", cover_u)

	        try:
	            req = requests.get(cover_u)
	            req.raise_for_status()
	        except requests.exceptions.RequestException:
	            print("Unable to retrieve cover:", cover_u)
	            continue

	        # The header may be missing or carry parameters ("image/jpeg; ..."),
	        # and guess_extension returns None for unknown types; fall back to the
	        # extension found in the url path (possibly empty).
	        content_type = req.headers.get('content-type', '').split(';')[0].strip()
	        ext = mimetypes.guess_extension(content_type)
	        if not ext:
	            ext = os.path.splitext(urlparse(cover_u).path)[1]
	        filename = path+"/"+book_dir+"/cover"+ext
	        if os.path.isfile(filename):
	            print(filename, "already exists...")
	        else:
	            # Write to a temporary name first so an interrupted download never
	            # leaves a truncated file under the final name.
	            with open(filename+".tmp", 'wb') as fd:
	                fd.write(req.content)
	            shutil.move(filename+".tmp", filename)
	            print(filename, "retrieved...")




	count_files=0

	# Get books: one pass over the directory tree per format; each pass downloads
	# the url stored under that format's key in <dir>/<dir>.json.
	extensions=["epub", "pdf", "mobi" ]
	#extensions=["pdf",]
	for ext in extensions:
	    print()
	    print("-"*50)
	    print("Retrieving books:", ext)

	    for path, dirs, files in os.walk("."):
	        for book_dir in dirs:
	            print()
	            filebase = path+"/"+book_dir+"/"+book_dir
	            print(filebase)

	            jsonfile = filebase+".json"
	            try:
	                # "with" closes the handle even when the json is invalid.
	                with open(jsonfile) as fd:
	                    book = json.load(fd)
	            except (OSError, ValueError):
	                print("Unable to get information:", jsonfile)
	                continue

	            filename = filebase+"."+ext
	            if os.path.isfile(filename):
	                print(filename, "already exists...")
	                count_files = count_files+1
	                print ("-->", count_files, "file(s)")
	                continue

	            # No url recorded for this format: nothing to download.
	            if not isinstance(book, dict) or ext not in book:
	                continue
	            book_u = book[ext]
	            print("Retrieving", book_u)
	            # assuming a good content-type
	            try:
	                req = requests.get(book_u)
	                req.raise_for_status()
	            except requests.exceptions.HTTPError as err:
	                print("Unable to retrieve file:", book_u)
	                # .get: a description without "source" must not turn the
	                # error report itself into a crash.
	                print("source=", book.get("source"))
	                print ("Http Error:", err)
	            except requests.exceptions.RequestException:
	                print("Unexpected error:", book_u)
	            else:
	                # Write to a temporary name first so an interrupted download
	                # never leaves a truncated file under the final name.
	                with open(filename+".tmp", 'wb') as fd:
	                    fd.write(req.content)
	                shutil.move(filename+".tmp", filename)
	                print(filename, "retrieved...")

	                count_files = count_files+1
	                print ("-->", count_files, "file(s)")

	print()
	print ("--------------> Total books count:", count_books)
	print ("--------------> Total files count:", count_files)
No results found