Skip to content

Instantly share code, notes, and snippets.

@s-celles
Forked from Krazybug/omercy.py
Created February 16, 2018 19:27
Show Gist options
  • Save s-celles/5fc86b95ef156ab7e995583d858bf725 to your computer and use it in GitHub Desktop.
Save s-celles/5fc86b95ef156ab7e995583d858bf725 to your computer and use it in GitHub Desktop.
O'Reilly free ebooks downloader
'''
O'Meirrcy !!!! Download free ebooks from O'Reilly.
Usage:
>pip install requests
>pip install bs4
>mkdir omercy
>cd omercy
>curl ... omercy.py
>python omercy.py
... Enjoy :)
'''
'''
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.
'''
'''
# Todo
* ignore unreachable files
* update the book count for books that appear in more than one topic
* update the json description with multiple topics
* update the book json after a 404
'''
import os
import sys
import shutil
import json
import mimetypes
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
# Landing page listing the free reports; every topic page is discovered from it.
source = "http://www.oreilly.com/free/reports.html"

print("-" * 50)
print("Grabbing from", source)
print("-" * 50)
print()

# Without the landing page there is nothing to crawl, so any failure is fatal.
try:
    # The timeout keeps the script from hanging forever on a stalled server.
    req = requests.get(source, timeout=30)
    req.raise_for_status()
except requests.exceptions.RequestException as err:
    # Only network/HTTP failures are handled here; Ctrl-C and programming
    # errors are deliberately left to propagate.
    print("Unable to open source url:", source)
    print("Reason:", err)
    sys.exit(1)

# Parsed landing page, consumed by the topic discovery step.
home_s = BeautifulSoup(req.content, "html.parser")
# Get description
# Map each topic id (the id of a <section> on the landing page) to the URL
# behind that section's "see more" button.
topics = {}
topics_s = home_s.find_all("section")
for topic in topics_s:
    topic_id = topic.attrs.get("id")
    see_more = topic.find(class_="btn see-more")
    # Sections without an id or without a "see more" link are not topic
    # listings; skip them instead of crashing on a missing attribute.
    if not topic_id or see_more is None or not see_more.attrs.get("href"):
        continue
    topics[topic_id] = see_more.attrs["href"]

# Extra "data" section: registered under the key "data1" so it does not
# replace a "data" topic found on the landing page.
topics["data1"] = "http://www.oreilly.com/data/free/archive.html"
print("Topics:", ", ".join(topics))
def with_scheme(url):
    """Return *url* with an explicit scheme, defaulting to ``http:``.

    Links without a scheme (e.g. ``//host/path``) get ``http:`` prepended.
    URLs that already start with ``http:`` or ``https:`` are returned
    unchanged, so an ``https://`` link is no longer mangled.
    """
    return url if url.startswith(("http:", "https:")) else "http:" + url


def text_after(text, marker):
    """Return the stripped part of *text* that follows the first *marker*.

    When *marker* does not occur, the whole stripped text is returned
    instead of raising.
    """
    _, found, rest = text.partition(marker)
    return (rest if found else text).strip()


# Number of book descriptions available on disk (pre-existing + retrieved).
count_books = 0
# NOTE(review): nothing in the visible script reads this list; the name is
# kept so that any code relying on it keeps working.
unreachables = []

for name, url in topics.items():
    print()
    print("-" * 50)
    print("Parsing topic:", name)
    print()
    try:
        req = requests.get(url, timeout=30)
        req.raise_for_status()
    except requests.exceptions.RequestException:
        print("Unable to open topic page:", url)
        continue
    topics_s = BeautifulSoup(req.content, "html.parser")

    # Every book on a topic page is an element carrying data-toggle="popover".
    for book_s in topics_s.find_all(attrs={"data-toggle": "popover"}):
        b_url = book_s.attrs.get("href")
        if not b_url:
            # A popover without a link cannot be resolved to a book page.
            continue
        print()
        print("Parsing book:", b_url)

        # Each book is stored in its own directory: <name>/<name>.json
        filename = urlparse(b_url).path.replace(".csp", "").split("/")[-1]
        if not filename:
            # An empty name would make the script write "/.json".
            print("Unable to derive a file name from:", b_url)
            continue
        filepath = filename + "/" + filename + ".json"
        print("Checking book information:", filepath)

        # Check if the description already exists
        if os.path.isfile(filepath):
            print("Book information is already present...")
            count_books = count_books + 1
            print("-->", count_books, "Book description(s)")
            continue

        # Get book description
        print("No existing data => retrieving in file:", filepath)
        b_url = with_scheme(b_url)
        book = {}
        # "data1" is only the internal key of the extra archive page.
        book["topic"] = name if name != "data1" else "data"
        book["title"] = book_s.attrs.get("title", "")
        book["description"] = book_s.attrs.get("data-content", "")
        book["source"] = b_url
        cover_s = book_s.find("img")
        c_url = cover_s.attrs.get("src", "") if cover_s is not None else ""
        # The "cover" key is always written (empty when no image was found)
        # so that readers of the JSON can rely on its presence.
        book["cover"] = with_scheme(c_url) if c_url else ""

        # Get extra information (author, author bio, isbn, ...)
        try:
            req = requests.get(b_url, timeout=30)
            req.raise_for_status()
        except requests.exceptions.RequestException:
            print("Unable to open book extra informations page:", b_url)
            continue
        ext_book_s = BeautifulSoup(req.content, "html.parser")
        if not req.history:  # not redirected
            author = ext_book_s.find("h3", class_="author-name")
            if author is not None:
                book["author"] = author.getText()
            author_bio = ext_book_s.find("div", class_="highlight")
            bio_p = author_bio.find("p") if author_bio is not None else None
            if bio_p is not None:
                book["author_bio"] = bio_p.getText()
        else:  # redirect on Safari ?
            author = ext_book_s.find("div", class_="t-authors")
            if author is not None:
                # Cut at the FIRST "by " only, so a name that itself
                # contains "by " (e.g. "Bobby") is not truncated.
                book["author"] = text_after(author.getText(), "by ")
            isbn = ext_book_s.find("div", class_="t-isbn")
            if isbn is not None:
                book["isbn"] = text_after(isbn.get_text(), "ISBN: ")

        # outside navigator download links are not displayed
        # get formats available only
        d_url = book["source"].split("?")[0] + "?download=true"
        download_s = None
        try:
            req = requests.get(d_url, timeout=30)
            req.raise_for_status()
        except requests.exceptions.RequestException:
            # Best effort: with no download page, fall through to the
            # "try every format" branch instead of parsing a stale response.
            print("Unable to open book download page", d_url)
        else:
            download_s = BeautifulSoup(req.content, "html.parser")

        # curious case
        # NOTE(review): this one source URL is hard-wired to a differently
        # named file base -- confirm this mapping is still wanted.
        if book["source"] == "http://www.oreilly.com/data/free/business-models-for-the-data-economy.csp?intcmp=il-data-free-lp-lgen_free_reports_page":
            filebase = "http://www.oreilly.com/business/free/files/critical-first-10-days-as-leader"
        # without topics
        elif book["source"].startswith("http://www.oreilly.com/free"):
            # presumably the topic is carried in a trailing "?topic=" query
            t = book["source"].split("?topic=")[-1]
            filebase = "http://www.oreilly.com/" + t + "/free/files/" + filename
        # normal case
        else:
            filebase = book["source"].split(filename)[0] + "files/" + filename

        # Record a download URL for every format advertised by a button.
        formats = ("pdf", "epub", "mobi")
        available = [
            fmt for fmt in formats
            if download_s is not None
            and download_s.find("a", class_="btn " + fmt)
        ]
        if not available:  # redirect on safari, try all
            print("No format directly available", d_url)
            available = formats
        for fmt in available:
            book[fmt] = filebase + "." + fmt

        # persist json data
        # Written to a temporary file first so that an interrupted run never
        # leaves a truncated .json that would later be taken as complete.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath + ".tmp", "w") as fd:
            json.dump(book, fd)
        shutil.move(filepath + ".tmp", filepath)
        print("Book description(s) retrieved:", b_url)
        count_books = count_books + 1
        print("-->", count_books, "Book description(s)")

        # For debug purpose, stop early by adding here:
        #     if count_books >= 5:
        #         break
        # and calling sys.exit(0) after the loops.
def cover_extension(content_type, default=".jpg"):
    """Return a file extension (with leading dot) for a Content-Type value.

    Parameters such as ``; charset=binary`` are dropped before the lookup
    because ``mimetypes.guess_extension`` only understands the bare type.
    *default* is returned when the header is missing or the type is unknown
    (fallback guess: covers are presumably JPEG -- adjust if that is wrong).
    """
    if not content_type:
        return default
    mime_type = content_type.split(";", 1)[0].strip().lower()
    return mimetypes.guess_extension(mime_type) or default


# Get covers
print()
print("-" * 50)
print("Retrieving covers")
for path, dirs, files in os.walk("."):
    for dirname in dirs:
        print()
        # A book directory <name> is expected to hold <name>/<name>.json
        filebase = path + "/" + dirname + "/" + dirname
        print(filebase)
        jsonfile = filebase + ".json"
        try:
            with open(jsonfile) as fd:
                book = json.load(fd)
        except (OSError, ValueError):
            # Missing or unreadable file, or invalid JSON: not a book directory.
            print("Unable to get information:", jsonfile)
            continue

        cover_u = book.get("cover") if isinstance(book, dict) else None
        if not cover_u:
            print("Unable to retrieve cover:", cover_u)
            continue
        print("Retrieving", cover_u)
        try:
            req = requests.get(cover_u, timeout=30)
            req.raise_for_status()
        except requests.exceptions.RequestException:
            print("Unable to retrieve cover:", cover_u)
            continue

        # The extension is only known once the response headers are in.
        ext = cover_extension(req.headers.get("content-type"))
        filename = path + "/" + dirname + "/cover" + ext
        if os.path.isfile(filename):
            print(filename, "already exists...")
        else:
            # Write to a temporary name, then move, so a partial download is
            # never mistaken for a finished cover.
            with open(filename + ".tmp", "wb") as fd:
                fd.write(req.content)
            shutil.move(filename + ".tmp", filename)
            print(filename, "retrieved...")
# Number of ebook files present on disk (pre-existing + downloaded).
count_files = 0

# Get books
# One full pass over the book directories per format, in this order.
extensions = ["epub", "pdf", "mobi"]
#extensions=["pdf",]
for ext in extensions:
    print()
    print("-" * 50)
    print("Retrieving books:", ext)
    for path, dirs, files in os.walk("."):
        for dirname in dirs:
            print()
            # A book directory <name> is expected to hold <name>/<name>.json
            filebase = path + "/" + dirname + "/" + dirname
            print(filebase)
            jsonfile = filebase + ".json"
            try:
                with open(jsonfile) as fd:
                    book = json.load(fd)
            except (OSError, ValueError):
                # Missing or unreadable file, or invalid JSON.
                print("Unable to get information:", jsonfile)
                continue

            filename = filebase + "." + ext
            if os.path.isfile(filename):
                print(filename, "already exists...")
                count_files = count_files + 1
                print("-->", count_files, "file(s)")
                continue
            if ext not in book:
                # The description lists no download URL for this format.
                continue

            book_u = book[ext]
            print("Retrieving", book_u)
            # assuming good content-type
            try:
                req = requests.get(book_u, timeout=60)
                req.raise_for_status()
            except requests.exceptions.HTTPError as err:
                print("Unable to retrieve file:", book_u)
                print("source=", book.get("source"))
                print("Http Error:", err)
            except requests.exceptions.RequestException as err:
                # Connection problems, timeouts, malformed URLs, ...
                print("Unexpected error:", book_u)
                print("Reason:", err)
            else:
                # Write to a temporary name, then move, so an interrupted
                # download is never counted as an existing file next run.
                with open(filename + ".tmp", "wb") as fd:
                    fd.write(req.content)
                shutil.move(filename + ".tmp", filename)
                print(filename, "retrieved...")
                count_files = count_files + 1
                print("-->", count_files, "file(s)")

print()
print("--------------> Total books count:", count_books)
print("--------------> Total files count:", count_files)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment