|
#!/usr/bin/env python3 |
|
# |
|
# Matthew Rothfuss |
|
# First: 10/13/2016 |
|
# Edit: 03/07/2018 |
|
# |
|
# Purpose: |
|
# Download all free ebooks (epub, mobi, pdf) from oreilly.com |
|
# |
|
# Overview: |
|
# Runs in parallel worker processes (multiprocessing.Pool), sized at cpu count * 2 (e.g. 8 cpus * 2 = 16 processes)
|
# Change "processes" to the desired number of worker processes if the cpu-count default causes issues
|
# Can specify custom "categories" and "etypes" |
|
# Set "verbose" to True, to print all output |
|
# |
|
# Original: |
|
# https://www.reddit.com/r/Python/comments/56syaa/7_oreilly_python_books_for_free/d8n6597 |
|
|
|
from argparse import ArgumentParser as AP |
|
from lxml import html as HTML |
|
import multiprocessing |
|
import argcomplete |
|
import requests |
|
import re |
|
import os |
|
|
|
#################################################################################################### |
|
class OReillyEbookFreeScraper(object):
    """Find and download the free ebooks listed on oreilly.com.

    Constructing an instance runs the whole job: the output directory is
    prepared (unless in dry-run mode), the free-ebook listing is scraped, and
    every resulting download URL is fetched through a ``multiprocessing.Pool``.

    Args:
        categories: A single category name or a list of names used to filter
            the books by the category segment of their URL. ``None``/empty
            means "all categories".
        etypes: A single ebook extension or a list of them, with or without
            the leading dot. ``None``/empty selects pdf, mobi and epub.
        processes: Number of pool worker processes (int or numeric string).
            ``None``/empty selects ``cpu_count() * 2``.
        outdir: Directory the books are saved under. ``None``/empty selects
            ``DEFAULT_OUTDIR``.
        dryrun: When true, nothing is created or downloaded; the intended
            downloads are only printed.
        verbose: Verbosity count; any value greater than zero enables the
            SKIP/NEW/WARN messages.
    """

    # Seconds allowed for each HTTP connect/read, so that one stalled
    # connection cannot hang a worker process indefinitely.
    REQUEST_TIMEOUT = 30

    DEFAULT_EBOOK_TYPES = ('.pdf', '.mobi', '.epub')
    DEFAULT_OUTDIR = 'ebooks-oreilly'

    # Listing-page category -> category segment used in the download URL.
    CATEGORY_ALIASES = {
        "ai": "data",
        "software-engineering": "programming",
        "operations": "webops-perf",
        "web-programming": "web-platform",
    }

    # Extracts the quoted value assigned to the JavaScript variable `shortLink`.
    SHORTLINK_RE = re.compile(r"""var\s*?shortLink\s*?=\s*?['"]([^'"]+)['"]""")

    def __init__(self, categories=None, etypes=None, processes=None, outdir=None, dryrun=False, verbose=0):
        self.base_url = 'http://www.oreilly.com'
        self.dryrun = dryrun
        self.verbose = (verbose or 0) > 0
        self.outdir = outdir if outdir else self.DEFAULT_OUTDIR
        self.categories = self._as_list(categories)

        requested_types = self._as_list(etypes) or list(self.DEFAULT_EBOOK_TYPES)
        # Accept both "pdf" and ".pdf".
        self.ebook_types = [t if t.startswith('.') else '.' + t for t in requested_types]

        # Command-line values arrive as strings; Pool() requires an integer.
        self.processes = int(processes) if processes else multiprocessing.cpu_count() * 2

        self.verify_outdir()  # setup output directory
        if self.dryrun:
            print("* *")
            print("******************************* Dry-Run Mode *********************************")
            print("* *")
        print("\nSaving all free ebooks ({}) from oreilly.com".format(', '.join(self.ebook_types)))
        print("*" * 80)
        self.get_all_free_ebook_urls()
        print("Done")

    @staticmethod
    def _as_list(value):
        """Return *value* as a list: empty -> [], scalar -> [scalar], sequence -> list."""
        if not value:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    def is_same_file_size(self, filepath, url):
        """Return True when the local file size equals the URL's Content-Length.

        Returns False when the remote size cannot be determined (request
        failure, error status, missing or malformed ``Content-Length``), so
        that the caller falls through to a fresh download instead of crashing.
        A missing local file counts as size 0.
        """
        try:
            # Follow redirects so the size refers to the same resource that
            # the later GET request actually downloads.
            response = requests.head(url, allow_redirects=True, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            remote_size = int(response.headers['Content-Length'])
        except (requests.exceptions.RequestException, KeyError, ValueError):
            return False

        try:
            local_size = os.path.getsize(filepath)
        except OSError:
            local_size = 0

        return remote_size == local_size

    def ensure_dir(self, path):
        """Create directory *path* (and parents) if it does not exist yet."""
        try:
            os.makedirs(path, exist_ok=True)
        except TypeError:
            # python 2.x fix: makedirs() has no exist_ok keyword there
            if os.path.exists(path):
                if os.path.isfile(path):
                    raise Exception("Error: Directory path already exists as a file. {}".format(path))
            else:
                os.makedirs(path)

    def verify_outdir(self):
        """Create the output directory, except in dry-run mode."""
        if not self.dryrun:
            self.ensure_dir(self.outdir)

    def download(self, url):
        """Download one ebook file into ``<outdir>/<category>/<title>/<file>``.

        An existing file is skipped when its size matches the remote
        ``Content-Length``; otherwise it is downloaded again. In dry-run mode
        the intended download is only printed.

        Returns:
            The saved file name on success, otherwise ``None`` (skipped,
            dry-run, or failed - failures are printed, not raised).
        """
        dl_dir = url.split('/', 4)[3]
        # http://stackoverflow.com/a/16696317
        filename = url.split('/')[-1]
        file_dir = "".join(filename.split('.')[:-1])
        target_dir = os.path.abspath(os.path.join(self.outdir, dl_dir, file_dir))
        filepath = os.path.abspath(os.path.join(self.outdir, dl_dir, file_dir, filename))

        # Test if file already exists, and whether its size still matches
        if os.path.isfile(filepath):
            if self.is_same_file_size(filepath, url):
                if self.verbose:
                    print("SKIP: Existing {}".format(filepath))
                return
            elif self.verbose:
                print("NEW: {}".format(url))

        # skip download for dryrun
        if self.dryrun:
            print("[dryrun]: {} --> {}".format(filename, filepath))
            return

        # download/save ebook
        response = None
        try:
            # stream=True keeps the body out of memory; it is written chunk by chunk
            response = requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()  # raise error if status gives error
            self.ensure_dir(target_dir)  # ensure download directory
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            print("SAVED: {}".format(url))
            return filename
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                if self.verbose:
                    print(u"WARN: {} file does NOT exists for '{}'".format(url.split('.')[-1], url))
            else:
                print(u"ERROR: <HTTP {}> Cannot handle '{}'".format(e.response.status_code, url))
        except Exception as e:
            print(u"ERROR: Cannot handle '{}' >>> ({})".format(url, e))
        finally:
            # Release the streamed connection whether or not the body was read.
            if response is not None:
                response.close()
        return

    def expand_url(self, url):
        """Resolve a listing link to its target and keep it only if it is a free-ebook page.

        Returns:
            The ``Location`` header of the HEAD response (or the response URL
            when there is none), or ``None`` when the request fails, the
            target ends in ``.do``, or its fifth ``/``-separated piece is not
            ``free``.
        """
        try:
            response = requests.head(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except Exception:
            return None

        target = response.headers.get('location', response.url)
        if target.endswith('.do'):
            return None

        pieces = target.split('/', 5)
        if len(pieces) < 5 or pieces[4] != 'free':
            return None
        return target

    def pa(self, cat):
        """Map a listing category to the category segment used in download URLs."""
        return self.CATEGORY_ALIASES.get(cat, cat)

    def category_from_url(self, url):
        """Return the first path segment of *url* (its category), or '' if there is none."""
        if not url:
            return ''
        pieces = url.split('/', 5)
        return pieces[3] if len(pieces) > 3 else ''

    def get_dl_links(self, url):
        """Return the download URLs (one per ebook type) for the book page *url*.

        The page's POST form is submitted with placeholder registration
        details; the response is searched for the ``shortLink`` JavaScript
        variable, from which the file URLs are built.

        Returns:
            A list of URLs, or an empty list when any step fails.
        """
        try:
            page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            page.raise_for_status()
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot handle '{}' >>> ({})".format(url, e))
            return list()

        try:
            html = HTML.fromstring(page.text)
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot find HTML in '{}' >>> ({})".format(url, e))
            return list()

        # Carry over the form's own fields; when several POST forms exist the
        # last one wins. Name and value are read from the same <input>
        # element so an input without a value cannot shift the pairing.
        post_vals = dict()
        for form in html.xpath('//form[@method="post"]'):
            post_vals = dict(
                (field.get('name'), field.get('value', ''))
                for field in form.xpath('./input[@name]')
            )

        post_vals.update(
            {
                # NOTE(review): placeholder address - the literal in the
                # received source was an e-mail-obfuscation artifact
                # ("[email protected]"); confirm the intended value.
                'first': 'a', 'last': 'b', 'email': 'reader@example.com',
                'newsletter': 'nl_webops_perf', 'x-a': 'Get Your Free Ebook',
                'x-redirect': url + "?download=true"
            }
        )

        purl = self.base_url + "/cs/user/create/download_requests"
        try:
            submitted = requests.post(purl, data=post_vals, timeout=self.REQUEST_TIMEOUT)
            submitted.raise_for_status()
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot handle '{}' >>> ({})".format(purl, e))
            return list()

        match = self.SHORTLINK_RE.search(submitted.text)
        if not match:
            return list()

        slink = match.group(1).strip()
        dlurl = self.base_url + '/' + self.pa(self.category_from_url(url)) + '/free/files/' + slink
        return [dlurl + t for t in self.ebook_types]

    def get_all_free_ebook_urls(self):
        """Scrape the free-ebook listing and download every matching book.

        Returns:
            An empty list when the listing page cannot be fetched, otherwise
            ``None``; all results are reported through printed output.
        """
        furl = self.base_url + '/free/'
        try:
            req = requests.get(furl, timeout=self.REQUEST_TIMEOUT)
            req.raise_for_status()
        except Exception as e:
            print(u"ERROR: Cannot handle '{}' >>> ({})".format(furl, e))
            return list()

        html = HTML.fromstring(req.text)
        eurls = html.xpath('//div[@name="FreeEbooks"]//a[@class="item-title"]/@href')

        pool = multiprocessing.Pool(processes=self.processes)
        try:
            expanded = pool.map(self.expand_url, eurls)
            # drop rejected links (None) and duplicates; keep a stable order
            urls = sorted(u for u in set(expanded) if u)

            if self.categories:
                urls = [u for u in urls if self.category_from_url(u) in self.categories]
                categories = self.categories
            else:
                categories = sorted(set(c for c in map(self.category_from_url, urls) if c))

            print("Downloading available free ebooks from the following categorie(s):\n{}".format(', '.join(categories)))
            print("*" * 80)
            print("Saving to the following directory:\n{}".format(os.path.abspath(self.outdir)))
            print("*" * 80)

            dl_urls = pool.map(self.get_dl_links, urls)
            dl_flat_list = [item for sublist in dl_urls for item in sublist]
            pool.map(self.download, dl_flat_list)
        finally:
            pool.close()  # no more tasks
            pool.join()  # wrap up current tasks

        if self.dryrun:
            # get_dl_links() yields one URL per selected ebook type for each book
            book_count = len(dl_flat_list) // max(len(self.ebook_types), 1)
            print("\nINFO: {} possible ebooks in '{}' categorie(s).".format(book_count, ', '.join(categories)))

        return
|
|
|
#################################################################################################### |
|
def build_arg_parser():
    """Build the command-line parser for the scraper script.

    ``--processes`` is parsed as an integer, and ``--categories`` / ``--types``
    each accept one or more space-separated values (collected into a list).
    Every ``dest`` matches a keyword argument of ``OReillyEbookFreeScraper``,
    so the parsed namespace can be passed on with ``**vars(args)``.
    """
    parser = AP(description='Download all free ebooks (epub, mobi, pdf) from oreilly.com')
    parser.add_argument('-c', '--categories', dest='categories', nargs='+', help='Specify custom "categories". One or more, space-separated.')
    parser.add_argument('-t', '--types', dest='etypes', nargs='+', help='Specify custom "ebook types". One or more, space-separated.')
    parser.add_argument('-p', '--processes', dest='processes', type=int, help='Specify custom processes count. Used for the worker pool.')
    parser.add_argument('-o', '--out', dest='outdir', default='ebooks-oreilly', help='Output directory')
    parser.add_argument('-d', '--dryrun', dest='dryrun', default=False, action='store_true', help='Do a dry run without downloading.')
    parser.add_argument('-v', '--verbose', dest='verbose', default=0, action="count", help='Enable/Disable verbose logging.')
    return parser


if __name__ == "__main__":
    # Main Program
    # Examples:
    #   1 : oreillyurls_2.py -p 4
    #   2 : oreillyurls_2.py -c data
    #   3 : oreillyurls_2.py -c programming data -o ~/Downloads/ebooks-tmp
    parser = build_arg_parser()
    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    # Constructing the scraper runs the whole scrape-and-download job.
    OReillyEbookFreeScraper(**vars(args))