#!/usr/bin/env python3 |
# |
# Matthew Rothfuss |
# First: 10/13/2016 |
# Edit: 03/07/2018 |
# |
# Purpose: |
# Download all free ebooks (epub, mobi, pdf) from oreilly.com |
# |
# Overview: |
# Runs in parallel worker processes, based on cpu count * 2 (i.e. 8 cpus * 2 = 16 processes)
# Set "processes" to the desired worker count if the cpu-count default causes issues
# Can specify custom "categories" and "etypes"
# Set "verbose" to 1 or higher (-v on the command line) to print all output
# |
# Original: |
# https://www.reddit.com/r/Python/comments/56syaa/7_oreilly_python_books_for_free/d8n6597 |
from argparse import ArgumentParser as AP |
from lxml import html as HTML |
import multiprocessing |
import argcomplete |
import requests |
import re |
import os |
#################################################################################################### |
class OReillyEbookFreeScraper(object):
    """Find and download the free ebooks listed on oreilly.com.

    Constructing an instance runs the whole job: it prepares the output
    directory, collects the free-ebook landing pages, resolves their
    download links and saves every requested format.

    Args:
        categories: A category name or list of names to restrict the
            download to. Empty/None means every category found.
        etypes: A file extension or list of extensions (with or without
            the leading dot). Empty/None means pdf, mobi and epub.
        processes: Worker process count (int or numeric string).
            Empty/None means ``cpu_count() * 2``.
        outdir: Output directory. Empty/None means ``ebooks-oreilly``.
        dryrun: When true, nothing is written to disk.
        verbose: Verbosity count; any value above zero prints SKIP/NEW/WARN
            messages as well.
    """

    # Landing-page categories whose files live under a different directory.
    CATEGORY_ALIASES = {
        "ai": "data",
        "software-engineering": "programming",
        "operations": "webops-perf",
        "web-programming": "web-platform",
    }
    DEFAULT_EBOOK_TYPES = ('.pdf', '.mobi', '.epub')
    DEFAULT_OUTDIR = 'ebooks-oreilly'
    # Seconds before a stalled connection/read is abandoned; without a
    # timeout a single hung request would block its worker forever.
    REQUEST_TIMEOUT = 30
    # Raw string so that "\s" reaches the regex engine as written.
    SHORT_LINK_RE = re.compile(r"var\s*?shortLink\s*?\=\s*?['\"]([^'\"]+)['\"]")

    def __init__(self, categories=None, etypes=None, processes=None, outdir=None, dryrun=False, verbose=0):
        self.base_url = 'http://www.oreilly.com'
        self.dryrun = dryrun
        self.verbose = (verbose or 0) > 0
        # A missing outdir used to crash later on; fall back to the CLI default.
        self.outdir = outdir if outdir else self.DEFAULT_OUTDIR
        self.categories = self._as_list(categories)
        wanted_types = self._as_list(etypes) or list(self.DEFAULT_EBOOK_TYPES)
        self.ebook_types = [t if t.startswith('.') else '.' + t for t in wanted_types]
        # argparse hands the count over as a string unless told otherwise,
        # and multiprocessing.Pool only accepts an int.
        self.processes = int(processes) if processes else multiprocessing.cpu_count() * 2
        self.verify_outdir()  # setup output directory
        if self.dryrun:
            print("* *")
            print("******************************* Dry-Run Mode *********************************")
            print("* *")
        print("\nSaving all free ebooks ({}) from oreilly.com".format(', '.join(self.ebook_types)))
        print("*" * 80)
        self.get_all_free_ebook_urls()
        print("Done")

    @staticmethod
    def _as_list(value):
        """Return *value* as a list: [] when empty, wrapped when it is a single item."""
        if not value:
            return []
        if isinstance(value, (list, tuple, set)):
            return list(value)
        return [value]

    def is_same_file_size(self, filepath, url):
        """Return True when the local file size equals the URL's Content-Length.

        A request failure or a missing/invalid Content-Length header yields
        False, so the caller falls through to a fresh download attempt.
        """
        try:
            local_size = os.path.getsize(filepath)
        except OSError:
            local_size = 0  # treated as empty when the file cannot be read
        try:
            # HEAD does not follow redirects by default; follow them so the
            # size belongs to the same resource a later GET would fetch.
            r = requests.head(url, allow_redirects=True, timeout=self.REQUEST_TIMEOUT)
            remote_size = int(r.headers['Content-Length'])
        except (requests.exceptions.RequestException, KeyError, ValueError):
            return False
        return remote_size == local_size

    def ensure_dir(self, path):
        """Create *path* (and parents) unless it already exists as a directory.

        Raises:
            Exception: on the Python 2 fallback path, when *path* exists as a file.
        """
        try:
            os.makedirs(path, exist_ok=True)
        except TypeError:
            # python 2.x fix: makedirs() has no exist_ok keyword there
            if not os.path.exists(path):
                os.makedirs(path)
            elif os.path.isfile(path):
                raise Exception("Error: Directory path already exists as a file. {}".format(path))

    def verify_outdir(self):
        """Create the output directory, except in dry-run mode."""
        if not self.dryrun:
            self.ensure_dir(self.outdir)

    def download(self, url):
        """Download one ebook file into ``<outdir>/<category>/<title>/``.

        An existing file whose size matches the remote Content-Length is
        skipped; a differing size is downloaded again. In dry-run mode the
        target is only printed.

        Returns:
            The saved file name, or None when skipped, dry-run or failed.
        """
        dl_dir = url.split('/', 4)[3]
        # http://stackoverflow.com/a/16696317
        filename = url.split('/')[-1]
        # NOTE: drops every dot of the stem ("a.b.pdf" -> "ab"); kept as is
        # so existing download trees keep matching.
        file_dir = "".join(filename.split('.')[:-1])
        target_dir = os.path.abspath(os.path.join(self.outdir, dl_dir, file_dir))
        filepath = os.path.join(target_dir, filename)
        # Test if file already exists
        if os.path.isfile(filepath):
            # Test file size matching
            if self.is_same_file_size(filepath, url):
                if self.verbose:
                    print("SKIP: Existing {}".format(filepath))
                return None
            if self.verbose:
                print("NEW: {}".format(url))
        # skip download for dryrun
        if self.dryrun:
            print("[dryrun]: {} --> {}".format(filename, filepath))
            return None
        # download/save ebook
        try:
            # stream=True keeps the body out of memory until it is iterated
            r = requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()  # raise error if status gives error
            self.ensure_dir(target_dir)  # ensure download directory
            with open(filepath, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            print("SAVED: {}".format(url))
            return filename
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 404:
                # Not every book exists in every format; only report when verbose.
                if self.verbose:
                    print(u"WARN: {} file does NOT exists for '{}'".format(url.split('.')[-1], url))
            else:
                print(u"ERROR: <HTTP {}> Cannot handle '{}'".format(e.response.status_code, url))
        except Exception as e:
            print(u"ERROR: Cannot handle '{}' >>> ({})".format(url, e))
        return None

    def expand_url(self, url):
        """Resolve a listing link to its free-ebook landing page URL.

        Returns:
            The redirect target (or the URL itself when there is no
            redirect), or None when the request fails, the target ends in
            ``.do`` or its second path segment is not ``free``.
        """
        try:
            r = requests.head(url, timeout=self.REQUEST_TIMEOUT)
            r.raise_for_status()
        except Exception:
            # Deliberately best-effort: one bad link must not abort the pool.
            return None
        rurl = r.headers.get('location', r.url)
        if rurl.endswith('.do'):
            return None
        parts = rurl.split('/', 5)
        if len(parts) < 5 or parts[4] != 'free':
            return None
        return rurl

    def pa(self, cat):
        """Map a landing-page category to the directory used in file URLs."""
        return self.CATEGORY_ALIASES.get(cat, cat)

    def category_from_url(self, url):
        """Return the first path segment of *url*, or '' when there is none."""
        parts = url.split('/', 5) if url else []
        return parts[3] if len(parts) > 3 else ''

    def get_dl_links(self, url):
        """Return the candidate file URLs (one per ebook type) for a landing page.

        Fetches the page, submits its POST form to the download-request
        endpoint and reads the ``shortLink`` variable from the response.
        Returns an empty list when any step fails.
        """
        try:
            req = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            req.raise_for_status()
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot handle '{}' >>> ({})".format(url, e))
            return list()
        try:
            html = HTML.fromstring(req.text)
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot find HTML in '{}' >>> ({})".format(url, e))
            return list()
        post_vals = dict()
        # When a page holds several POST forms, the last one wins.
        for item in html.xpath('//form[@method="post"]'):
            keys = item.xpath('./input/@name')
            values = item.xpath('./input/@value')
            post_vals = dict(zip(keys, values))
        post_vals.update(
            {
                # NOTE(review): the email value looks like an obfuscation
                # placeholder rather than a real address -- confirm.
                'first': 'a', 'last': 'b', 'email': '[email protected]',
                'newsletter': 'nl_webops_perf', 'x-a': 'Get Your Free Ebook',
                'x-redirect': url+"?download=true"
            }
        )
        purl = self.base_url+"/cs/user/create/download_requests"
        try:
            req2 = requests.post(purl, data=post_vals, timeout=self.REQUEST_TIMEOUT)
            req2.raise_for_status()
        except Exception as e:
            if self.verbose:
                print(u"WARN: Cannot handle '{}' >>> ({})".format(purl, e))
            return list()
        found = self.SHORT_LINK_RE.search(req2.text)
        if not found:
            return list()
        slink = found.group(1).strip()
        dlurl = self.base_url + '/' + self.pa(self.category_from_url(url)) + '/free/files/' + slink
        return [dlurl + t for t in self.ebook_types]

    def get_all_free_ebook_urls(self):
        """Collect every free ebook URL and download it through a process pool.

        Returns:
            An empty list when the listing page cannot be fetched or
            parsed, otherwise None.
        """
        furl = self.base_url+'/free/'
        try:
            req = requests.get(furl, timeout=self.REQUEST_TIMEOUT)
            req.raise_for_status()
            html = HTML.fromstring(req.text)
        except Exception as e:
            print(u"ERROR: Cannot handle '{}' >>> ({})".format(furl, e))
            return list()
        eurls = html.xpath('//div[@name="FreeEbooks"]//a[@class="item-title"]/@href')
        pool = multiprocessing.Pool(processes=self.processes)
        try:
            urls = sorted(set(u for u in pool.map(self.expand_url, eurls) if u))
            if self.categories:
                wanted = set(self.categories)
                # Filter in place so the sorted order survives.
                urls = [u for u in urls if self.category_from_url(u) in wanted]
                categories = self.categories
            else:
                # Pure string work; not worth a round trip through the pool.
                categories = sorted(set(self.category_from_url(u) for u in urls) - {''})
            print("Downloading available free ebooks from the following categorie(s):\n{}".format(', '.join(categories)))
            print("*" * 80)
            print("Saving to the following directory:\n{}".format(os.path.abspath(self.outdir)))
            print("*" * 80)
            dl_urls = pool.map(self.get_dl_links, urls)
            dl_flat_list = [item for sublist in dl_urls for item in sublist]
            pool.map(self.download, dl_flat_list)
        finally:
            # Release the workers even when one of the map() calls raised.
            pool.close()  # no more tasks
            pool.join()  # wrap up current tasks
        if self.dryrun:
            # One candidate URL per requested type, so divide by the number
            # of types actually in use instead of assuming three.
            book_count = len(dl_flat_list) // max(len(self.ebook_types), 1)
            print("\nINFO: {} possible ebooks in '{}' categorie(s).".format(book_count, ', '.join(categories)))
        return None
#################################################################################################### |
if __name__ == "__main__":
    # Main Program: parse the command line and run the scraper.
    # Examples:
    #   1 : oreillyurls_2.py -p 4
    #   2 : oreillyurls_2.py -c data
    #   3 : oreillyurls_2.py -c programming data -o ~/Downloads/ebooks-tmp
    parser = AP(description='Download all free ebooks (epub, mobi, pdf) from oreilly.com')
    # nargs='+' makes the advertised "single or list" usable from a shell:
    # one value yields a one-element list, several values a longer one.
    parser.add_argument('-c', '--categories', dest='categories', nargs='+', help='Specify custom "categories". One or more, separated by spaces.')
    parser.add_argument('-t', '--types', dest='etypes', nargs='+', help='Specify custom "ebook types". One or more, separated by spaces.')
    # type=int: the process pool needs an integer, not the raw argument string.
    parser.add_argument('-p', '--processes', dest='processes', type=int, help='Specify custom processes count. Used in multithreading.')
    parser.add_argument('-o', '--out', dest='outdir', default='ebooks-oreilly', help='Output directory')
    parser.add_argument('-d', '--dryrun', dest='dryrun', default=False, action='store_true', help='Do a dry run without downloading.')
    parser.add_argument('-v', '--verbose', dest='verbose', default=0, action="count", help='Enable/Disable verbose logging.')
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
    # The argument "dest" names match the constructor's keyword parameters.
    OReillyEbookFreeScraper(**vars(args))