@ChristianBagley
Forked from jheiselman/get_books.py
Created February 12, 2017 20:38
Download Free O'Reilly eBooks
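
The script below scrapes O'Reilly's free programming ebooks index page, rewrites each catalog link into direct file URLs, and downloads every title as .pdf, .mobi, and .epub in parallel, skipping files already present in the target directory: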
import requests
import re
from glob import glob  # Google this. It's really good!
from os.path import basename, join  # basename strips the path part, returning only the filename. Works on URLs too.
from concurrent.futures import ThreadPoolExecutor  # For our concurrent downloads in download_many.


def get_urls(data, pattern):
    urls = pattern.findall(data.text)
    endings = ['.pdf', '.mobi', '.epub']
    # Each catalog page ends in .csp; the downloadable files live under
    # /free/files/ with a format-specific extension.
    combinations = (url.replace('.csp', ending)
                       .replace('/free/', '/free/files/')
                    for url in urls
                    for ending in endings)
    return combinations
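
# Illustration (hypothetical title slug, not taken from the live page):
#   http://www.oreilly.com/programming/free/why-rust.csp
# expands to
#   http://www.oreilly.com/programming/free/files/why-rust.pdf
#   http://www.oreilly.com/programming/free/files/why-rust.mobi
#   http://www.oreilly.com/programming/free/files/why-rust.epub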


def filter_already_downloaded(urls, path):
    # Compare by filename so books already on disk aren't fetched again.
    already_downloaded = [basename(item) for item in glob(join(path, '*'))]
    return (url for url in urls if basename(url) not in already_downloaded)


def download_one(url, path):
    filename = basename(url)
    target = join(path, filename)
    try:
        print("Downloading %s" % filename)
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Don't write an HTTP error page into the book file.
        with open(target, 'wb') as f:
            # Stream in 4 KiB chunks so large files never sit fully in memory.
            for chunk in response.iter_content(4096):
                f.write(chunk)
        print("Finished downloading %s" % filename)
    except Exception as e:
        print("Error downloading %s: %s" % (filename, e))


def download_many(urls, path, threads=4):
    # The context manager blocks until every submitted download has finished.
    with ThreadPoolExecutor(threads) as tpe:
        for url in urls:
            tpe.submit(download_one, url, path)


def main():
    data = requests.get('http://www.oreilly.com/programming/free/')
    # Raw string with escaped dots: an unescaped '.' would match any character.
    pattern = re.compile(r'http://www\.oreilly\.com/programming/free.*\.csp')
    path = 'your/books/here'  # Point this at an existing directory.
    all_urls = get_urls(data, pattern)
    filtered_urls = filter_already_downloaded(all_urls, path)
    download_many(filtered_urls, path)


if __name__ == '__main__':
    main()
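
A minimal sketch of driving the same functions from another script, assuming this file is saved as get_books.py (the name of the forked gist) and that a books/ directory already exists; both names are examples:

    import re
    import requests
    from get_books import get_urls, filter_already_downloaded, download_many

    data = requests.get('http://www.oreilly.com/programming/free/')
    pattern = re.compile(r'http://www\.oreilly\.com/programming/free.*\.csp')
    urls = filter_already_downloaded(get_urls(data, pattern), 'books')
    download_many(urls, 'books', threads=8)  # threads defaults to 4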