-
-
Save ChristianBagley/a65ac64f20e7a68fa4c7531fb8cc00a9 to your computer and use it in GitHub Desktop.
Download Free O'Reilly eBooks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
from concurrent.futures import ThreadPoolExecutor  # For our concurrent downloads in download_many.
from glob import glob  # Google this. It's really good!
from os.path import basename, join  # Basename removes the path part of a file, returning only the filename. Can also handle urls.

import requests
def get_urls(data, pattern):
    """Build candidate download URLs from the catalogue page.

    Every match of *pattern* in ``data.text`` is expanded into three
    candidate file URLs (PDF, MOBI and EPUB, in that order) by swapping the
    ``.csp`` suffix for the file extension and rewriting ``/free/`` to
    ``/free/files/``.

    Args:
        data: Object exposing the page body as a ``text`` attribute
            (``main`` passes a ``requests`` response).
        pattern: Compiled regular expression whose ``findall`` yields the
            page URLs.

    Returns:
        A lazy iterator of candidate URL strings. The page is scanned
        immediately; only the expansion into candidates is deferred.
    """
    suffixes = ('.pdf', '.mobi', '.epub')
    # Scan the page up front so pattern/attribute errors surface at call time.
    page_links = pattern.findall(data.text)

    def _expand():
        for link in page_links:
            for suffix in suffixes:
                file_link = link.replace('.csp', suffix)
                yield file_link.replace('/free/', '/free/files/')

    return _expand()
def filter_already_downloaded(urls, path):
    """Drop the URLs whose file already exists in the download directory.

    A URL counts as downloaded when its basename equals the name of an entry
    directly inside *path*. The directory is listed once, when this function
    is called; the filtering itself is lazy.

    Args:
        urls: Iterable of URL strings.
        path: Directory to look in. A missing directory simply yields no
            existing names, so every URL is kept.

    Returns:
        A generator over the URLs that still need downloading, in their
        original order.
    """
    # A set gives O(1) membership tests instead of rescanning a list per URL.
    already_downloaded = {basename(item) for item in glob(join(path, '*'))}
    return (url for url in urls if basename(url) not in already_downloaded)
def download_one(url, path):
    """Download *url* into the directory *path*, named after the URL's basename.

    The body is streamed into a temporary ``<name>.part`` file, which is
    renamed to its final name only after the server answered with a
    successful status and the whole body was written. A failed or
    interrupted download therefore never leaves a file under the final name,
    so it is not mistaken for a finished download on the next run.

    Any error is reported on stdout and swallowed: this function runs as a
    thread-pool task, and one bad URL should not abort the other downloads.

    Args:
        url: Address of the file to fetch.
        path: Existing directory to store the file in.
    """
    filename = basename(url)
    target = join(path, filename)
    partial = target + '.part'
    try:
        print("Downloading %s" % filename)
        # Request first, open the file second: nothing is created on disk
        # when the request itself fails. The timeout keeps a stalled server
        # from blocking a worker thread forever.
        with requests.get(url, stream=True, timeout=30) as response:
            # Candidate URLs are guesses; refuse to save 404/500 error pages.
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(4096):
                    f.write(chunk)
        os.replace(partial, target)
        print("Finished downloading %s" % filename)
    except Exception as e:
        print("Error downloading %s: %s" % (filename, e))
        try:
            os.remove(partial)
        except OSError:
            # The partial file may never have been created; nothing to clean.
            pass
def download_many(urls, path, threads=4):
    """Fetch every URL in *urls* concurrently into the directory *path*.

    Each URL is handed to ``download_one`` on a pool of worker threads. The
    call returns once all submitted downloads have finished, because leaving
    the ``with`` block waits for the pool to drain.

    Args:
        urls: Iterable of URL strings; it is consumed while submitting.
        path: Directory passed through to ``download_one``.
        threads: Number of worker threads in the pool.
    """
    with ThreadPoolExecutor(max_workers=threads) as pool:
        for link in urls:
            # The returned future is deliberately not kept.
            pool.submit(download_one, link, path)
def main():
    """Download every free O'Reilly programming e-book not yet on disk.

    Fetches the catalogue page, extracts the ``.csp`` book-page links,
    expands them into candidate file URLs, skips files already present in
    the download directory and downloads the rest concurrently.
    """
    # Timeout so an unresponsive server cannot hang the script indefinitely.
    data = requests.get('http://www.oreilly.com/programming/free/', timeout=30)
    # Raw string avoids the invalid "\." escape warning. The dots in the host
    # name are escaped, and the lazy, quote/whitespace-free middle part keeps
    # two links on the same line from being merged into one match.
    pattern = re.compile(
        r'http://www\.oreilly\.com/programming/free[^\s"\'<>]*?\.csp')
    path = 'your/books/here'
    # Without the directory every download would fail when opening its file.
    os.makedirs(path, exist_ok=True)
    all_urls = get_urls(data, pattern)
    filtered_urls = filter_already_downloaded(all_urls, path)
    download_many(filtered_urls, path)
# Start the downloader only when this file is executed as a script, so that
# importing it for its helper functions has no side effects.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment