santiagobasulto · October 3, 2024 18:36 · GhostofGoes · Sep 11, 2018 · bjarnebuchmann · Aug 6, 2020
diff --git a/README.md b/README.md
diff --git a/hb_download.py b/hb_download.py
 import argparse
 from pathlib import Path
 from urllib.parse import urlparse

 import requests
 from bs4 import BeautifulSoup


 def parse_download_links(html_file_content):
    """Extract book titles and per-format download links from a downloads page.

    Args:
        html_file_content: Markup of the saved downloads page.

    Returns:
        A list of dicts, one per matched entry, with the keys ``title``
        (the ``data-human-name`` attribute), ``slug`` (the title lower-cased
        with spaces replaced by dashes) and ``download_links`` (a dict
        mapping each button label to the ``href`` of its link).

    Raises:
        ValueError: If the expected wrapper elements are not in the page.
    """
    # Name the parser explicitly; otherwise bs4 picks whichever parser
    # happens to be installed (and warns), so results vary per machine.
    soup = BeautifulSoup(html_file_content, 'html.parser')
    external_wrapper_div = soup.find('div', class_='js-all-downloads-holder')
    if external_wrapper_div is None:
        raise ValueError(
            "No <div class='js-all-downloads-holder'> found in the HTML")
    wrapper_div = external_wrapper_div.find('div', class_='whitebox-redux')
    if wrapper_div is None:
        raise ValueError("No <div class='whitebox-redux'> found in the HTML")

    books = []

    for div in wrapper_div.find_all('div'):
        data_div = div.find('div', attrs={'data-human-name': True})
        if data_div is None:
            continue
        download_div = div.find('div', class_='download-buttons')
        if download_div is None:
            # Entry without any download buttons: nothing to collect.
            continue

        download_links = {}
        for button_div in download_div.find_all('div', class_='small'):
            label_span = button_div.find('span', class_='label')
            anchor = button_div.find('a', class_='a', attrs={'href': True})
            if label_span is None or anchor is None:
                # Skip incomplete buttons instead of failing the whole parse.
                continue
            # Strip surrounding whitespace so lookups by label match exactly.
            download_links[label_span.text.strip()] = anchor['href']

        title = data_div['data-human-name']
        books.append({
            'title': title,
            'slug': title.lower().replace(' ', '-'),
            'download_links': download_links,
        })
    return books


 def safe_create_dir(path):
    """Create the directory ``path`` if it does not exist yet.

    Missing parent directories are created as well, so a nested destination
    works on the first run. An already existing directory is left untouched.

    Args:
        path: ``pathlib.Path`` of the directory to create.
    """
    path.mkdir(parents=True, exist_ok=True)


 def download_file_from_url(base_path, url, chunk_size=None):
    """Download ``url`` into ``base_path`` unless the file is already there.

    The file name is the URL path with every ``/`` removed.

    Args:
        base_path: ``pathlib.Path`` of the directory to save into.
        url: Address of the file to fetch.
        chunk_size: Bytes per streamed chunk; falls back to 4 KiB when
            ``None`` or ``0``.

    Returns:
        A ``(book_path, downloaded)`` tuple. ``downloaded`` is ``False`` when
        the target already existed and no request was made.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    chunk_size = chunk_size or (4 * 1024)
    filename = urlparse(url).path.replace('/', '')
    book_path = base_path / filename
    if book_path.exists():
        # book already downloaded
        return (book_path, False)

    # Stream into a temporary sibling and rename only on success, so an
    # interrupted transfer never leaves a truncated file that later runs
    # would mistake for a finished download.
    partial_path = book_path.with_name(book_path.name + '.part')
    try:
        with requests.get(url, stream=True) as resp:
            # Fail loudly rather than saving an error page as the book.
            resp.raise_for_status()
            with partial_path.open('wb') as fp:
                for chunk in resp.iter_content(chunk_size=chunk_size):
                    if chunk:
                        fp.write(chunk)
        partial_path.replace(book_path)
    finally:
        # Only still present when the download failed part-way.
        if partial_path.exists():
            partial_path.unlink()

    return (book_path, True)


 def download_books(html_file_content, download_dir='./books',
                    pdf=False, epub=False, mobi=False):
    """Download the selected formats of every book found in the page.

    Each book gets its own sub-directory of ``download_dir`` named after its
    title. A line is printed per file saying whether it was downloaded or
    skipped.

    Args:
        html_file_content: Markup handed to ``parse_download_links``.
        download_dir: Directory under which the per-book folders are created.
        pdf: Download the link labelled ``'PDF'`` when present.
        epub: Download the link labelled ``'EPUB'`` when present.
        mobi: Download the link labelled ``'MOBI'`` when present.
    """
    books_parsed = parse_download_links(html_file_content)
    base_path = Path(download_dir)
    safe_create_dir(base_path)

    # Same order as before: PDF, then MOBI, then EPUB.
    wanted_formats = [(pdf, 'PDF'), (mobi, 'MOBI'), (epub, 'EPUB')]

    for book in books_parsed:
        # A '/' inside the title would otherwise act as a path separator.
        book_base_path = base_path / book['title'].replace('/', '-')
        safe_create_dir(book_base_path)

        links = book['download_links']
        # Skip formats the book does not offer instead of passing None on.
        download_urls = [
            links[label]
            for should_download, label in wanted_formats
            if should_download and links.get(label)
        ]

        for url in download_urls:
            result, downloaded = download_file_from_url(book_base_path, url)
            if not downloaded:
                print("Skipped: ", result)
            else:
                print("Downloaded: ", result)


 if __name__ == '__main__':
    # Command-line entry point: read a saved HTML page and download its books.
    parser = argparse.ArgumentParser(
        description='Download books linked from a saved downloads HTML page')
    parser.add_argument(
        'html_file', type=argparse.FileType(),
        help='HTML file to download books from')
    parser.add_argument(
        '-d', '--destination-dir', type=str,
        help="Directory where books will be saved", default='books')
    # EPUB stays on by default; --epub is kept so existing invocations work.
    parser.add_argument(
        '--epub', action='store_true', default=True,
        help='Download EPUB files (default)')
    # Without this switch there was no way to turn the EPUB download off.
    parser.add_argument(
        '--no-epub', dest='epub', action='store_false',
        help='Do not download EPUB files')
    parser.add_argument('--pdf', action='store_true', help='Download PDF files')
    parser.add_argument(
        '--mobi', action='store_true', help='Download MOBI files')

    args = parser.parse_args()
    # argparse opened the file for us; close it once it has been read.
    with args.html_file as html_file:
        html = html_file.read()

    download_books(
        html, args.destination_dir,
        pdf=args.pdf, epub=args.epub, mobi=args.mobi,
    )
	import argparse
	from pathlib import Path
	from urllib.parse import urlparse

	import requests
	from bs4 import BeautifulSoup


	def parse_download_links(html_file_content):
	    """Extract book titles and per-format download links from a downloads page.

	    Args:
	        html_file_content: Markup of the saved downloads page.

	    Returns:
	        A list of dicts, one per matched entry, with the keys ``title``
	        (the ``data-human-name`` attribute), ``slug`` (the title lower-cased
	        with spaces replaced by dashes) and ``download_links`` (a dict
	        mapping each button label to the ``href`` of its link).

	    Raises:
	        ValueError: If the expected wrapper elements are not in the page.
	    """
	    # Name the parser explicitly; otherwise bs4 picks whichever parser
	    # happens to be installed (and warns), so results vary per machine.
	    soup = BeautifulSoup(html_file_content, 'html.parser')
	    external_wrapper_div = soup.find('div', class_='js-all-downloads-holder')
	    if external_wrapper_div is None:
	        raise ValueError(
	            "No <div class='js-all-downloads-holder'> found in the HTML")
	    wrapper_div = external_wrapper_div.find('div', class_='whitebox-redux')
	    if wrapper_div is None:
	        raise ValueError("No <div class='whitebox-redux'> found in the HTML")

	    books = []

	    for div in wrapper_div.find_all('div'):
	        data_div = div.find('div', attrs={'data-human-name': True})
	        if data_div is None:
	            continue
	        download_div = div.find('div', class_='download-buttons')
	        if download_div is None:
	            # Entry without any download buttons: nothing to collect.
	            continue

	        download_links = {}
	        for button_div in download_div.find_all('div', class_='small'):
	            label_span = button_div.find('span', class_='label')
	            anchor = button_div.find('a', class_='a', attrs={'href': True})
	            if label_span is None or anchor is None:
	                # Skip incomplete buttons instead of failing the whole parse.
	                continue
	            # Strip surrounding whitespace so lookups by label match exactly.
	            download_links[label_span.text.strip()] = anchor['href']

	        title = data_div['data-human-name']
	        books.append({
	            'title': title,
	            'slug': title.lower().replace(' ', '-'),
	            'download_links': download_links,
	        })
	    return books


	def safe_create_dir(path):
	    """Create the directory ``path`` if it does not exist yet.

	    Missing parent directories are created as well, so a nested destination
	    works on the first run. An already existing directory is left untouched.

	    Args:
	        path: ``pathlib.Path`` of the directory to create.
	    """
	    path.mkdir(parents=True, exist_ok=True)


	def download_file_from_url(base_path, url, chunk_size=None):
	    """Download ``url`` into ``base_path`` unless the file is already there.

	    The file name is the URL path with every ``/`` removed.

	    Args:
	        base_path: ``pathlib.Path`` of the directory to save into.
	        url: Address of the file to fetch.
	        chunk_size: Bytes per streamed chunk; falls back to 4 KiB when
	            ``None`` or ``0``.

	    Returns:
	        A ``(book_path, downloaded)`` tuple. ``downloaded`` is ``False`` when
	        the target already existed and no request was made.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	    """
	    chunk_size = chunk_size or (4 * 1024)
	    filename = urlparse(url).path.replace('/', '')
	    book_path = base_path / filename
	    if book_path.exists():
	        # book already downloaded
	        return (book_path, False)

	    # Stream into a temporary sibling and rename only on success, so an
	    # interrupted transfer never leaves a truncated file that later runs
	    # would mistake for a finished download.
	    partial_path = book_path.with_name(book_path.name + '.part')
	    try:
	        with requests.get(url, stream=True) as resp:
	            # Fail loudly rather than saving an error page as the book.
	            resp.raise_for_status()
	            with partial_path.open('wb') as fp:
	                for chunk in resp.iter_content(chunk_size=chunk_size):
	                    if chunk:
	                        fp.write(chunk)
	        partial_path.replace(book_path)
	    finally:
	        # Only still present when the download failed part-way.
	        if partial_path.exists():
	            partial_path.unlink()

	    return (book_path, True)


	def download_books(html_file_content, download_dir='./books',
	                   pdf=False, epub=False, mobi=False):
	    """Download the selected formats of every book found in the page.

	    Each book gets its own sub-directory of ``download_dir`` named after its
	    title. A line is printed per file saying whether it was downloaded or
	    skipped.

	    Args:
	        html_file_content: Markup handed to ``parse_download_links``.
	        download_dir: Directory under which the per-book folders are created.
	        pdf: Download the link labelled ``'PDF'`` when present.
	        epub: Download the link labelled ``'EPUB'`` when present.
	        mobi: Download the link labelled ``'MOBI'`` when present.
	    """
	    books_parsed = parse_download_links(html_file_content)
	    base_path = Path(download_dir)
	    safe_create_dir(base_path)

	    # Same order as before: PDF, then MOBI, then EPUB.
	    wanted_formats = [(pdf, 'PDF'), (mobi, 'MOBI'), (epub, 'EPUB')]

	    for book in books_parsed:
	        # A '/' inside the title would otherwise act as a path separator.
	        book_base_path = base_path / book['title'].replace('/', '-')
	        safe_create_dir(book_base_path)

	        links = book['download_links']
	        # Skip formats the book does not offer instead of passing None on.
	        download_urls = [
	            links[label]
	            for should_download, label in wanted_formats
	            if should_download and links.get(label)
	        ]

	        for url in download_urls:
	            result, downloaded = download_file_from_url(book_base_path, url)
	            if not downloaded:
	                print("Skipped: ", result)
	            else:
	                print("Downloaded: ", result)


	if __name__ == '__main__':
	    # Command-line entry point: read a saved HTML page and download its books.
	    parser = argparse.ArgumentParser(
	        description='Download books linked from a saved downloads HTML page')
	    parser.add_argument(
	        'html_file', type=argparse.FileType(),
	        help='HTML file to download books from')
	    parser.add_argument(
	        '-d', '--destination-dir', type=str,
	        help="Directory where books will be saved", default='books')
	    # EPUB stays on by default; --epub is kept so existing invocations work.
	    parser.add_argument(
	        '--epub', action='store_true', default=True,
	        help='Download EPUB files (default)')
	    # Without this switch there was no way to turn the EPUB download off.
	    parser.add_argument(
	        '--no-epub', dest='epub', action='store_false',
	        help='Do not download EPUB files')
	    parser.add_argument('--pdf', action='store_true', help='Download PDF files')
	    parser.add_argument(
	        '--mobi', action='store_true', help='Download MOBI files')

	    args = parser.parse_args()
	    # argparse opened the file for us; close it once it has been read.
	    with args.html_file as html_file:
	        html = html_file.read()

	    download_books(
	        html, args.destination_dir,
	        pdf=args.pdf, epub=args.epub, mobi=args.mobi,
	    )