-
-
Save zouyangdev/746e2dc19145bb8569c92965942530ea to your computer and use it in GitHub Desktop.
PacktPub e-books downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup
# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Attach a NullHandler so that using this module without any logging
# configuration does not emit "no handler" warnings; applications that
# want the output configure their own handlers.
log.addHandler(logging.NullHandler())
class PacktBooks:
    """Claim and download e-books from a PacktPub account.

    All HTTP traffic goes through one ``requests.Session`` that is created
    lazily on the first login. Pages are parsed with BeautifulSoup using the
    ``lxml`` parser.
    """

    _URL = {
        'login': 'https://www.packtpub.com/',
        'freebook': 'https://www.packtpub.com/packt/offers/free-learning',
        'mybooks': 'https://www.packtpub.com/account/my-ebooks',
    }
    # Every download type that can be requested through ``book_types``.
    _AVAIL_BOOK_TYPES = {'pdf', 'epub', 'mobi', 'code'}

    def __init__(self, username, password):
        """Store the credentials; no network access happens here.

        Args:
            username: Value sent as the ``email`` field of the login form.
            password: Value sent as the ``password`` field of the login form.
        """
        self._session = None
        # Copy so the instance never aliases the class-level set.
        self._book_types = set(self._AVAIL_BOOK_TYPES)
        self._username = username
        self._password = password
        self._logged_in = False

    def _create_session(self):
        """Create the HTTP session and install browser-like request headers."""
        self._session = requests.Session()
        self._session.headers.update({
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
        })

    @property
    def book_types(self):
        """Comma-separated, alphabetically sorted list of wanted book types."""
        return ','.join(sorted(self._book_types))

    @book_types.setter
    def book_types(self, types):
        """Select the wanted types from a single string or an iterable.

        Values that are not in ``_AVAIL_BOOK_TYPES`` are silently dropped.
        """
        # A bare string must be wrapped, otherwise set() would split it
        # into individual characters.
        wanted = {types} if isinstance(types, str) else set(types)
        self._book_types = wanted & self._AVAIL_BOOK_TYPES
        log.debug('Set wanted book types to %s', self.book_types)

    def login(self):
        """Submit the login form and mark this instance as logged in.

        Raises:
            requests.HTTPError: If fetching the login page or posting the
                form returns an error status.
            AttributeError: If the page has no ``packt-user-login-form``.
        """
        if self._session is None:
            self._create_session()
        payload = {
            'email': self._username,
            'password': self._password,
            'op': 'Login',
        }
        # Fetch the form through the session (not a bare requests.get) so
        # that cookies issued together with the hidden form tokens are sent
        # back with the POST below.
        page = self._session.get(self._URL['login'])
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'lxml')
        # Add hidden form fields (e.g. form_id and form_build_id).
        form = soup.find('form', {'id': 'packt-user-login-form'})
        for elem in form.find_all('input', type='hidden'):
            # Inputs without a name cannot be submitted; a missing value
            # is submitted as an empty string.
            if elem.has_attr('name'):
                payload[elem['name']] = elem.get('value', '')
        # Login
        r = self._session.post(url=self._URL['login'], data=payload)
        r.raise_for_status()
        self._logged_in = True

    def logoff(self):
        """Close and discard the session and mark this instance as logged out."""
        if self._session:
            self._session.close()
            # Drop the session so its cookies are not reused; the next
            # login() creates a fresh one.
            self._session = None
        self._logged_in = False

    @staticmethod
    def _ebook_link_pattern(book_types):
        """Return a regex matching e-book download links for ``book_types``.

        Returns ``None`` when ``book_types`` is empty: an empty alternation
        would otherwise match every ``/ebook_download/`` link.
        """
        if not book_types:
            return None
        return re.compile('^/ebook_download/.*/(?:' + '|'.join(sorted(book_types)) + ')')

    def _get_books(self):
        """Yield one dict per book listed on the "my e-books" page.

        Each dict has a ``title`` and a ``links`` mapping of file type to
        relative download URL. Source code links are stored under ``zip``.

        Raises:
            requests.HTTPError: If the page request returns an error status.
        """
        # Prepare the regular expressions for books and code download links
        re_book_types = self._ebook_link_pattern(self._book_types)
        re_book_code = re.compile('^/code_download/[0-9]+')
        r = self._session.get(self._URL['mybooks'])
        r.raise_for_status()
        soup = BeautifulSoup(r.content, 'lxml')
        book_list = soup.find('div', id='product-account-list')
        for book in book_list.find_all('div', class_='product-line'):
            if not book.has_attr('title'):
                log.debug('Skipping fake book entry')
                continue
            # Drops the last 8 characters; assumes every title ends with
            # an 8-character ebook marker -- TODO confirm against the page.
            title = book['title'][:-8]
            log.debug('Found "%s"', title)
            links = {}
            if re_book_types is not None:
                for link in book.find_all('a', href=re_book_types):
                    # The file type is the last path segment of the link.
                    filetype = link['href'].rsplit('/', maxsplit=1)[1]
                    links[filetype] = link['href']
            if 'code' in self._book_types:
                for link in book.find_all('a', href=re_book_code):
                    links['zip'] = link['href']
            log.debug('%d link(s) found', len(links))
            yield {'title': title, 'links': links}

    def download_books(self, dl_folder,
                       organize_in_folders=True,
                       overwrite=True):
        """Download every wanted file of every owned book into ``dl_folder``.

        Logs in first when needed. Failures for a single file or book are
        logged and do not stop the remaining downloads.

        Args:
            dl_folder: Target directory (string or ``Path``).
            organize_in_folders: Put each book in its own sub-folder named
                after the sanitized title.
            overwrite: When false, files that already exist are skipped.
        """
        if not self._logged_in:
            self.login()
        dl_folder = Path(dl_folder)
        for book in self._get_books():
            try:
                safe_title = self._safe_filename(book['title'])
                if organize_in_folders:
                    book_path = Path(dl_folder, safe_title)
                else:
                    book_path = dl_folder
                # exist_ok: a folder left by a previous run must not make
                # the whole book fail; parents: create dl_folder if needed.
                book_path.mkdir(parents=True, exist_ok=True)
                for filetype, link in book['links'].items():
                    try:
                        filename = '{0}.{1}'.format(safe_title, filetype)
                        file_path = Path(book_path, filename)
                        if not overwrite and file_path.exists():
                            log.debug('File already exists, skipping')
                            continue
                        link_url = 'https://www.packtpub.com{}'.format(link)
                        r = self._session.get(link_url)
                        r.raise_for_status()
                        with open(str(file_path), 'wb') as f:
                            f.write(r.content)
                    except Exception:
                        log.exception('Unable to download %s for %s',
                                      filetype, book['title'])
                log.debug('Downloaded %s', book['title'])
            except Exception:
                log.exception('Unable to save %s', book['title'])

    def claim_book(self):
        """Claim the current free-learning book; failures are only logged."""
        try:
            # Without a login there is no session to issue requests with.
            if not self._logged_in:
                self.login()
            page = self._session.get(self._URL['freebook'])
            page.raise_for_status()
            soup = BeautifulSoup(page.content, 'lxml')
            claim_link = soup.find('a', href=re.compile('^/freelearning-claim/.*'))
            if claim_link is None:
                raise LookupError('No claim link found on the free-learning page')
            claim_url = 'https://www.packtpub.com{}'.format(claim_link['href'])
            r = self._session.get(claim_url)
            r.raise_for_status()
        except Exception:
            log.exception('Unable to claim book')

    @staticmethod
    def _safe_filename(filename):
        """Replace non-alphanumeric characters with ``_`` and trim ``_`` at both ends."""
        return "".join(x if x.isalnum() else "_" for x in filename).strip('_')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment