|
<?php |
|
|
|
// Short alias for the platform directory separator, used when building save paths.
define('DS', DIRECTORY_SEPARATOR);

// Check if in Docker Container
//
// Two signals are consulted:
//  - /.dockerenv: NOTE(review): relies on Docker's conventional marker file in the
//    container root; it is not a documented guarantee - confirm for your runtime.
//  - /proc/1/cgroup containing a ":/docker/" path.
//
// preg_match() returns 1 on a match, 0 on no match and false on error, so the result is
// compared against 1. Comparing against false treats "no match" as a positive detection.
define('CGROUP_FILE', '/proc/1/cgroup');
define(
    'IN_DOCKER',
    file_exists('/.dockerenv')
    || (
        is_readable(CGROUP_FILE)
        && 1 === preg_match('#\:/docker/#', (string) file_get_contents(CGROUP_FILE))
    )
);
|
|
|
// ---------------------------------------------------------------------------
// Configuration defaults. Each of these can be replaced further down by the
// matching command line option (or, for the credentials, environment variable).
// ---------------------------------------------------------------------------

// Parent dir of the ebooks and extras directories
$saveParentDir = __DIR__;

// Path of the ebooks directory relative to $saveParentDir
$ebooksDir = 'ebooks';

// Path of the extras directory relative to $saveParentDir
$extrasDir = implode(DS, ['ebooks', 'extras']);

// Time (seconds) to delay between page requests / different book downloads
$sleepDuration = 4;

// Book details to try requesting from the PacktPub API. This can be max 25
$booksPerListPage = 24;

// Different file types from BOOK_FORMATS_URL you want to download
$fileTypesWanted = [
    'epub',
    'mobi',
    'pdf',
    'code',
    'video',
];

// Whether or not you want the book front cover downloading (if available)
$downloadFrontCover = true;

// Optional download range. When set to a number, $startIndex is the first book
// downloaded and $endIndex the last; false disables the respective bound.
$startIndex = $endIndex = false;

// These are required from the website cookies to correctly authenticate.
// Obtain both using the associated user.js with TamperMonkey, or use the dev-tools console.
$accessToken = ''; // Access Token
$sessionId = '';   // Packt Session
|
|
|
// sprintf() templates for the remote endpoints.

// Paged list of owned books: first %d is the page size (limit), second %d the offset.
const OWNED_BOOKS_URL = 'https://subscription.packtpub.com/api/entitlements/users/me/owned?sort=createdAt:desc&search=&limit=%d&offset=%d';

// Available file types of a product: %d is the product id.
const BOOK_FORMATS_URL = 'https://services.packtpub.com/products-v1/products/%d/types';

// Download details for one file of a product: %d is the product id, %s the file type.
const FILE_DOWNLOAD_DETAILS_URL = 'https://services.packtpub.com/products-v1/products/%d/files/%s';

// Summary of a product: %d is the product id.
const BOOK_SUMMARY_URL = 'https://static.packt-cdn.com/products/%d/summary';
|
|
|
// cURL options shared by every request the script makes.
$defaultCurlOptions = [];

// Hand the response body back from curl_exec() instead of printing it.
$defaultCurlOptions[CURLOPT_RETURNTRANSFER] = true;

// Present a desktop browser user agent.
$defaultCurlOptions[CURLOPT_USERAGENT] = 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36';
|
|
|
// Control sequence written before each progress update to wipe the current output line:
// a carriage return (0x0D) followed by ESC [ 2 K.
// Borrowed from symfony console:
// https://github.com/symfony/console/blob/master/Helper/ProgressIndicator.php#L211
// I might write this properly, and use symfony console someday!
const CLEAR_LINE = "\x0D\x1B[2K";
|
|
|
echo 'Configuration Options:', PHP_EOL;

// Maps each long command line option (in getopt() syntax, where the trailing ':' marks
// a required value) to the name of the configuration variable it overrides.
$cliOptMap = [
    'save-parent-dir:' => 'saveParentDir',
    'ebooks-dir:' => 'ebooksDir',
    'extras-dir:' => 'extrasDir',
    'sleep-duration:' => 'sleepDuration',
    'books-per-list-page:' => 'booksPerListPage',
    'file-types-wanted:' => 'fileTypesWanted',
    'download-front-cover:' => 'downloadFrontCover',
    'start-index:' => 'startIndex',
    'end-index:' => 'endIndex',
];

// getopt() returns false when it cannot parse the arguments; normalise that to an empty
// array so the lookups that follow always operate on an array.
$cliOptValues = getopt('', array_keys($cliOptMap)) ?: [];
|
|
|
// Inside a container the save location is fixed to /mnt, so a user supplied
// --save-parent-dir is discarded (with a notice) before the options are applied.
if (IN_DOCKER) {
    if (!empty($cliOptValues['save-parent-dir'])) {
        echo 'Ignoring --save-parent-dir as running inside docker container. Mount a volume to /mnt instead.', PHP_EOL;
        unset($cliOptValues['save-parent-dir']);
    }
    $saveParentDir = '/mnt';
}

// Accepted ranges for the integer options (passed to FILTER_VALIDATE_INT).
// The upper bound of 25 is the API limit noted next to $booksPerListPage.
$integerOptionRanges = [
    'sleepDuration' => ['min_range' => 0],
    'booksPerListPage' => ['min_range' => 1, 'max_range' => 25],
    'startIndex' => ['min_range' => 1],
    'endIndex' => ['min_range' => 1],
];

// Apply every supplied option to its configuration variable, then print the effective value.
foreach ($cliOptMap as $cliOptName => $varName) {
    $nameToUse = str_replace(':', '', $cliOptName);
    $rawValue = isset($cliOptValues[$nameToUse]) ? $cliOptValues[$nameToUse] : null;

    // getopt() yields an array when an option is repeated. Only fileTypesWanted is a list;
    // for every other option the last occurrence wins.
    if ('fileTypesWanted' !== $varName && is_array($rawValue)) {
        $rawValue = end($rawValue);
    }

    if (null !== $rawValue && '' !== $rawValue && [] !== $rawValue) {
        $valToAssign = $rawValue;
        $isValid = true;

        if ('fileTypesWanted' === $varName) {
            // A single occurrence arrives as a string; always store a list.
            $valToAssign = (array) $rawValue;
        } elseif ('downloadFrontCover' === $varName) {
            // Option values are strings, so "false"/"0"/"no"/"off" must be parsed explicitly;
            // a plain truthiness test would treat the string "false" as true.
            $valToAssign = filter_var($rawValue, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE);
            $isValid = (null !== $valToAssign);
        } elseif (isset($integerOptionRanges[$varName])) {
            $valToAssign = filter_var(
                $rawValue,
                FILTER_VALIDATE_INT,
                ['options' => $integerOptionRanges[$varName]]
            );
            $isValid = (false !== $valToAssign);
        }

        if ($isValid) {
            // Variable variables are acceptable here because $varName only ever comes from
            // the fixed $cliOptMap, never from user input.
            $$varName = $valToAssign;
        } else {
            echo 'Ignoring invalid value for --', $nameToUse, PHP_EOL;
        }
    }

    echo $nameToUse, ' = ', var_export($$varName, true), PHP_EOL;
}
|
|
|
// These are env vars rather than arguments due to their sensitive nature. If using docker run, use --env, -e or --env-file
$envVarMap = [
    'PP_ACCESS_TOKEN' => 'accessToken',
    'PP_SESSION_ID' => 'sessionId',
];

foreach ($envVarMap as $envVarName => $varName) {
    // $_ENV is only populated when the variables_order ini setting includes "E", so fall
    // back to getenv(), which reads the process environment regardless of that setting.
    $envValue = !empty($_ENV[$envVarName]) ? $_ENV[$envVarName] : getenv($envVarName);

    if (!empty($envValue)) {
        // $varName only ever comes from the fixed $envVarMap above.
        $$varName = $envValue;
    }

    // Report only whether a credential is present; the value itself is a secret and must
    // not end up in terminal scrollback or container logs.
    echo $varName, ' = ', (empty($$varName) ? '(not set)' : '(set, value hidden)'), PHP_EOL;
}
|
|
|
/**
 * Prints an error message followed by a newline and terminates the script.
 *
 * The script exits with status 1 so that shells, cron and `docker run` can tell a failed
 * run from a successful one (a bare `die` exits with status 0).
 *
 * @param string $message Message to print before exiting.
 *
 * @return void This function does not return.
 */
function errorAndDie($message)
{
    echo $message, PHP_EOL;
    exit(1);
}
|
|
|
/**
 * Extracts the expiry ("exp" claim) from a JWT access token without verifying it.
 *
 * The token is split on '.', and the second segment (the payload) is decoded. JWT segments
 * use the URL-safe base64 alphabet ('-' and '_' instead of '+' and '/'), which is mapped
 * back before decoding; base64_decode() in its default mode would otherwise silently drop
 * those characters and corrupt the payload.
 *
 * @param string $accessToken The JWT access token.
 *
 * @return mixed The value of the payload's "exp" property, or null when the token is not a
 *               dot separated JWT, the payload is not a JSON object, or it has no "exp".
 */
function getAccessTokenExpiry($accessToken)
{
    $segments = explode('.', (string) $accessToken);
    if (count($segments) < 2 || '' === $segments[1]) {
        return null;
    }

    $payloadJson = base64_decode(strtr($segments[1], '-_', '+/'));
    if (false === $payloadJson) {
        return null;
    }

    $tokenData = json_decode($payloadJson);
    if (!is_object($tokenData) || !isset($tokenData->exp)) {
        return null;
    }

    return $tokenData->exp;
}
|
|
|
/**
 * Formats a byte count as a human readable size using decimal (power of 1000) units.
 *
 * Adapted from https://stackoverflow.com/questions/15188033/human-readable-file-size#answer-23888858
 *
 * The unit is chosen from the number of characters in the string form of $bytes: every
 * three further characters move one unit up. Positive values are printed with two
 * decimals, anything else with none.
 *
 * @param int|string $bytes Byte count.
 *
 * @return string The scaled number, a space and the unit, e.g. "1.50 MB".
 */
function sizeToHuman($bytes)
{
    $units = ['B', 'kB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB'];

    $characterCount = strlen($bytes);
    $unitIndex = floor(($characterCount - 1) / 3);

    $precision = 0;
    if ($bytes > 0) {
        $precision = 2;
    }

    $scaled = $bytes / (1000 ** $unitIndex);

    // The lookup stays error-suppressed: an index outside the unit table yields an empty unit.
    $unit = @$units[$unitIndex];

    return sprintf("%.{$precision}f %s", $scaled, $unit);
}
|
|
|
/**
 * Performs a GET request and returns the response body, terminating the script on failure.
 *
 * When an access token is configured, it is sent as the X-Xsrf-Token header and, together
 * with the session id, as the packt_session / XSRF-TOKEN cookies.
 *
 * @param string $url          URL to request.
 * @param string $errorMessage Message passed to errorAndDie() when the request fails.
 * @param array  $extraOptions Additional cURL options; these take precedence over the defaults.
 *
 * @return string|bool The result of curl_exec() (the body while CURLOPT_RETURNTRANSFER is on).
 */
$getUrl = function ($url, $errorMessage, $extraOptions = []) use ($defaultCurlOptions, &$accessToken, &$sessionId)
{
    if (!empty($accessToken)) {
        if (!isset($extraOptions[CURLOPT_HTTPHEADER])) {
            $extraOptions[CURLOPT_HTTPHEADER] = [];
        }
        $extraOptions[CURLOPT_HTTPHEADER][] = 'X-Xsrf-Token: ' . $accessToken;
        $extraOptions[CURLOPT_COOKIE] = 'packt_session=' . $sessionId . '; XSRF-TOKEN=' . $accessToken;
    }

    $ch = curl_init($url);
    if (false === $ch) {
        errorAndDie($errorMessage . ': could not initialise cURL');
    }

    // With the array union operator the left operand wins on duplicate keys, so the caller's
    // options go first; the other way round a caller could never override a default.
    curl_setopt_array($ch, $extraOptions + $defaultCurlOptions);

    $response = curl_exec($ch);
    $responseCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    if (false === $response) {
        // Transport level failure (DNS, TLS, timeout, ...): the HTTP code alone is usually 0
        // here, so include cURL's own description.
        errorAndDie($errorMessage . ': ' . $responseCode . ' (' . curl_error($ch) . ')');
    }

    if ($responseCode !== 200) {
        errorAndDie($errorMessage . ': ' . $responseCode);
    }

    return $response;
};
|
|
|
/**
 * Fetches a URL through $getUrl and returns the decoded JSON document.
 *
 * A body that decodes to null is treated as a failure and reported through errorAndDie().
 *
 * @param string $url          URL to request.
 * @param string $errorMessage Message used when the request or the decoding fails.
 * @param array  $extraOptions Additional cURL options handed on to $getUrl.
 *
 * @return mixed The value produced by json_decode().
 */
$getJson = function ($url, $errorMessage, $extraOptions = [])
    use ($getUrl, $defaultCurlOptions, &$accessToken, &$sessionId)
{
    $rawBody = $getUrl($url, $errorMessage, $extraOptions);
    $decodedJson = json_decode($rawBody);

    if (null !== $decodedJson) {
        return $decodedJson;
    }

    errorAndDie($errorMessage);

    return $decodedJson;
};
|
|
|
/**
 * Fetches an HTML page through $getUrl and returns the nodes matching an XPath expression.
 *
 * The script is terminated through errorAndDie() when the page is empty, the expression
 * cannot be evaluated, or nothing matches.
 *
 * @param string $url          URL of the page to fetch.
 * @param string $path         XPath expression to evaluate against the parsed document.
 * @param string $errorMessage Message used for every failure.
 * @param array  $extraOptions Additional cURL options handed on to $getUrl.
 *
 * @return DOMNodeList The matching nodes (never empty).
 */
$getHtmlXPath = function ($url, $path, $errorMessage, $extraOptions = [])
    use ($getUrl, $defaultCurlOptions, &$accessToken, &$sessionId)
{
    $response = $getUrl($url, $errorMessage, $extraOptions);

    // DOMDocument::loadHTML() rejects an empty string (a ValueError on PHP 8), so fail with
    // the caller's message instead.
    if (!is_string($response) || '' === trim($response)) {
        errorAndDie($errorMessage);
    }

    // Real-world HTML rarely parses cleanly. Collect libxml's complaints internally rather
    // than silencing the call with "@", then restore the previous setting.
    $previousErrorSetting = libxml_use_internal_errors(true);
    $dd = new DOMDocument();
    $dd->loadHTML($response);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorSetting);

    $domXpath = new DOMXPath($dd);
    $elements = $domXpath->query($path);

    // query() returns false for an expression it cannot evaluate.
    if (false === $elements || !$elements->length) {
        errorAndDie($errorMessage);
    }

    return $elements;
};
|
|
|
/**
 * Downloads a URL to a local file, printing progress while the transfer runs.
 *
 * Output contract: the caller has already printed a "Downloading ... to <path>" label
 * WITHOUT a trailing newline. This closure finishes that line, then rewrites one progress
 * line using CLEAR_LINE (a percentage when the server sent a Content-Length, otherwise the
 * number of bytes received).
 *
 * A response is only written to disk when it carries a positive Content-Length or a
 * Content-Disposition header; anything else aborts the transfer.
 *
 * @param string $url          URL to download (redirects are followed).
 * @param string $savePath     Path of the file to create or overwrite.
 * @param string $errorMessage Message printed, followed by the HTTP code, on failure.
 *
 * @return bool True when the transfer completed with HTTP 200 and a non-empty file;
 *              false otherwise (the incomplete file is removed).
 */
$downloadFile = function ($url, $savePath, $errorMessage) use ($defaultCurlOptions)
{
    $fh = fopen($savePath, 'w+');
    if (false === $fh) {
        echo PHP_EOL, $errorMessage, ': unable to open ', $savePath, ' for writing', PHP_EOL;
        return false;
    }

    $indicatedFilesize = 0;         // Content-Length of the response currently being received
    $contentDispositionSet = false; // whether that response carried a Content-Disposition
    $downloadedFilesize = 0;        // body bytes received so far
    $progressStarted = false;       // becomes true with the first body chunk
    $cursorMidLine = true;          // the caller's label is still waiting for its newline

    $headerCallback = function ($ch, $header) use (&$indicatedFilesize, &$contentDispositionSet) {
        // The callback also sees the headers of intermediate responses when redirects are
        // followed. Every response starts with a status line, so forget what the previous
        // one announced; otherwise a redirect's Content-Length would be taken as the file size.
        if (0 === stripos($header, 'HTTP/')) {
            $indicatedFilesize = 0;
            $contentDispositionSet = false;

            return strlen($header);
        }

        $headerParts = explode(':', $header, 2);
        if (2 === count($headerParts)) {
            $headerValue = trim($headerParts[1]);
            switch (strtolower(trim($headerParts[0]))) {
                case 'content-length':
                    $indicatedFilesize = max(0, intval($headerValue));
                    break;
                case 'content-disposition':
                    if (!empty($headerValue)) {
                        $contentDispositionSet = true;
                    }
                    break;
            }
        }

        return strlen($header);
    };

    $writeCallback = function ($ch, $data) use (
        $fh,
        &$indicatedFilesize,
        &$contentDispositionSet,
        &$downloadedFilesize,
        &$progressStarted,
        &$cursorMidLine
    ) {
        // Neither header present: not treated as a file download. Returning false makes
        // cURL abort the transfer.
        if ($indicatedFilesize <= 0 && !$contentDispositionSet) {
            return false;
        }

        if (!$progressStarted) {
            // Finish the caller's label line, announcing the size when it is known.
            if ($indicatedFilesize > 0) {
                echo ' , filesize: ', sizeToHuman($indicatedFilesize), PHP_EOL;
            } else {
                echo PHP_EOL;
            }
            $progressStarted = true;
        }

        $downloadedFilesize += strlen($data);

        echo CLEAR_LINE;
        $cursorMidLine = false;
        if ($indicatedFilesize > 0) {
            $downloadedFraction = $downloadedFilesize / $indicatedFilesize;
            // Nothing is printed at 100%, which leaves the progress line cleared.
            if ($downloadedFraction < 1) {
                echo 'Downloaded ', number_format($downloadedFraction * 100, 2, '.', ''), '%';
                $cursorMidLine = true;
            }
        } else {
            echo 'Downloaded ', sizeToHuman($downloadedFilesize);
            $cursorMidLine = true;
        }

        return fwrite($fh, $data);
    };

    $ch = curl_init($url);

    // The download specific options go first: with the array union operator the left operand
    // wins on duplicate keys, so listing the defaults first would silently discard the
    // CURLOPT_RETURNTRANSFER override below.
    curl_setopt_array(
        $ch,
        [
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_RETURNTRANSFER => false,
            CURLOPT_FILE => $fh,
            CURLOPT_HEADERFUNCTION => $headerCallback,
            CURLOPT_WRITEFUNCTION => $writeCallback,
        ] + $defaultCurlOptions
    );

    $transferResult = curl_exec($ch);
    $responseCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    fclose($fh);

    // Terminate whichever line is still open (the caller's label or the last progress update)
    // so the next message starts on a fresh line.
    if ($cursorMidLine) {
        echo PHP_EOL;
    }

    clearstatcache(true, $savePath);
    if (false === $transferResult || $responseCode !== 200 || !filesize($savePath)) {
        echo $errorMessage, ': ', $responseCode, PHP_EOL;

        // Do not leave an empty, truncated or error-page file behind looking like a download.
        if (file_exists($savePath)) {
            unlink($savePath);
        }

        return false;
    }

    return true;
};
|
|
|
// Nothing can be requested without the access token.
if (empty($accessToken)) {
    errorAndDie('Missing auth token. Stopping script.');
}

echo 'Sleeping for ' . $sleepDuration . ' seconds' . PHP_EOL;
sleep($sleepDuration);

echo 'Getting list of eBooks' . PHP_EOL;

// $bookCount counts the books seen so far and doubles as the offset of the next list request.
$bookCount = 0;

// The first page is requested up front because its "count" field drives the pagination.
$booksInfo = $getJson(sprintf(OWNED_BOOKS_URL, $booksPerListPage, $bookCount), 'Couldn\'t retrieve list of books');

$totalNumberOfBooks = $booksInfo->count;
$noOfPages = ceil($totalNumberOfBooks / $booksPerListPage);

echo 'Total number of books: ' . $totalNumberOfBooks . ', Total number of pages: ' . $noOfPages . PHP_EOL;
|
|
|
// Main loop: walk the owned-books list page by page and, for every book inside the requested
// index range, download the wanted formats and (optionally) the front cover.

// Normalise the wanted file types once so the per-link check below is case-insensitive.
$wantedFileTypes = array_map('strtolower', (array) $fileTypesWanted);

for ($pageCount = 1; $pageCount <= $noOfPages; $pageCount++) {
    // $booksInfo still holds the first page from the request made before this loop;
    // later pages are requested using the number of books seen so far as the offset.
    if ($pageCount > 1) {
        $booksInfo = $getJson(
            sprintf(OWNED_BOOKS_URL, $booksPerListPage, $bookCount),
            'Couldn\'t retrieve list of books'
        );
    }

    $pageBooks = (isset($booksInfo->data) && is_array($booksInfo->data)) ? $booksInfo->data : [];
    $pageBooksCount = count($pageBooks);

    echo 'Found ', $pageBooksCount, ' books on page ', $pageCount, PHP_EOL;

    if (0 === $pageBooksCount) {
        continue;
    }

    // Create the target directories, including missing parents, and stop early when that
    // is impossible - every download would fail otherwise.
    foreach ([$ebooksDir, $extrasDir] as $relativeDir) {
        $targetDir = $saveParentDir . DS . $relativeDir;
        if (!is_dir($targetDir) && !mkdir($targetDir, 0777, true) && !is_dir($targetDir)) {
            errorAndDie('Couldn\'t create directory ' . $targetDir);
        }
    }

    foreach ($pageBooks as $bookData) {
        $bookCount++;

        if ($startIndex !== false && $bookCount < $startIndex) {
            continue;
        }

        // Checked before any request is made so that no book past the end of the range is
        // ever examined or downloaded.
        if ($endIndex !== false && $bookCount > $endIndex) {
            break 2;
        }

        $name = $bookData->productName;
        echo $bookCount, '. Examining "', $name, '"', PHP_EOL;

        // Build a filesystem friendly name: reserved characters become '-', runs of
        // whitespace become '_', square brackets are dropped.
        $fileName = preg_replace(['/[\<\>\:\"\/\\\|\?\*\%]+/', '/\s+/', '/[\[\]]/'], ['-', '_', ''], $name);

        if (empty($bookData->simplifiedProduct) || empty($bookData->simplifiedProduct->readUrl)) {
            errorAndDie('Couldn\'t find book read URL');
        }

        // The download buttons are scraped from the book's reader page.
        $xpathElements = $getHtmlXPath(
            'https://subscription.packtpub.com' . $bookData->simplifiedProduct->readUrl,
            './/div[contains(@class, "reader-rhs-code-download")]//div[contains(@class, "book")]' .
            '//div//div[contains(@class, "header-buttons")]//div[contains(@class, "header-button")]//a',
            'Couldn\'t get book html',
            [CURLOPT_FOLLOWLOCATION => true]
        );

        // Map of file type (the link's text) => download URL.
        $downloadLinks = [];
        foreach ($xpathElements as $bookLink) {
            if (!empty($bookLink->firstChild) && !empty($bookLink->firstChild->textContent)) {
                $fileType = $bookLink->firstChild->textContent;
                $href = $bookLink->getAttribute('href');

                // A code link pointing at a GitHub page rather than at a .zip is turned
                // into the archive URL of the repository's master branch.
                if ($fileType == 'code'
                    && 0 === strpos($href, 'https://github.com/')
                    && 'zip' !== preg_replace('/^.+\.([^\.\/]+)$/', '$1', $href)
                ) {
                    $href .= '/archive/refs/heads/master.zip';
                }

                $downloadLinks[$fileType] = $href;
            }
        }

        if (0 === count($downloadLinks)) {
            // No early "continue" here: the end-of-range check and the delay between
            // requests further down still have to run for this book.
            echo 'No Downloadable Books / Code', PHP_EOL;
        } else {
            foreach ($downloadLinks as $format => $downloadHref) {
                $format = (string) $format;

                // Honour the configured list of wanted file types.
                if (!in_array(strtolower($format), $wantedFileTypes, true)) {
                    echo 'Skipping ', $format, ' (not in the wanted file types)', PHP_EOL;
                    continue;
                }

                // Code archives go to the extras directory; everything else to the ebooks
                // directory, with videos saved as .zip.
                $savePath = ('code' === $format)
                    ? $saveParentDir . DS . $extrasDir . DS . $fileName . '.zip'
                    : $saveParentDir . DS . $ebooksDir . DS . $fileName . '.' . (('video' === $format) ? 'zip' : $format);

                // No newline here: $downloadFile continues this line with the file size.
                echo 'Downloading ', $format, ' to ', $savePath;

                $downloadFile($downloadHref, $savePath, $format . ' download failed');
            }

            if ($downloadFrontCover && !empty($bookData->simplifiedProduct->coverImage)) {
                $coverImageUrl = $bookData->simplifiedProduct->coverImage;
                $urlDetails = parse_url($coverImageUrl);

                if (false !== $urlDetails && !empty($urlDetails['path'])) {
                    // pathinfo() yields '' when the last path segment has no extension,
                    // rather than handing back the whole path.
                    $fileExt = pathinfo($urlDetails['path'], PATHINFO_EXTENSION);
                    $savePath = $saveParentDir . DS . $extrasDir . DS . $fileName
                        . ('' !== $fileExt ? '.' . $fileExt : '');

                    echo 'Downloading Front Cover to: ', $savePath;

                    $downloadFile(
                        $coverImageUrl,
                        $savePath,
                        'Front cover download failed'
                    );
                }
            }
        }

        // Last book of the range: stop without the trailing delay.
        if ($endIndex !== false && $bookCount >= $endIndex) {
            break 2;
        }

        echo 'Sleeping for ', $sleepDuration, ' seconds', PHP_EOL;
        sleep($sleepDuration);
    }
}
This script no longer works due to changes in the site. I put together a quick hack that shows the new method, which works against their new REST endpoints and is much simpler than parsing the site: https://gist.github.com/nneul/6eda98fd87a58a623b857523247f3471