#!/bin/bash
#
# fetchdoc - Download Red Hat documentation PDFs from docs.redhat.com
# Version: 1.0.0
#
# Downloads PDF versions of documentation by scraping HTML documentation
# index pages and converting links to PDF format.

# Abort on unhandled command failures, and make a pipeline fail when any of
# its stages fails (not only the last one).
set -e
set -o pipefail

# Version string printed by --version.
VERSION="1.0.0"
# Maximum number of concurrent downloads (overridden by -p/--parallel).
PARALLEL_JOBS=10
# 1 enables progress messages (set by -v/--verbose).
VERBOSE=0
# 1 enables "only replace when remote is newer" mode (set by -u/--check-updates).
CHECK_UPDATES=0

#######################################
# Print the help text to stdout and terminate the script successfully.
# Globals:   none
# Arguments: none
# Outputs:   usage text on stdout
# Returns:   never returns; exits with status 0
#######################################
usage() {
  # Script name without its directory, as shown in the help text.
  local prog
  prog=$(basename "$0")

  cat << EOF
Usage: ${prog} [OPTIONS] <URL>

Download Red Hat documentation PDFs from a documentation index page.

OPTIONS:
    -h, --help              Show this help message
    -v, --verbose           Show detailed download progress
    -p, --parallel NUM      Number of parallel downloads (default: 10)
    -u, --check-updates     Only download if remote PDF is newer than local
    --version               Show version information

EXAMPLES:
    # Download RHEL 9 documentation
    ${prog} https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9

    # Download with verbose output and 5 parallel jobs
    ${prog} -v -p 5 https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9

    # Only download updated PDFs (checks creation dates)
    ${prog} -u https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9

EOF
  exit 0
}

#######################################
# Print the program version and terminate the script successfully.
# Globals:   VERSION (read)
# Arguments: none
# Outputs:   "fetchdoc version <VERSION>" on stdout
# Returns:   never returns; exits with status 0
#######################################
show_version() {
  printf 'fetchdoc version %s\n' "$VERSION"
  exit 0
}

#######################################
# Print a progress message, but only when verbose mode is enabled.
# Globals:   VERBOSE (read)
# Arguments: message words; joined with single spaces
# Outputs:   the message on stdout when VERBOSE is 1, nothing otherwise
# Returns:   0 in both cases
#######################################
log_verbose() {
  # Keep the `if` form: a `[ ... ] && cmd` one-liner would return non-zero
  # when verbose mode is off and trip `set -e` in the caller.
  if [ "$VERBOSE" -eq 1 ]; then
    # printf rather than echo, so a message starting with -n or -e is
    # printed literally instead of being taken as an echo option.
    printf '%s\n' "$*"
  fi
}

#######################################
# Print an error message prefixed with "ERROR: ".
# Globals:   none
# Arguments: message words; joined with single spaces
# Outputs:   the prefixed message on stderr
# Returns:   0
#######################################
log_error() {
  # "$*" joins the arguments into one string; "$@" inside a larger string
  # mixes string and array expansion (ShellCheck SC2145).
  printf 'ERROR: %s\n' "$*" >&2
}

# Parse options. Parsing stops at the first non-option argument (the URL) or
# at an explicit "--"; whatever remains in "$@" is handled afterwards.
while [ $# -gt 0 ]; do
  case "$1" in
    -h | --help)
      usage
      ;;
    --version)
      show_version
      ;;
    -v | --verbose)
      VERBOSE=1
      shift
      ;;
    -p | --parallel)
      # ${2:-} keeps a missing value from being an unset-variable reference;
      # an empty value is rejected by the numeric check below, before
      # `shift 2` could fail on a too-short argument list.
      PARALLEL_JOBS="${2:-}"
      if ! [[ "$PARALLEL_JOBS" =~ ^[0-9]+$ ]] || [ "$PARALLEL_JOBS" -lt 1 ]; then
        log_error "Parallel jobs must be a positive integer"
        exit 1
      fi
      # Force base 10 so a value such as "010" is stored as 10.
      PARALLEL_JOBS=$((10#$PARALLEL_JOBS))
      shift 2
      ;;
    -u | --check-updates)
      CHECK_UPDATES=1
      shift
      ;;
    --)
      # Conventional end-of-options marker.
      shift
      break
      ;;
    -*)
      log_error "Unknown option: $1"
      # Diagnostics belong on stderr together with the error itself.
      echo "Use -h or --help for usage information" >&2
      exit 1
      ;;
    *)
      break
      ;;
  esac
done

# Check required argument: the first remaining positional parameter is the URL.
if [ -z "${1:-}" ]; then
  log_error "Missing required URL argument"
  # Diagnostics belong on stderr together with the error itself.
  echo "Use -h or --help for usage information" >&2
  exit 1
fi

URL="$1"

# Validate URL format: only http:// and https:// URLs are accepted.
if ! [[ "$URL" =~ ^https?:// ]]; then
  log_error "Invalid URL format: $URL"
  exit 1
fi

# Check dependencies: external tools the script calls unconditionally.
# sed is included because the script uses it to parse URLs and PDF metadata.
for cmd in curl grep awk sed; do
  if ! command -v "$cmd" > /dev/null 2>&1; then
    log_error "Required command not found: $cmd"
    exit 1
  fi
done

# pdfinfo is only mandatory when update checking is enabled, because that
# mode compares PDF creation dates read through pdfinfo.
if [ "$CHECK_UPDATES" -eq 1 ]; then
  if ! command -v pdfinfo > /dev/null 2>&1; then
    log_error "pdfinfo is required for --check-updates (install poppler-utils)"
    exit 1
  fi
fi

# Extract base URL (scheme://host) and documentation path using parameter
# expansion only, so no GNU-specific sed syntax is required.
url_rest="${URL#*://}"
BASE_URL="${URL%%://*}://${url_rest%%/*}"

# Extract the product/version path (e.g., en/documentation/red_hat_enterprise_linux/9),
# without the leading slash and without one trailing slash.
if [[ "$url_rest" == */* ]]; then
  DOC_PATH="${url_rest#*/}"
else
  DOC_PATH=""
fi
DOC_PATH="${DOC_PATH%/}"

log_verbose "Fetching documentation index from: $URL"
log_verbose "Base URL: $BASE_URL"
log_verbose "Documentation path: $DOC_PATH"
log_verbose "Parallel jobs: $PARALLEL_JOBS"

# Download index page. The assignment sits inside the `if` condition because
# under `set -e` a failing standalone assignment would abort the script
# before any error message could be printed.
if ! INDEX_PAGE=$(curl -sf "$URL"); then
  log_error "Failed to fetch index page: $URL"
  exit 1
fi

# Extract href targets, keeping only links that contain /html/ and start with
# the documentation path. This prevents downloading cross-references to other
# versions. The prefix is compared literally (not as a regex, so "." in a
# version number matches only "."), and duplicates are dropped so two jobs
# never write the same output file. `|| true` keeps an empty result from
# aborting the script under `set -e`/pipefail; emptiness is reported below.
HTML_LINKS=$(
  printf '%s\n' "$INDEX_PAGE" \
    | grep -o 'href="[^"]*' \
    | sed 's/^href="//' \
    | DOC_PREFIX="/${DOC_PATH}/" awk '
        BEGIN { prefix = ENVIRON["DOC_PREFIX"] }
        index($0, prefix) == 1 && index($0, "/html/") > 0 && !seen[$0]++
      ' \
    || true
)

if [ -z "$HTML_LINKS" ]; then
  log_error "No documentation links found at: $URL"
  exit 1
fi

# grep -c '' counts lines without the padding some wc implementations add.
LINK_COUNT=$(printf '%s\n' "$HTML_LINKS" | grep -c '')
log_verbose "Found $LINK_COUNT documentation links"

#######################################
# Get a PDF's creation date from its metadata as a Unix timestamp.
# Globals:   none
# Arguments: $1 - path to the PDF file
# Outputs:   the timestamp (seconds since the epoch) on stdout, or an empty
#            line when the date cannot be determined
# Returns:   0 on success; 1 when the file is missing, has no CreationDate
#            entry, or the date string cannot be parsed
#######################################
get_pdf_date() {
  local pdf_file="$1"
  local date_str epoch

  if [ ! -f "$pdf_file" ]; then
    echo ""
    return 1
  fi

  # Extract CreationDate from PDF metadata. Declaration and assignment are
  # separate so `local` does not mask the pipeline status; `|| true` is
  # deliberate, since a failing pdfinfo simply yields an empty string that is
  # handled just below.
  date_str=$(
    pdfinfo "$pdf_file" 2> /dev/null \
      | sed -n 's/^CreationDate:[[:space:]]*//p' \
      | head -n 1
  ) || true

  if [ -z "$date_str" ]; then
    echo ""
    return 1
  fi

  # Convert to Unix timestamp for comparison
  # Format: "Wed Mar 11 17:00:08 2026 JST" or similar
  if ! epoch=$(date -d "$date_str" +%s 2> /dev/null); then
    echo ""
    return 1
  fi
  printf '%s\n' "$epoch"
}

# Set a file's modification time to the given Unix timestamp (best effort).
# Does nothing when the timestamp is empty; touch failures are ignored on
# purpose because the mtime is cosmetic.
_fetchdoc_set_mtime() {
  local target="$1"
  local epoch="$2"
  if [ -n "$epoch" ]; then
    touch -d "@$epoch" "$target" 2> /dev/null || true
  fi
}

#######################################
# Download function for background jobs: fetch the PDF that corresponds to
# one HTML documentation link into the current directory.
# Globals:   BASE_URL (read), CHECK_UPDATES (read)
# Arguments: $1 - site-relative HTML link (contains an /html/ path segment)
# Outputs:   progress via log_verbose; failures on stderr
# Returns:   0 when the PDF was downloaded, updated, already up to date, or
#            the link was skipped; 1 when the PDF URL could not be resolved
#            or a download failed
#######################################
download_pdf() {
  local relative_url="$1"
  local full_url="${BASE_URL}${relative_url}"

  # Remove trailing /index if present, then trailing slash, then add /index
  full_url="${full_url%/index}"
  full_url="${full_url%/}/index"

  # Swap the /html/ path segment for /pdf/. Matching the whole segment keeps
  # an unrelated "html" substring elsewhere in the URL from being rewritten.
  local html_segment="/html/"
  local pdf_segment="/pdf/"
  if [[ "$full_url" != *"$html_segment"* ]]; then
    log_verbose "Skipping link without an /html/ segment: $relative_url"
    return 0
  fi
  local pdf_page_url=${full_url/$html_segment/$pdf_segment}

  # Generate filename from PDF page URL (doc name is the first path segment after /pdf/)
  local filename="${pdf_page_url#*/pdf/}"
  filename="${filename%%/*}.pdf"

  # Resolve the actual PDF download URL from the HTML page that /pdf/.../index now returns.
  # The site no longer serves the PDF directly at this URL; instead it embeds the real
  # .pdf URL in a JavaScript bundle within the HTML response.
  # `|| true` is deliberate: with `set -e` and pipefail a failed curl/grep
  # (or a SIGPIPE caused by head) would otherwise terminate this background
  # job before the error below is printed. An empty result is the signal.
  local pdf_url
  pdf_url=$(
    curl -sf "$pdf_page_url" \
      | grep -o '"https://docs\.redhat\.com[^"]*\.pdf"' \
      | head -n 1 \
      | tr -d '"'
  ) || true
  if [ -z "$pdf_url" ]; then
    echo "✗ Could not resolve PDF URL from: $pdf_page_url" >&2
    return 1
  fi
  log_verbose "Resolved PDF URL: $pdf_url"

  # Check updates mode applies only when a local copy already exists.
  local compare_dates=0
  if [ "$CHECK_UPDATES" -eq 1 ] && [ -f "$filename" ]; then
    compare_dates=1
    log_verbose "Checking for updates: $filename"
  else
    log_verbose "Downloading: $filename"
  fi

  # Always download to a temp file first so a failed or partial transfer
  # never replaces or masquerades as a finished PDF. BASHPID (unlike $$) is
  # the PID of this background job, so the name is unique per job.
  local temp_file="${filename}.tmp.${BASHPID}"
  if ! curl -sf -o "$temp_file" "$pdf_url"; then
    echo "✗ Failed to download: $filename (from $pdf_url)" >&2
    rm -f -- "$temp_file"
    return 1
  fi

  if [ "$compare_dates" -eq 1 ]; then
    # Compare creation dates. get_pdf_date reports failure through empty
    # output, so its exit status is deliberately ignored.
    local local_date remote_date
    local_date=$(get_pdf_date "$filename") || true
    remote_date=$(get_pdf_date "$temp_file") || true

    if [ -z "$local_date" ] || [ -z "$remote_date" ]; then
      # Can't compare dates, replace with the freshly downloaded version
      log_verbose "⚠ Cannot compare dates, updating: $filename"
      mv -- "$temp_file" "$filename" || {
        rm -f -- "$temp_file"
        return 1
      }
      # Set mtime to remote PDF creation date if available
      _fetchdoc_set_mtime "$filename" "$remote_date"
      log_verbose "✓ Updated: $filename"
      return 0
    fi

    if [ "$remote_date" -gt "$local_date" ]; then
      # Remote is newer
      mv -- "$temp_file" "$filename" || {
        rm -f -- "$temp_file"
        return 1
      }
      # Set mtime to PDF creation date
      _fetchdoc_set_mtime "$filename" "$remote_date"
      local local_date_str remote_date_str
      local_date_str=$(date -d "@$local_date" "+%Y-%m-%d %H:%M:%S") || true
      remote_date_str=$(date -d "@$remote_date" "+%Y-%m-%d %H:%M:%S") || true
      log_verbose "✓ Updated: $filename (local: $local_date_str, remote: $remote_date_str)"
    else
      # Local is same or newer
      rm -f -- "$temp_file"
      log_verbose "⊝ Up to date: $filename"
    fi
    return 0
  fi

  # Plain download: move the finished file into place.
  mv -- "$temp_file" "$filename" || {
    rm -f -- "$temp_file"
    return 1
  }

  # Set file mtime to PDF creation date if pdfinfo is available
  if command -v pdfinfo > /dev/null 2>&1; then
    local pdf_date
    pdf_date=$(get_pdf_date "$filename") || true
    _fetchdoc_set_mtime "$filename" "$pdf_date"
  fi
  log_verbose "✓ Downloaded: $filename"
  return 0
}

# Download PDFs with controlled parallelism: at most PARALLEL_JOBS
# download_pdf jobs run in the background at any time.
declare -a PIDS=()
FAILED=0

while IFS= read -r link; do
  # Nothing to download for an empty line.
  [ -n "$link" ] || continue

  # Wait if we've reached the parallel job limit
  while [ "${#PIDS[@]}" -ge "$PARALLEL_JOBS" ]; do
    for i in "${!PIDS[@]}"; do
      pid="${PIDS[$i]}"
      if ! kill -0 "$pid" 2> /dev/null; then
        # Process completed, collect its exit status. `wait` must be the
        # `if` condition: as a bare command under `set -e`, a failed job
        # would abort the whole script here and skip the summary.
        if ! wait "$pid"; then
          FAILED=$((FAILED + 1))
        fi
        unset 'PIDS[$i]'
      fi
    done
    # Rebuild array to remove gaps
    PIDS=("${PIDS[@]}")

    # Brief sleep to avoid busy waiting
    if [ "${#PIDS[@]}" -ge "$PARALLEL_JOBS" ]; then
      sleep 0.1
    fi
  done

  # Start download in background
  download_pdf "$link" &
  PIDS+=("$!")
done <<< "$HTML_LINKS"

# Wait for remaining jobs (same `set -e` consideration as above).
for pid in "${PIDS[@]}"; do
  if ! wait "$pid"; then
    FAILED=$((FAILED + 1))
  fi
done

if [ "$FAILED" -eq 0 ]; then
  log_verbose "All downloads completed successfully"
else
  log_error "$FAILED download(s) failed"
  exit 1
fi