Skip to content

Instantly share code, notes, and snippets.

@moriwaka
Last active May 13, 2026 06:14
Show Gist options
  • Select an option

  • Save moriwaka/55ff6cb8b43655708e99283f6aa886ac to your computer and use it in GitHub Desktop.

Select an option

Save moriwaka/55ff6cb8b43655708e99283f6aa886ac to your computer and use it in GitHub Desktop.
PDF Document Downloader from docs.redhat.com

Usage:

```
$ mkdir RHEL9Doc
$ cd RHEL9Doc
$ fetchdoc.sh https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9
```

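Prerequisites: bash, curl, GNU grep (with -P support), awk, and sed; pdfinfo (poppler-utils) is needed only for --check-updates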

  • 2024-11-21: fix for docs.redhat.com update
  • 2026-05-13: fix for docs.redhat.com update, remove GNU parallel dependency, and more fixes
#!/bin/bash
#
# fetchdoc - Download Red Hat documentation PDFs from docs.redhat.com
# Version: 1.0.0
#
# Scrapes an HTML documentation index page, rewrites each document link to
# its PDF counterpart, and downloads the PDFs into the current directory.

# Strict mode: stop on any unhandled command failure, and treat a pipeline
# as failed when any of its stages fails.
set -eo pipefail

# Defaults; the option parser may override everything except VERSION.
VERSION="1.0.0"
CHECK_UPDATES=0   # 1 = replace a local PDF only when the remote copy is newer
VERBOSE=0         # 1 = print progress messages
PARALLEL_JOBS=10  # maximum number of concurrent downloads
# Print the help text to stdout and terminate the script with status 0.
usage() {
  local prog
  # Resolve the script name once; the help text mentions it four times.
  prog=$(basename "$0") || true
  cat <<EOF
Usage: ${prog} [OPTIONS] <URL>
Download Red Hat documentation PDFs from a documentation index page.
OPTIONS:
-h, --help Show this help message
-v, --verbose Show detailed download progress
-p, --parallel NUM Number of parallel downloads (default: 10)
-u, --check-updates Only download if remote PDF is newer than local
--version Show version information
EXAMPLES:
# Download RHEL 9 documentation
${prog} https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9
# Download with verbose output and 5 parallel jobs
${prog} -v -p 5 https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9
# Only download updated PDFs (checks creation dates)
${prog} -u https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9
EOF
  exit 0
}
# Print the program version to stdout and terminate the script with status 0.
show_version() {
  printf 'fetchdoc version %s\n' "$VERSION"
  exit 0
}
# Echo the arguments to stdout, but only when VERBOSE is 1.
# Always returns 0 so it is safe to call under `set -e`.
log_verbose() {
  [ "$VERBOSE" -eq 1 ] || return 0
  echo "$@"
}
# Print an error message, prefixed with "ERROR: ", to stderr.
# All arguments are joined with single spaces into one message.
log_error() {
  # "$*" (not "$@") because the arguments are embedded in a single string;
  # printf keeps the message literal regardless of its content.
  printf 'ERROR: %s\n' "$*" >&2
}
# Parse options. Stops at the first non-option argument (the URL) or at an
# explicit "--" end-of-options marker; whatever remains is left in "$@".
while [ $# -gt 0 ]; do
  case "$1" in
    -h|--help)
      usage
      ;;
    --version)
      show_version
      ;;
    -v|--verbose)
      VERBOSE=1
      shift
      ;;
    -p|--parallel)
      # Report a missing value explicitly instead of as an invalid number.
      if [ $# -lt 2 ]; then
        log_error "Option $1 requires an argument"
        echo "Use -h or --help for usage information" >&2
        exit 1
      fi
      PARALLEL_JOBS="$2"
      if ! [[ "$PARALLEL_JOBS" =~ ^[0-9]+$ ]] || [ "$PARALLEL_JOBS" -lt 1 ]; then
        log_error "Parallel jobs must be a positive integer"
        exit 1
      fi
      shift 2
      ;;
    -u|--check-updates)
      CHECK_UPDATES=1
      shift
      ;;
    --)
      # Conventional end-of-options marker: consume it and stop parsing.
      shift
      break
      ;;
    -*)
      log_error "Unknown option: $1"
      # The hint is part of the diagnostic, so it goes to stderr too.
      echo "Use -h or --help for usage information" >&2
      exit 1
      ;;
    *)
      break
      ;;
  esac
done
# Check required argument
if [ -z "${1:-}" ]; then
  log_error "Missing required URL argument"
  # The hint is part of the diagnostic, so it goes to stderr.
  echo "Use -h or --help for usage information" >&2
  exit 1
fi
URL="$1"

# Validate URL format: only http:// and https:// URLs are accepted.
if ! [[ "$URL" =~ ^https?:// ]]; then
  log_error "Invalid URL format: $URL"
  exit 1
fi

# Check dependencies (sed is used for URL and PDF metadata parsing).
for cmd in curl grep awk sed; do
  if ! command -v "$cmd" >/dev/null 2>&1; then
    log_error "Required command not found: $cmd"
    exit 1
  fi
done

# Link extraction relies on Perl-compatible regexes (grep -P), which some
# grep builds do not provide; fail early with a clear message.
if ! grep -qP 'a' <<< 'a' 2>/dev/null; then
  log_error "grep with Perl-compatible regex support (-P) is required"
  exit 1
fi

# pdfinfo is only needed when comparing PDF creation dates.
if [ "$CHECK_UPDATES" -eq 1 ]; then
  if ! command -v pdfinfo >/dev/null 2>&1; then
    log_error "pdfinfo is required for --check-updates (install poppler-utils)"
    exit 1
  fi
fi
# Split the URL into its origin (scheme://host) and the documentation path.
url_rest="${URL#*://}"        # host/path...
url_host="${url_rest%%/*}"    # host
BASE_URL="${URL%%://*}://${url_host}"

# Product/version path without leading or trailing slash
# (e.g., en/documentation/red_hat_enterprise_linux/9)
DOC_PATH=""
if [[ "$url_rest" == */* ]]; then
  DOC_PATH="${url_rest#*/}"
  DOC_PATH="${DOC_PATH%/}"
fi
if [ -z "$DOC_PATH" ]; then
  log_error "URL has no documentation path: $URL"
  exit 1
fi

log_verbose "Fetching documentation index from: $URL"
log_verbose "Base URL: $BASE_URL"
log_verbose "Documentation path: $DOC_PATH"
log_verbose "Parallel jobs: $PARALLEL_JOBS"

# Download the index page. The assignment is tested directly: under `set -e`
# a separate `$?` check would never run because the script would already
# have exited on the failed assignment.
if ! INDEX_PAGE=$(curl -sf "$URL"); then
  log_error "Failed to fetch index page: $URL"
  exit 1
fi

# Extract href targets and keep only those that
#   - start with the documentation path (prevents downloading
#     cross-references to other versions), compared literally rather than
#     as a regex so "." in the path cannot match other characters,
#   - contain an /html/ segment, and
#   - have not been seen before (the same document may be linked twice).
# `|| true` keeps an empty result from aborting the script under pipefail;
# the empty case is reported just below.
HTML_LINKS=$(
  grep -oP '(?<=href=")[^"]*' <<< "$INDEX_PAGE" \
    | awk -v prefix="/${DOC_PATH}/" \
        'index($0, prefix) == 1 && index($0, "/html/") > 0 && !seen[$0]++' \
    || true
)
if [ -z "$HTML_LINKS" ]; then
  log_error "No documentation links found at: $URL"
  exit 1
fi

LINK_COUNT=$(grep -c '' <<< "$HTML_LINKS")
log_verbose "Found $LINK_COUNT documentation links"
# Print a PDF's CreationDate metadata as a Unix timestamp.
# Arguments: $1 - path to the PDF file
# Outputs:   epoch seconds on stdout; an empty line when no date is available
# Returns:   1 if the file does not exist or pdfinfo reports no CreationDate,
#            0 otherwise (including when the date string cannot be parsed,
#            in which case only an empty line is printed)
get_pdf_date() {
  local pdf_file="$1"
  local date_str=""

  if [ -f "$pdf_file" ]; then
    # Keep only the value of the CreationDate line, without its label.
    date_str=$(pdfinfo "$pdf_file" 2>/dev/null \
      | sed -n 's/^CreationDate:[[:space:]]*//p') || true
  fi

  # Missing file and missing metadata are reported the same way.
  if [ -z "$date_str" ]; then
    echo ""
    return 1
  fi

  # Convert to epoch seconds for numeric comparison.
  # Input looks like "Wed Mar 11 17:00:08 2026 JST" or similar.
  date -d "$date_str" +%s 2>/dev/null || echo ""
}
# Download the PDF edition of one document into the current directory.
# Intended to run as a background job, one per documentation link.
#
# Globals:   BASE_URL (read), CHECK_UPDATES (read)
# Arguments: $1 - site-relative link to the document's HTML edition; it must
#                 contain an "/html/" path segment
# Outputs:   progress through log_verbose; failures on stderr
# Returns:   0 if the PDF was downloaded, updated, or is already up to date;
#            1 if the link is unusable, the PDF URL cannot be resolved, or
#            the download or final rename fails
download_pdf() {
  local relative_url="$1"
  local page_url url_prefix doc_path pdf_page_url filename
  local page matches pdf_url temp_file
  local local_date="" remote_date=""
  local local_date_str remote_date_str
  local updating=0
  local result="Downloaded"
  local nl=$'\n'

  # Normalise the link so that it ends in exactly one "/index".
  page_url="${BASE_URL}${relative_url}"
  page_url="${page_url%/index}"
  page_url="${page_url%/}/index"

  if [[ "$page_url" != */html/* ]]; then
    echo "✗ Not an HTML documentation link: $relative_url" >&2
    return 1
  fi

  # Swap only the "/html/" path segment for "/pdf/". Replacing the bare
  # substring "html" could hit an earlier part of the URL instead.
  url_prefix="${page_url%%/html/*}"
  doc_path="${page_url#*/html/}"
  pdf_page_url="${url_prefix}/pdf/${doc_path}"

  # The document name is the first path segment after /html/.
  filename="${doc_path%%/*}.pdf"

  # Resolve the actual PDF download URL from the HTML page that /pdf/.../index returns.
  # The site does not serve the PDF directly at this URL; instead it embeds the real
  # .pdf URL in a JavaScript bundle within the HTML response.
  # Each step is guarded so that a failed fetch or an empty match reaches the
  # error message below instead of aborting the job under `set -e`/pipefail.
  page=$(curl -sf "$pdf_page_url") || page=""
  matches=$(grep -oP '"https://docs\.redhat\.com[^"]*\.pdf"' <<< "$page") || matches=""
  pdf_url=${matches%%"$nl"*}   # first match only
  pdf_url=${pdf_url//'"'/}     # drop the surrounding quotes
  if [ -z "$pdf_url" ]; then
    echo "✗ Could not resolve PDF URL from: $pdf_page_url" >&2
    return 1
  fi
  log_verbose "Resolved PDF URL: $pdf_url"

  if [ "$CHECK_UPDATES" -eq 1 ] && [ -f "$filename" ]; then
    updating=1
    log_verbose "Checking for updates: $filename"
  else
    log_verbose "Downloading: $filename"
  fi

  # Always download to a temporary file and rename afterwards, so a failed
  # or interrupted transfer never leaves a truncated PDF under the final
  # name. BASHPID (unlike $$) differs between background jobs.
  temp_file="${filename}.tmp.${BASHPID}"
  if ! curl -sf -o "$temp_file" "$pdf_url"; then
    echo "✗ Failed to download: $filename (from $pdf_url)" >&2
    rm -f -- "$temp_file"
    return 1
  fi

  if [ "$updating" -eq 1 ]; then
    # Compare creation dates of the existing and the freshly fetched PDF.
    local_date=$(get_pdf_date "$filename") || local_date=""
    remote_date=$(get_pdf_date "$temp_file") || remote_date=""
    if [ -z "$local_date" ] || [ -z "$remote_date" ]; then
      # Can't compare dates, so take the freshly downloaded copy.
      log_verbose "⚠ Cannot compare dates, updating: $filename"
      result="Updated"
    elif [ "$remote_date" -gt "$local_date" ]; then
      local_date_str=$(date -d "@$local_date" "+%Y-%m-%d %H:%M:%S") || local_date_str="$local_date"
      remote_date_str=$(date -d "@$remote_date" "+%Y-%m-%d %H:%M:%S") || remote_date_str="$remote_date"
      result="Updated"
      # Appended to the final log line below.
      local_date_str=" (local: $local_date_str, remote: $remote_date_str)"
    else
      # Local is same or newer: discard the download.
      rm -f -- "$temp_file"
      log_verbose "⊝ Up to date: $filename"
      return 0
    fi
  elif command -v pdfinfo >/dev/null 2>&1; then
    # Fresh download: the creation date is only used to set the file mtime.
    remote_date=$(get_pdf_date "$temp_file") || remote_date=""
  fi

  if ! mv -f -- "$temp_file" "$filename"; then
    echo "✗ Failed to save: $filename" >&2
    rm -f -- "$temp_file"
    return 1
  fi

  # Set mtime to the PDF creation date when it is known (best effort).
  if [ -n "$remote_date" ]; then
    touch -d "@$remote_date" "$filename" 2>/dev/null || true
  fi

  if [ -n "$local_date" ] && [ -n "$remote_date" ]; then
    log_verbose "✓ ${result}: ${filename}${local_date_str}"
  else
    log_verbose "✓ ${result}: ${filename}"
  fi
  return 0
}
# Download PDFs with controlled parallelism: at most PARALLEL_JOBS
# download_pdf jobs run at once, and every job's exit status is collected.
declare -a PIDS=()
FAILED=0
while IFS= read -r link; do
  # Nothing to download for an empty line.
  [ -n "$link" ] || continue

  # Wait if we've reached the parallel job limit
  while [ "${#PIDS[@]}" -ge "$PARALLEL_JOBS" ]; do
    for i in "${!PIDS[@]}"; do
      pid="${PIDS[$i]}"
      if ! kill -0 "$pid" 2>/dev/null; then
        # Job has finished: reap it and record a failure. The `||` is
        # required: a bare `wait` returning non-zero would terminate the
        # whole script under `set -e` before the failure could be counted.
        wait "$pid" || FAILED=$((FAILED + 1))
        unset 'PIDS[$i]'
      fi
    done
    # Rebuild array to remove gaps
    PIDS=("${PIDS[@]}")
    # Brief sleep to avoid busy waiting while all slots are still occupied
    if [ "${#PIDS[@]}" -ge "$PARALLEL_JOBS" ]; then
      sleep 0.1
    fi
  done

  # Start download in background
  download_pdf "$link" &
  PIDS+=("$!")
done <<< "$HTML_LINKS"

# Wait for remaining jobs, again counting failures instead of aborting.
for pid in "${PIDS[@]}"; do
  wait "$pid" || FAILED=$((FAILED + 1))
done

if [ "$FAILED" -eq 0 ]; then
  log_verbose "All downloads completed successfully"
else
  log_error "$FAILED download(s) failed"
  exit 1
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment