Created
March 18, 2026 20:02
-
-
Save lucasrangit/b7bf0c47d7e67e7e1f8fe240440e0a14 to your computer and use it in GitHub Desktop.
Script to download Xingbao building block instructions (e.g. for Mirage Tank products) from http://code6.centoooo.cn/xingbao/index.aspx and save them as PDF files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Download Xingbao building block product instructions as PDFs.
#
# Scrapes the product list pages under ${BASE_URL}/xingbao/, then for each
# product downloads its cover image plus all instruction images and converts
# them into a single PDF named after the product title (e.g. XB-13002HT.pdf).
#
# Requires: curl, and either img2pdf (preferred, lossless) or ImageMagick's
# convert (re-encodes the images).
set -euo pipefail

BASE_URL="http://code6.centoooo.cn"
LIST_URL_BASE="${BASE_URL}/xingbao/ProductList-P"

# Pick whichever PDF conversion tool is available.
if command -v img2pdf &> /dev/null; then
  PDF_TOOL="img2pdf"
elif command -v convert &> /dev/null; then
  PDF_TOOL="convert"
else
  echo "Error: Neither 'img2pdf' nor 'convert' found." >&2
  exit 1
fi

# Remove the per-product temp dir on any exit path (including errors).
temp_dir=""
cleanup() { [ -n "$temp_dir" ] && rm -rf -- "$temp_dir"; }
trap cleanup EXIT

# Currently there are 3 pages with a total of 56 products.
for page in {1..3}; do
  html=$(curl -sL "${LIST_URL_BASE}${page}.aspx")
  # Match the 'href' and 'title' attributes from the product list links.
  # '|| true' keeps a page with no matches from aborting the script
  # (grep exits 1 on no match, which would trip set -e / pipefail).
  matches=$(echo "$html" | grep -o 'href="ProductShow-[0-9]*.aspx"[^>]*title="[^"]*"' || true)
  while IFS= read -r match; do
    [ -n "$match" ] || continue
    # Extract relative URL (e.g. ProductShow-1558.aspx)
    rel_url=$(echo "$match" | sed 's/.*href="\([^"]*\)".*/\1/')
    # Extract clean title text (e.g. "XB-13002HT")
    title=$(echo "$match" | sed 's/.*title="\([^"]*\)".*/\1/' | tr -d ' ')
    full_url="${BASE_URL}/xingbao/${rel_url}"
    filename="${title}.pdf"
    echo "Processing ${title} from ${full_url}..."
    prod_html=$(curl -sL "${full_url}")
    # Extract cover image from the detail_pic section (first src only).
    cover_url=$(echo "$prod_html" | sed -n '/class="detail_pic/,/<\/div>/p' | grep -o 'src="[^"]*"' | sed 's/src="//;s/"//' | head -n 1)
    # Extract instruction image URLs from the <div id="cnRemark"> section.
    img_urls=$(echo "$prod_html" | sed -n '/id="cnRemark"/,/<\/div>/p' | grep -o 'src="[^"]*"' | sed 's/src="//;s/"//' || true)
    if [ -z "$cover_url" ]; then
      echo " No cover image found. Skipping."
      continue
    fi
    if [ -z "$img_urls" ]; then
      echo " No instruction images found. Skipping."
      continue
    fi
    # Cover image becomes page 1 of the PDF.
    img_urls="$cover_url $img_urls"
    temp_dir=$(mktemp -d)
    img_list=()
    count=1
    for img_path in $img_urls; do
      # Normalize path to handle ../ or absolute paths
      img_path=$(echo "$img_path" | sed 's|^\.\.||' | sed 's|^\.||')
      [[ "$img_path" != /* ]] && img_path="/$img_path"
      img_url="${BASE_URL}${img_path}"
      # Ensure proper ordering by zero-padding the filename
      img_file="${temp_dir}/img_$(printf "%03d" "$count").jpg"
      echo " Downloading image $count: $img_url"
      curl -sL "$img_url" -o "$img_file"
      img_list+=("$img_file")
      count=$((count + 1))
    done
    echo " Converting $((count - 1)) images to PDF: ${filename}..."
    if [ "$PDF_TOOL" = "img2pdf" ]; then
      img2pdf "${img_list[@]}" -o "$filename"
    else
      convert "${img_list[@]}" "$filename"
    fi
    rm -rf -- "$temp_dir"
    temp_dir=""
    echo " Done: ${filename}"
  done <<< "$matches"
done

exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment