lucasrangit · March 18, 2026 20:02
diff --git a/download_xingbao_building_block_instructions.sh b/download_xingbao_building_block_instructions.sh
 #!/usr/bin/env bash
 # Download Xingbao building block product instructions.
 #
 # Crawls the paginated product list under BASE_URL and, for every product
 # link found, downloads the cover image plus the instruction images from the
 # product page and merges them (cover first) into "<title>.pdf" in the
 # current directory.
 #
 # Requires: curl, and either img2pdf (preferred) or ImageMagick's convert.
 # Environment:
 #   PAGE_COUNT  number of list pages to crawl (default: 3)
 set -euo pipefail

 BASE_URL="http://code6.centoooo.cn"
 LIST_URL_BASE="${BASE_URL}/xingbao/ProductList-P"
 # Currently there are 3 pages with a total of 56 products.
 PAGE_COUNT="${PAGE_COUNT:-3}"

 if command -v img2pdf &> /dev/null; then
   PDF_TOOL="img2pdf"
 elif command -v convert &> /dev/null; then
   PDF_TOOL="convert"
 else
   echo "Error: Neither 'img2pdf' nor 'convert' found." >&2
   exit 1
 fi

 # Single scratch root for the whole run. The EXIT trap removes it on every
 # exit path, including aborts caused by 'set -e'.
 WORK_DIR=$(mktemp -d)
 trap 'rm -rf -- "$WORK_DIR"' EXIT

 #######################################
 # Print, one per line, every src="..." value found in the HTML on stdin
 # between a line matching $1 and the next line containing </div>.
 # Arguments: $1 - sed address pattern marking the start of the section
 # Outputs:   image references on stdout (nothing when none are found)
 # Returns:   0 even when nothing matches
 #######################################
 extract_srcs() {
   # grep exits 1 on "no match"; under pipefail that would fail the pipeline
   # and abort the script, so a miss is mapped to success here.
   sed -n "/$1/,/<\/div>/p" \
     | { grep -o 'src="[^"]*"' || true; } \
     | sed 's/src="//;s/"//'
 }

 #######################################
 # Turn an image reference taken from a product page into a full URL.
 # Globals:   BASE_URL (read)
 # Arguments: $1 - src attribute value
 # Outputs:   the URL on stdout
 #######################################
 resolve_img_url() {
   local path=$1

   # Already a full URL: use it as-is instead of gluing it onto BASE_URL.
   if [[ "$path" == http://* || "$path" == https://* ]]; then
     printf '%s\n' "$path"
     return 0
   fi

   # Drop leading ./ and ../ segments so the path is rooted at the site root.
   while [[ "$path" == ./* || "$path" == ../* ]]; do
     path=${path#*/}
   done
   if [[ "$path" != /* ]]; then
     path="/${path}"
   fi

   printf '%s\n' "${BASE_URL}${path}"
 }

 #######################################
 # Download all images of one product and merge them into a PDF.
 # Globals:   BASE_URL, PDF_TOOL, WORK_DIR (read)
 # Arguments: $1 - one product-list match: href="ProductShow-N.aspx" ... title="..."
 # Outputs:   progress messages on stdout; writes "<title>.pdf" to the cwd
 # Returns:   0 when the product was converted or skipped; any download or
 #            conversion failure aborts the script via 'set -e'
 #######################################
 process_product() {
   local match=$1
   local href_re='href="([^"]*)"'
   local title_re='title="([^"]*)"$'
   local rel_url title full_url filename prod_html src img_url img_file temp_dir
   local cover_url=""
   local -a img_urls=() img_files=()
   local count=1

   # Extract relative URL (e.g. ProductShow-1558.aspx)
   if ! [[ "$match" =~ $href_re ]]; then
     return 0
   fi
   rel_url=${BASH_REMATCH[1]}

   # Extract clean title text (e.g. "XB-13002HT")
   if ! [[ "$match" =~ $title_re ]]; then
     return 0
   fi
   title=${BASH_REMATCH[1]}
   title=${title// /}
   # The title comes from remote HTML and becomes a file name: keep it inside
   # the current directory and never let it be empty.
   title=${title//\//_}
   if [[ -z "$title" ]]; then
     title=${rel_url%.aspx}
   fi

   full_url="${BASE_URL}/xingbao/${rel_url}"
   filename="${title}.pdf"

   echo "Processing ${title} from ${full_url}..."

   prod_html=$(curl -fsSL "${full_url}")

   # Cover image: first src inside the detail_pic section.
   IFS= read -r cover_url < <(extract_srcs 'class="detail_pic' <<< "$prod_html") || true

   # Instruction images: every src inside the <div id="cnRemark"> section.
   while IFS= read -r src; do
     if [[ -n "$src" ]]; then
       img_urls+=("$src")
     fi
   done < <(extract_srcs 'id="cnRemark"' <<< "$prod_html")

   if [[ -z "$cover_url" ]]; then
     echo "  No cover image found. Skipping."
     return 0
   fi

   if ((${#img_urls[@]} == 0)); then
     echo "  No instruction images found. Skipping."
     return 0
   fi

   # The cover becomes the first page of the PDF.
   img_urls=("$cover_url" "${img_urls[@]}")

   temp_dir=$(mktemp -d "${WORK_DIR}/product.XXXXXX")

   for src in "${img_urls[@]}"; do
     img_url=$(resolve_img_url "$src")
     # Ensure proper ordering by zero-padding the filename
     printf -v img_file '%s/img_%03d.jpg' "$temp_dir" "$count"

     echo "  Downloading image $count: $img_url"
     # -f: fail on HTTP errors instead of saving the error page as an image.
     curl -fsSL "$img_url" -o "$img_file"

     img_files+=("$img_file")
     count=$((count + 1))
   done

   echo "  Converting ${#img_files[@]} images to PDF: ${filename}..."
   # "./" keeps a title that starts with '-' from being parsed as an option.
   if [[ "$PDF_TOOL" == "img2pdf" ]]; then
     img2pdf "${img_files[@]}" -o "./${filename}"
   else
     convert "${img_files[@]}" "./${filename}"
   fi

   rm -rf -- "$temp_dir"
   echo "  Done: ${filename}"
 }

 for ((page = 1; page <= PAGE_COUNT; page++)); do
   html=$(curl -fsSL "${LIST_URL_BASE}${page}.aspx")
   found=0

   # Match the 'href' and 'title' attributes from the product list links.
   # The loop is fed through process substitution so it runs in the main
   # shell; "|| true" keeps an empty page from aborting the run.
   while IFS= read -r match; do
     found=$((found + 1))
     process_product "$match" < /dev/null
   done < <(grep -o 'href="ProductShow-[0-9]*\.aspx"[^>]*title="[^"]*"' <<< "$html" || true)

   if ((found == 0)); then
     echo "Warning: no products found on list page ${page}." >&2
   fi
 done

 exit 0
	#!/usr/bin/env bash
	# Download Xingbao building block product instructions.
	#
	# For each product linked from the list pages, fetches the cover image and
	# the instruction images and merges them (cover first) into "<title>.pdf"
	# in the current directory. Needs curl plus img2pdf or ImageMagick convert.
	set -euo pipefail

	BASE_URL="http://code6.centoooo.cn"
	LIST_URL_BASE="${BASE_URL}/xingbao/ProductList-P"

	if command -v img2pdf &> /dev/null; then
	  PDF_TOOL="img2pdf"
	elif command -v convert &> /dev/null; then
	  PDF_TOOL="convert"
	else
	  echo "Error: Neither 'img2pdf' nor 'convert' found." >&2
	  exit 1
	fi

	# Scratch root shared by all products; removed on every exit path.
	work_root=$(mktemp -d)
	trap 'rm -rf -- "$work_root"' EXIT

	# Currently there are 3 pages with a total of 56 products.
	for page in {1..3}; do
	  html=$(curl -sL "${LIST_URL_BASE}${page}.aspx")

	  # Match the 'href' and 'title' attributes from the product list links.
	  # grep exits 1 when nothing matches; "|| true" stops pipefail + errexit
	  # from turning an empty page into an abort.
	  matches=$(echo "$html" | grep -o 'href="ProductShow-[0-9]*.aspx"[^>]*title="[^"]*"' || true)
	  [ -n "$matches" ] || continue

	  echo "$matches" | while read -r match; do
	    # Extract relative URL (e.g. ProductShow-1558.aspx)
	    rel_url=$(echo "$match" | sed 's/.*href="\([^"]*\)".*/\1/')

	    # Extract clean title text (e.g. "XB-13002HT")
	    title=$(echo "$match" | sed 's/.*title="\([^"]*\)".*/\1/' | tr -d ' ')

	    full_url="${BASE_URL}/xingbao/${rel_url}"
	    filename="${title}.pdf"

	    echo "Processing ${title} from ${full_url}..."

	    prod_html=$(curl -sL "${full_url}")

	    # Extract cover image from the detail_pic section. The grep is guarded
	    # so that "no match" yields an empty value instead of killing the run.
	    cover_url=$(echo "$prod_html" \
	      | sed -n '/class="detail_pic/,/<\/div>/p' \
	      | { grep -o 'src="[^"]*"' || true; } \
	      | sed 's/src="//;s/"//' \
	      | sed -n '1p')

	    # Extract image URLs from within the <div id="cnRemark"> section
	    img_urls=$(echo "$prod_html" \
	      | sed -n '/id="cnRemark"/,/<\/div>/p' \
	      | { grep -o 'src="[^"]*"' || true; } \
	      | sed 's/src="//;s/"//')

	    if [ -z "$cover_url" ]; then
	      echo " No cover image found. Skipping."
	      continue
	    fi

	    if [ -z "$img_urls" ]; then
	      echo " No instruction images found. Skipping."
	      continue
	    fi

	    temp_dir=$(mktemp -d "${work_root}/product.XXXXXX")
	    img_files=()
	    count=1

	    # Cover first, then the instruction images, one reference per line.
	    while IFS= read -r img_path; do
	      [ -n "$img_path" ] || continue

	      # Normalize path to handle ../ or absolute paths
	      img_path=$(echo "$img_path" | sed 's|^\.\.||' | sed 's|^\.||')
	      if [[ "$img_path" != /* ]]; then
	        img_path="/$img_path"
	      fi

	      img_url="${BASE_URL}${img_path}"
	      # Ensure proper ordering by zero-padding the filename
	      img_file="${temp_dir}/img_$(printf "%03d" "$count").jpg"

	      echo " Downloading image $count: $img_url"
	      curl -sL "$img_url" -o "$img_file"

	      img_files+=("$img_file")
	      count=$((count + 1))
	    done <<< "${cover_url}"$'\n'"${img_urls}"

	    echo " Converting $((count - 1)) images to PDF: ${filename}..."
	    if [ "$PDF_TOOL" = "img2pdf" ]; then
	      img2pdf "${img_files[@]}" -o "$filename"
	    else
	      convert "${img_files[@]}" "$filename"
	    fi

	    rm -rf -- "$temp_dir"
	    echo " Done: ${filename}"
	  done
	done

	exit 0
No results found