Skip to content

Instantly share code, notes, and snippets.

@4383
Created April 16, 2025 15:12
Show Gist options
  • Save 4383/c126e3d67b34ef235e9048174f9f467c to your computer and use it in GitHub Desktop.
Save 4383/c126e3d67b34ef235e9048174f9f467c to your computer and use it in GitHub Desktop.
Download the content of all the OpenStack PTG etherpads in a raw format so that AI agents can process their content and produce automated reports
#!/bin/bash
# Fetches the content of every etherpad listed on https://ptg.opendev.org/etherpads.html
# photonos is used to load the pages because they are built with JavaScript

# --- Built-in defaults ---
DEFAULT_PHOTONOS_BIN='photonos' # Bare command name: resolved through PATH
DEFAULT_OUTPUT_DIR='etherpads_content'

# --- Effective settings ---
PHOTONOS_BIN=$DEFAULT_PHOTONOS_BIN # photonos executable actually invoked
ETHERPADS_URL='https://ptg.opendev.org/etherpads.html'
ETHERPADS_INDEX='etherpads_index.html'
ETHERPADS_LIST='etherpads_list.txt'

# --- Output formats: every one of them is enabled by default ---
ENABLE_HTML=true
ENABLE_SCREENSHOTS=true
ENABLE_TEXT_EXPORT=true
# Print the usage summary on stdout, then terminate the script with status 0.
# Reads: DEFAULT_OUTPUT_DIR and DEFAULT_PHOTONOS_BIN (shown as the defaults).
show_help() {
  cat <<EOF
Usage: $0 [-o|--output-dir DIRECTORY] [-p|--photonos PATH] [-h|--html] [-H|--no-html] [-s|--screenshots] [-n|--no-screenshots] [-t|--text] [-T|--no-text]

Options:
 -o, --output-dir DIRECTORY Specify output directory (default: $DEFAULT_OUTPUT_DIR)
 -p, --photonos PATH Specify path to photonos executable (default: $DEFAULT_PHOTONOS_BIN)
 -h, --html Enable HTML download (default)
 -H, --no-html Disable HTML download
 -s, --screenshots Enable screenshots (default)
 -n, --no-screenshots Disable screenshots
 -t, --text Enable plain text export (default)
 -T, --no-text Disable plain text export
 --help Display this help message
EOF
  exit 0
}
# Process arguments
OUTPUT_DIR="$DEFAULT_OUTPUT_DIR"

# Parse the command-line options given as arguments.
# Globals written: OUTPUT_DIR, PHOTONOS_BIN, ENABLE_HTML, ENABLE_SCREENSHOTS,
#                  ENABLE_TEXT_EXPORT
# Exits: 0 after --help; 1 on an unknown option or when -o/-p lacks a
#        (non-empty) value.
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      -o|--output-dir)
        if [ -z "${2:-}" ]; then
          echo "Error: Option $1 requires an argument."
          exit 1
        fi
        OUTPUT_DIR="$2"
        shift 2
        ;;
      -p|--photonos)
        if [ -z "${2:-}" ]; then
          echo "Error: Option $1 requires an argument."
          exit 1
        fi
        PHOTONOS_BIN="$2"
        shift 2
        ;;
      -h|--html)
        ENABLE_HTML=true
        shift
        ;;
      -H|--no-html)
        ENABLE_HTML=false
        shift
        ;;
      -s|--screenshots)
        ENABLE_SCREENSHOTS=true
        shift
        ;;
      -n|--no-screenshots)
        ENABLE_SCREENSHOTS=false
        shift
        ;;
      -t|--text)
        ENABLE_TEXT_EXPORT=true
        shift
        ;;
      -T|--no-text)
        ENABLE_TEXT_EXPORT=false
        shift
        ;;
      --help)
        show_help
        ;;
      *)
        echo "Unknown option: $1"
        # show_help ends with "exit 0"; run it in a subshell so that only the
        # subshell exits and an invalid invocation is reported with a
        # non-zero status.
        (show_help)
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Refuse to run when HTML, screenshots and text export have all been turned off
if [[ "$ENABLE_HTML" == false && "$ENABLE_SCREENSHOTS" == false && "$ENABLE_TEXT_EXPORT" == false ]]; then
  printf '%s\n' "❌ Error: All output formats are disabled. Enable at least one format (HTML, screenshots or text)."
  exit 1
fi
# Reference to photonos documentation
PHOTONOS_DOC_URL="https://crates.io/crates/photonos"
# Check if photonos is available. It is required whatever output formats are
# selected: besides HTML and screenshot capture, the etherpads index page
# itself is downloaded with photonos.
if ! command -v "$PHOTONOS_BIN" > /dev/null 2>&1; then
  echo "❌ photonos is not found in your PATH or at specified location: $PHOTONOS_BIN"
  echo "Please make sure photonos is installed and in your PATH or specify the correct path using -p option."
  echo "For more information about photonos installation and usage, please refer to the documentation at:"
  echo " $PHOTONOS_DOC_URL"
  exit 1
fi
# Check if curl is installed for text exports.
# When curl is missing, text export is switched off with a warning. If that
# leaves no output format enabled at all, the script stops with an error
# rather than walking through every etherpad without saving anything.
# Globals: ENABLE_TEXT_EXPORT (read/written), ENABLE_HTML and
#          ENABLE_SCREENSHOTS (read)
require_curl_for_text_export() {
  if [ "$ENABLE_TEXT_EXPORT" != true ]; then
    return 0
  fi
  if command -v curl > /dev/null 2>&1; then
    return 0
  fi
  echo "⚠️ curl is not installed. Text export will not be available."
  ENABLE_TEXT_EXPORT=false
  if [ "$ENABLE_HTML" != true ] && [ "$ENABLE_SCREENSHOTS" != true ]; then
    echo "❌ Error: curl is required for text export and no other output format is enabled."
    exit 1
  fi
}

require_curl_for_text_export
# Create output directory if it doesn't exist
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "❌ Failed to create output directory: $OUTPUT_DIR"
  exit 1
fi
echo "📋 Downloading etherpads index page..."
index_file="$OUTPUT_DIR/$ETHERPADS_INDEX"
# Remove any index left over from a previous run, so that a failed download
# cannot be mistaken for a successful one.
rm -f -- "$index_file"
"$PHOTONOS_BIN" "$ETHERPADS_URL" -o "$index_file"
index_status=$?
# The download failed if photonos reported an error or produced no content
if [ "$index_status" -ne 0 ] || [ ! -s "$index_file" ]; then
  echo "❌ Failed to download etherpads index page"
  echo "This might be due to an issue with photonos. For troubleshooting and more information, please refer to:"
  echo " $PHOTONOS_DOC_URL"
  exit 1
fi
echo "🔍 Extracting etherpad URLs..."
# Extract the unique etherpad URLs from the index page.
# A match stops at the first quote, angle bracket or whitespace character, so
# no HTML markup can end up in a URL and no post-processing of the list is
# needed (this also avoids "sed -i", whose syntax differs between GNU and BSD).
grep -o "https://etherpad\.opendev\.org/p/[^\"'<>[:space:]]*" "$OUTPUT_DIR/$ETHERPADS_INDEX" \
  | sort -u > "$OUTPUT_DIR/$ETHERPADS_LIST"
# Count the number of etherpads found; the arithmetic expansion strips the
# padding some wc implementations put around the number.
ETHERPAD_COUNT=$(( $(wc -l < "$OUTPUT_DIR/$ETHERPADS_LIST") ))
echo "🔢 $ETHERPAD_COUNT etherpads found"
if [ "$ETHERPAD_COUNT" -eq 0 ]; then
  echo "❌ No etherpads found in the index page"
  exit 1
fi
# Summarise the effective settings before the downloads start
# toggle_label VALUE: print "Enabled" when VALUE is "true", "Disabled" otherwise
toggle_label() {
  if [ "$1" = true ]; then
    echo "Enabled"
  else
    echo "Disabled"
  fi
}
printf '%s\n' \
  "⚙️ Configuration:" \
  " - Photonos executable: $PHOTONOS_BIN" \
  " - Output directory: $OUTPUT_DIR" \
  " - HTML download: $(toggle_label "$ENABLE_HTML")" \
  " - Screenshots: $(toggle_label "$ENABLE_SCREENSHOTS")" \
  " - Plain text export: $(toggle_label "$ENABLE_TEXT_EXPORT")" \
  "⏳ Downloading content of each etherpad..."
# Function to convert a URL to a safe filename
# Arguments: $1 - the URL
# Outputs:   on stdout, the URL with every "https://" removed and every "/"
#            and ":" replaced by "_"
# Uses shell parameter expansion only, so no external process is spawned per
# URL, and printf so that option-like input (e.g. "-n") is printed verbatim.
sanitize_filename() {
  local name=$1
  local scheme='https://'
  name=${name//"$scheme"/}
  name=${name//\//_}
  name=${name//:/_}
  printf '%s\n' "$name"
}
# Process each etherpad URL
counter=0
failures=0 # Number of photonos/curl downloads that failed
# The list is read through file descriptor 3 so that the commands run inside
# the loop cannot consume the remaining URLs through their standard input.
while read -r etherpad_url <&3; do
  counter=$((counter + 1))
  # Generate a filename based on the URL
  filename=$(sanitize_filename "$etherpad_url")
  echo "[$counter/$ETHERPAD_COUNT] 📥 Downloading $etherpad_url"

  # Download HTML content and/or a screenshot with a single photonos call
  if [ "$ENABLE_HTML" = true ] || [ "$ENABLE_SCREENSHOTS" = true ]; then
    photonos_args=("$etherpad_url")
    if [ "$ENABLE_HTML" = true ]; then
      output_html="$OUTPUT_DIR/${filename}.html"
      photonos_args+=(-o "$output_html")
    else
      # HTML is disabled: discard the page, only the screenshot is kept
      photonos_args+=(-o "/dev/null")
    fi
    if [ "$ENABLE_SCREENSHOTS" = true ]; then
      output_screenshot="$OUTPUT_DIR/${filename}.png"
      photonos_args+=(--screenshot "$output_screenshot")
    fi

    if "$PHOTONOS_BIN" "${photonos_args[@]}"; then
      if [ "$ENABLE_HTML" != true ]; then
        echo " ✅ Screenshot saved to $output_screenshot"
      elif [ "$ENABLE_SCREENSHOTS" = true ]; then
        echo " ✅ HTML saved to $output_html with screenshot"
      else
        echo " ✅ HTML saved to $output_html"
      fi
    else
      failures=$((failures + 1))
      if [ "$ENABLE_HTML" = true ]; then
        echo " ⚠️ Failed to download HTML from $etherpad_url"
      else
        echo " ⚠️ Failed to capture screenshot of $etherpad_url"
      fi
      echo " For troubleshooting photonos issues, please refer to: $PHOTONOS_DOC_URL"
    fi
  fi

  # Download the plain text version of the etherpad with curl
  if [ "$ENABLE_TEXT_EXPORT" = true ]; then
    output_text="$OUTPUT_DIR/${filename}.txt"
    # Strip a trailing slash, if any, to avoid a "//" in the export URL
    text_export_url="${etherpad_url%/}/export/txt"
    echo " 📝 Downloading plain text version with curl..."
    # -f makes curl fail on HTTP errors instead of saving the server's error
    # page as if it were the pad content.
    if curl -L -s -f -o "$output_text" "$text_export_url" && [ -s "$output_text" ]; then
      echo " ✅ Text version saved to $output_text"
    else
      failures=$((failures + 1))
      echo " ⚠️ Failed to download text version from $etherpad_url"
      # Remove empty file in case of failure
      if [ -f "$output_text" ] && [ ! -s "$output_text" ]; then
        rm -- "$output_text"
      fi
    fi
  fi

  # Small pause to avoid overloading the server
  sleep 1
done 3< "$OUTPUT_DIR/$ETHERPADS_LIST"

# Individual failures are not fatal (the exit status is unchanged), but the
# final message must not claim a complete success when some downloads failed.
if [ "$failures" -eq 0 ]; then
  echo "🎉 Download complete! All etherpads have been saved to $OUTPUT_DIR"
else
  echo "⚠️ Download finished with $failures failed download(s), see the warnings above. Output directory: $OUTPUT_DIR"
fi
@4383
Copy link
Author

4383 commented Apr 16, 2025

Using this script requires installing photonos: https://crates.io/crates/photonos/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment