Created
April 16, 2025 15:12
-
-
Save 4383/c126e3d67b34ef235e9048174f9f467c to your computer and use it in GitHub Desktop.
Download the content of all the OpenStack PTG etherpads in a raw format so that AI agents can process it and generate automated reports
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Script to download the content of all etherpads listed on
# https://ptg.opendev.org/etherpads.html
# This script uses photonos because the pages are built with JavaScript.

# --- Default configuration ---------------------------------------------------
# The DEFAULT_* values are kept separate from the effective settings so the
# help text can still display them after command-line options override the
# effective ones.
DEFAULT_PHOTONOS_BIN="photonos"         # Assumes photonos is reachable via PATH
PHOTONOS_BIN="$DEFAULT_PHOTONOS_BIN"    # Effective photonos executable (-p)
DEFAULT_OUTPUT_DIR="etherpads_content"  # Output directory used when -o is absent

# File names below are relative to the output directory.
ETHERPADS_INDEX="etherpads_index.html"  # Saved copy of the index page
ETHERPADS_LIST="etherpads_list.txt"     # One etherpad URL per line
ETHERPADS_URL="https://ptg.opendev.org/etherpads.html"

# Output formats; each one holds the literal string "true" or "false".
ENABLE_HTML=true         # By default, HTML download is enabled
ENABLE_SCREENSHOTS=true  # By default, screenshots are enabled
ENABLE_TEXT_EXPORT=true  # By default, plain text export is enabled
#######################################
# Display help and terminate the script.
# Globals:   DEFAULT_OUTPUT_DIR, DEFAULT_PHOTONOS_BIN (read)
# Arguments: $1 - optional exit status to terminate with (default: 0)
# Outputs:   the usage text on stdout
# Returns:   never returns; always exits the current (sub)shell
#######################################
show_help() {
  # One printf call with one argument per output line; the empty argument
  # produces the blank line between the usage synopsis and the option list.
  printf '%s\n' \
    "Usage: $0 [-o|--output-dir DIRECTORY] [-p|--photonos PATH] [-h|--html] [-H|--no-html] [-s|--screenshots] [-n|--no-screenshots] [-t|--text] [-T|--no-text]" \
    "" \
    "Options:" \
    " -o, --output-dir DIRECTORY Specify output directory (default: $DEFAULT_OUTPUT_DIR)" \
    " -p, --photonos PATH Specify path to photonos executable (default: $DEFAULT_PHOTONOS_BIN)" \
    " -h, --html Enable HTML download (default)" \
    " -H, --no-html Disable HTML download" \
    " -s, --screenshots Enable screenshots (default)" \
    " -n, --no-screenshots Disable screenshots" \
    " -t, --text Enable plain text export (default)" \
    " -T, --no-text Disable plain text export" \
    " --help Display this help message"
  # Callers that pass no argument keep the historical "exit 0" behavior.
  exit "${1:-0}"
}
# Process arguments
OUTPUT_DIR="$DEFAULT_OUTPUT_DIR"

#######################################
# Parse command-line options into the global configuration variables.
# Globals:   OUTPUT_DIR, PHOTONOS_BIN, ENABLE_HTML, ENABLE_SCREENSHOTS,
#            ENABLE_TEXT_EXPORT (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   error messages (and the usage text for unknown options) on stderr
# Returns:   0 when all options were accepted; exits 1 on a missing option
#            argument or an unknown option; --help exits through show_help
#######################################
parse_args() {
  while [ "$#" -gt 0 ]; do
    case "$1" in
      -o|--output-dir|-p|--photonos)
        # Both options take a mandatory, non-empty value.
        if [ "$#" -lt 2 ] || [ -z "$2" ]; then
          echo "Error: Option $1 requires an argument." >&2
          exit 1
        fi
        case "$1" in
          -o|--output-dir) OUTPUT_DIR="$2" ;;
          *) PHOTONOS_BIN="$2" ;;
        esac
        shift 2
        ;;
      -h|--html)
        ENABLE_HTML=true
        shift
        ;;
      -H|--no-html)
        ENABLE_HTML=false
        shift
        ;;
      -s|--screenshots)
        ENABLE_SCREENSHOTS=true
        shift
        ;;
      -n|--no-screenshots)
        ENABLE_SCREENSHOTS=false
        shift
        ;;
      -t|--text)
        ENABLE_TEXT_EXPORT=true
        shift
        ;;
      -T|--no-text)
        ENABLE_TEXT_EXPORT=false
        shift
        ;;
      --help)
        show_help
        ;;
      *)
        echo "Unknown option: $1" >&2
        # show_help terminates the shell it runs in with status 0. Run it in
        # a subshell so the usage text is still printed, but the script
        # itself reports the invalid invocation with a non-zero status.
        (show_help) >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# --- Preflight checks --------------------------------------------------------
# Reference to photonos documentation
PHOTONOS_DOC_URL="https://crates.io/crates/photonos"

# Check if curl is installed for text exports. This runs before the
# "at least one format" check because it may switch text export off.
if [ "$ENABLE_TEXT_EXPORT" = true ] && ! command -v curl > /dev/null 2>&1; then
  echo "⚠️ curl is not installed. Text export will not be available." >&2
  ENABLE_TEXT_EXPORT=false
fi

# Check if at least one format is enabled (after the curl fallback above, so
# a run that would produce no output at all is rejected up front).
if [ "$ENABLE_HTML" = false ] && [ "$ENABLE_SCREENSHOTS" = false ] && [ "$ENABLE_TEXT_EXPORT" = false ]; then
  echo "❌ Error: All output formats are disabled. Enable at least one format (HTML, screenshots or text)." >&2
  exit 1
fi

# Check if photonos is available. It is required whatever formats are
# enabled, because the etherpads index page itself is fetched with photonos.
if ! command -v "$PHOTONOS_BIN" > /dev/null 2>&1; then
  echo "❌ photonos is not found in your PATH or at specified location: $PHOTONOS_BIN" >&2
  echo "Please make sure photonos is installed and in your PATH or specify the correct path using -p option." >&2
  echo "For more information about photonos installation and usage, please refer to the documentation at:" >&2
  echo " $PHOTONOS_DOC_URL" >&2
  exit 1
fi
# Create output directory if it doesn't exist
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "❌ Failed to create output directory: $OUTPUT_DIR" >&2
  exit 1
fi

echo "📋 Downloading etherpads index page..."
# Drop any index left behind by a previous run: otherwise a failed download
# would go unnoticed and the stale file would be parsed instead.
rm -f -- "$OUTPUT_DIR/$ETHERPADS_INDEX"

# The download counts as successful only if photonos exits with status 0 AND
# it actually produced a non-empty file.
index_ok=true
"$PHOTONOS_BIN" "$ETHERPADS_URL" -o "$OUTPUT_DIR/$ETHERPADS_INDEX" || index_ok=false
[ -s "$OUTPUT_DIR/$ETHERPADS_INDEX" ] || index_ok=false

if [ "$index_ok" = false ]; then
  echo "❌ Failed to download etherpads index page" >&2
  echo "This might be due to an issue with photonos. For troubleshooting and more information, please refer to:" >&2
  echo " $PHOTONOS_DOC_URL" >&2
  exit 1
fi
echo "🔍 Extracting etherpad URLs..."
# Extract all etherpad URLs from the saved index page, one per line, sorted
# and de-duplicated. A URL ends at the first quote, angle bracket or
# whitespace character, so no HTML markup can leak into the list and no
# post-processing pass over the file is needed.
grep -o "https://etherpad\.opendev\.org/p/[^\"'<>[:space:]]*" "$OUTPUT_DIR/$ETHERPADS_INDEX" \
  | sort -u > "$OUTPUT_DIR/$ETHERPADS_LIST"

# Count the number of etherpads found. Some wc implementations pad their
# output with blanks; the arithmetic expansion normalizes it to a bare
# integer (and to 0 if wc printed nothing).
ETHERPAD_COUNT=$(wc -l < "$OUTPUT_DIR/$ETHERPADS_LIST")
ETHERPAD_COUNT=$((ETHERPAD_COUNT))
echo "🔢 $ETHERPAD_COUNT etherpads found"

if [ "$ETHERPAD_COUNT" -eq 0 ]; then
  echo "❌ No etherpads found in the index page" >&2
  exit 1
fi
#######################################
# Translate an ENABLE_* flag into a human-readable label.
# Arguments: $1 - flag value
# Outputs:   "Enabled" when $1 is exactly "true", "Disabled" otherwise
#######################################
status_label() {
  if [ "$1" = true ]; then
    echo "Enabled"
  else
    echo "Disabled"
  fi
}

# Display configuration
echo "⚙️ Configuration:"
echo " - Photonos executable: $PHOTONOS_BIN"
echo " - Output directory: $OUTPUT_DIR"
echo " - HTML download: $(status_label "$ENABLE_HTML")"
echo " - Screenshots: $(status_label "$ENABLE_SCREENSHOTS")"
echo " - Plain text export: $(status_label "$ENABLE_TEXT_EXPORT")"
echo "⏳ Downloading content of each etherpad..."
#######################################
# Convert a URL to a safe filename.
# Every "https://" occurrence is removed, then each "/" and ":" is replaced
# by "_". Done with parameter expansion only, so no external process is
# spawned and arguments that look like echo options (e.g. "-n") are
# reproduced verbatim.
# Arguments: $1 - URL to convert
# Outputs:   the resulting filename on stdout, followed by a newline
#######################################
sanitize_filename() {
  local scheme='https://'
  local name=$1

  name=${name//"$scheme"/}  # quoted pattern: matched literally
  name=${name//\//_}
  name=${name//:/_}
  printf '%s\n' "$name"
}
#######################################
# Render one etherpad with photonos, saving its HTML and/or a screenshot.
# Globals:   PHOTONOS_BIN, PHOTONOS_DOC_URL, OUTPUT_DIR, ENABLE_HTML,
#            ENABLE_SCREENSHOTS (read)
# Arguments: $1 - etherpad URL
#            $2 - base filename (without extension) inside OUTPUT_DIR
# Outputs:   a one-line success or failure report on stdout
# Returns:   0 on success or when neither format is enabled,
#            1 when photonos exits with a non-zero status
#######################################
fetch_rendered_pad() {
  local url=$1 base=$2
  local html_target shot_target ok_msg fail_msg
  local -a photonos_args

  if [ "$ENABLE_HTML" != true ] && [ "$ENABLE_SCREENSHOTS" != true ]; then
    return 0
  fi

  if [ "$ENABLE_HTML" = true ]; then
    html_target="$OUTPUT_DIR/${base}.html"
    ok_msg=" ✅ HTML saved to $html_target"
    fail_msg=" ⚠️ Failed to download HTML from $url"
  else
    # Screenshot only: the HTML output is discarded.
    html_target="/dev/null"
    fail_msg=" ⚠️ Failed to capture screenshot of $url"
  fi
  photonos_args=("$url" -o "$html_target")

  if [ "$ENABLE_SCREENSHOTS" = true ]; then
    shot_target="$OUTPUT_DIR/${base}.png"
    photonos_args+=(--screenshot "$shot_target")
    if [ "$ENABLE_HTML" = true ]; then
      ok_msg="$ok_msg with screenshot"
    else
      ok_msg=" ✅ Screenshot saved to $shot_target"
    fi
  fi

  if "$PHOTONOS_BIN" "${photonos_args[@]}"; then
    echo "$ok_msg"
    return 0
  fi
  echo "$fail_msg"
  echo " For troubleshooting photonos issues, please refer to: $PHOTONOS_DOC_URL"
  return 1
}

#######################################
# Download the plain text export of one etherpad with curl.
# Globals:   OUTPUT_DIR (read)
# Arguments: $1 - etherpad URL
#            $2 - base filename (without extension) inside OUTPUT_DIR
# Outputs:   progress and a success or failure report on stdout
# Returns:   0 when a non-empty text file was saved, 1 otherwise
#######################################
fetch_text_export() {
  local url=$1 base=$2
  local text_target="$OUTPUT_DIR/${base}.txt"

  echo " 📝 Downloading plain text version with curl..."
  # --fail makes curl exit non-zero on HTTP errors (404, 429, 5xx, ...)
  # instead of saving the server's error page as if it were the pad content.
  if curl --fail -L -s -o "$text_target" "${url}/export/txt" && [ -s "$text_target" ]; then
    echo " ✅ Text version saved to $text_target"
    return 0
  fi

  echo " ⚠️ Failed to download text version from $url"
  # Remove empty file in case of failure
  if [ -f "$text_target" ] && [ ! -s "$text_target" ]; then
    rm -- "$text_target"
  fi
  return 1
}

# Process each etherpad URL.
# The list is read on file descriptor 3 rather than stdin so that the
# commands started inside the loop cannot consume part of it. The extra
# "|| [ -n ... ]" keeps a last line that lacks a trailing newline.
counter=0
failures=0
while IFS= read -r -u 3 etherpad_url || [ -n "$etherpad_url" ]; do
  # Ignore blank lines
  [ -n "$etherpad_url" ] || continue
  counter=$((counter + 1))

  # Generate a filename based on the URL
  filename=$(sanitize_filename "$etherpad_url")
  echo "[$counter/$ETHERPAD_COUNT] 📥 Downloading $etherpad_url"

  # A failure is recorded but never aborts the run: the remaining formats
  # and the remaining etherpads are still attempted.
  pad_failed=false
  fetch_rendered_pad "$etherpad_url" "$filename" || pad_failed=true
  if [ "$ENABLE_TEXT_EXPORT" = true ]; then
    fetch_text_export "$etherpad_url" "$filename" || pad_failed=true
  fi
  if [ "$pad_failed" = true ]; then
    failures=$((failures + 1))
  fi

  # Small pause to avoid overloading the server
  sleep 1
done 3< "$OUTPUT_DIR/$ETHERPADS_LIST"

# Only claim that everything was saved when nothing failed.
if [ "$failures" -eq 0 ]; then
  echo "🎉 Download complete! All etherpads have been saved to $OUTPUT_DIR"
else
  echo "⚠️ Download finished with errors: $failures of $counter etherpads had at least one failed download (see the messages above). Output directory: $OUTPUT_DIR" >&2
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Using this script requires installing photonos: https://crates.io/crates/photonos/