Script to download Wikipedia articles from a category
#!/bin/bash

# Check if a category name was provided
if [ -z "$1" ]; then
  echo "Usage: $0 <Wikipedia_Category_Name>"
  echo "Example: $0 Physics"
  exit 1
fi

# Wikipedia treats spaces and underscores in titles interchangeably,
# so replace spaces to keep the request URL valid
CATEGORY_NAME=${1// /_}
OUTPUT_DIR="${CATEGORY_NAME}"

# Fail a pipeline if any stage (e.g. curl) fails, so $? below is meaningful
set -o pipefail

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

echo "Fetching page URLs from category: $CATEGORY_NAME"
echo "Saving pages to: $OUTPUT_DIR"

# Fetch titles and page IDs for pages in the category; cmtype=page excludes
# subcategories and files, and the ns == 0 filter keeps only main-namespace
# articles (dropping portals and the like).
# For large categories you may need to handle pagination (cmcontinue parameter,
# see the sketch after the script); this fetches up to 500 pages (cmlimit=500).
curl -s "https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:${CATEGORY_NAME}&format=json&cmlimit=500&cmtype=page" | \
  jq -r '.query.categorymembers[] | select(.ns == 0) | "\(.title)\t\(.pageid)"' | \
  while IFS=$'\t' read -r title pageid; do
    # Create a safe filename from the title:
    # replace spaces with underscores, remove invalid characters
    SAFE_TITLE=$(echo "$title" | sed 's/ /_/g; s/[^a-zA-Z0-9_-]//g')
    FILE_PATH="${OUTPUT_DIR}/${SAFE_TITLE}.txt"

    echo "Downloading page: $title (ID: $pageid)"

    # Download the page content as plain text
    # ('prop=extracts' with 'explaintext=true' strips the wiki markup)
    curl -s "https://en.wikipedia.org/w/api.php?action=query&pageids=${pageid}&prop=extracts&explaintext=true&format=json" | \
      jq -r --arg pid "$pageid" '.query.pages[$pid].extract' > "$FILE_PATH"
    #curl -s "https://wikitext.eluni.co/api/extract?url=https%3A%2F%2Fen.wikipedia.org%2Fwiki?curid=${pageid}&format=text" > "$FILE_PATH"

    if [ $? -eq 0 ]; then
      echo "Saved to: $FILE_PATH"
    else
      echo "Failed to download: $title"
    fi
  done

echo "Download complete for category: $CATEGORY_NAME"