Script to download Wikipedia articles from a category
#!/bin/bash

# Check if a category name was provided
if [ -z "$1" ]; then
  echo "Usage: $0 <Wikipedia_Category_Name>"
  echo "Example: $0 Physics"
  exit 1
fi

# Wikipedia treats spaces and underscores in titles interchangeably,
# so replace spaces to keep the request URL valid
CATEGORY_NAME=${1// /_}
OUTPUT_DIR="${CATEGORY_NAME}"

# Fail a pipeline if any stage (e.g. curl) fails, so $? below is meaningful
set -o pipefail

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

echo "Fetching page URLs from category: $CATEGORY_NAME"
echo "Saving pages to: $OUTPUT_DIR"

# Fetch titles and page IDs for pages in the category; cmtype=page excludes
# subcategories and files, and the ns == 0 filter keeps only main-namespace
# articles (dropping portals and the like).
# For large categories you may need to handle pagination (cmcontinue parameter,
# see the sketch after the script); this fetches up to 500 pages (cmlimit=500).
curl -s "https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:${CATEGORY_NAME}&format=json&cmlimit=500&cmtype=page" | \
  jq -r '.query.categorymembers[] | select(.ns == 0) | "\(.title)\t\(.pageid)"' | \
  while IFS=$'\t' read -r title pageid; do
    # Create a safe filename from the title:
    # replace spaces with underscores, remove invalid characters
    SAFE_TITLE=$(echo "$title" | sed 's/ /_/g; s/[^a-zA-Z0-9_-]//g')
    FILE_PATH="${OUTPUT_DIR}/${SAFE_TITLE}.txt"

    echo "Downloading page: $title (ID: $pageid)"

    # Download the page content as plain text
    # ('prop=extracts' with 'explaintext=true' strips the wiki markup)
    curl -s "https://en.wikipedia.org/w/api.php?action=query&pageids=${pageid}&prop=extracts&explaintext=true&format=json" | \
      jq -r --arg pid "$pageid" '.query.pages[$pid].extract' > "$FILE_PATH"
    #curl -s "https://wikitext.eluni.co/api/extract?url=https%3A%2F%2Fen.wikipedia.org%2Fwiki?curid=${pageid}&format=text" > "$FILE_PATH"

    if [ $? -eq 0 ]; then
      echo "Saved to: $FILE_PATH"
    else
      echo "Failed to download: $title"
    fi
  done

echo "Download complete for category: $CATEGORY_NAME"