Last active
July 2, 2025 12:49
-
-
Save datashaman/2eeb2ec39f1375c8105caf634616d4da to your computer and use it in GitHub Desktop.
CLI to get a list of URLs from a sitemap
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import json | |
import requests | |
import xml.etree.ElementTree as ET | |
def fetch_sitemap_urls(url: str, seen=None) -> list[str]:
    """Return the page URLs listed by the sitemap at *url*.

    If the document is a sitemap index (it has ``<sitemap>`` children), each
    referenced sitemap is fetched recursively and the results are
    concatenated. Otherwise the ``<loc>`` of every ``<url>`` entry is
    collected. Only elements in the sitemaps.org 0.9 namespace are matched.

    Fetching is best-effort: a sitemap that cannot be downloaded or parsed is
    reported on stderr and contributes no URLs, without aborting the crawl.
    Each collected URL is also echoed to stderr as progress, so stdout stays
    free for the caller's own output.

    Args:
        url: Address of the sitemap or sitemap index to fetch.
        seen: Set of sitemap URLs already requested; used to avoid fetching
            the same sitemap twice. A fresh set is created when omitted. The
            set is mutated in place.

    Returns:
        The collected URLs, in document order. Empty on fetch/parse failure.
    """
    if seen is None:
        seen = set()
    # Mark the URL before requesting it so a sitemap that fails to download
    # or parse is not requested again when another index references it.
    seen.add(url)

    urls: list[str] = []
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        # Parsing is inside the try: servers often answer with HTML error
        # pages, which would otherwise raise ParseError and kill the crawl.
        root = ET.fromstring(resp.content)
    except (requests.RequestException, ET.ParseError) as e:
        print(e, file=sys.stderr)
        return urls

    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}

    # A sitemap index lists other sitemaps instead of pages.
    sitemap_tags = root.findall("sm:sitemap", ns)
    if sitemap_tags:
        for sitemap in sitemap_tags:
            # findtext yields "" for a missing or empty <loc>, so the
            # .strip() below can never hit None.
            child_url = sitemap.findtext("sm:loc", default="", namespaces=ns).strip()
            if child_url and child_url not in seen:
                urls.extend(fetch_sitemap_urls(child_url, seen))
        return urls

    # Regular sitemap: one <url><loc>…</loc></url> entry per page.
    for url_tag in root.findall("sm:url", ns):
        page_url = url_tag.findtext("sm:loc", default="", namespaces=ns).strip()
        if not page_url:
            continue
        urls.append(page_url)
        print(page_url, file=sys.stderr)
    return urls
def main() -> None:
    """Command-line entry point: print a sitemap's URLs as a JSON array.

    Reads the sitemap URL from ``sys.argv[1]``, passes it to
    ``fetch_sitemap_urls`` and writes the resulting list to stdout as JSON
    indented by four spaces. When no argument is given, a usage message is
    written to stderr and the process exits with status -1.
    """
    if len(sys.argv) < 2:
        # Usage goes to stderr so a pipeline reading stdout never mistakes it
        # for output; the exit status is the same as before.
        print(f"Usage: {sys.argv[0]} sitemap_url", file=sys.stderr)
        sys.exit(-1)

    url = sys.argv[1]
    print(json.dumps(fetch_sitemap_urls(url), indent=4))


# Guarded so that importing this module (e.g. to reuse fetch_sitemap_urls)
# does not run the CLI or exit the interpreter.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment