Skip to content

Instantly share code, notes, and snippets.

@datashaman
Last active July 2, 2025 12:49
Show Gist options
  • Save datashaman/2eeb2ec39f1375c8105caf634616d4da to your computer and use it in GitHub Desktop.
Save datashaman/2eeb2ec39f1375c8105caf634616d4da to your computer and use it in GitHub Desktop.
CLI to get a list of URLs from a sitemap
#!/usr/bin/env python
import sys
import json
import requests
import xml.etree.ElementTree as ET
def fetch_sitemap_urls(url: str, seen=None) -> list[str]:
    """Return every page URL listed by the sitemap at *url*.

    If the document is a sitemap index (it contains ``<sitemap>`` entries),
    each referenced child sitemap is fetched recursively and the results
    are concatenated.  Otherwise the ``<loc>`` of every ``<url>`` entry is
    collected.

    Args:
        url: Address of a sitemap or sitemap index document.
        seen: Set of sitemap URLs already requested; shared across the
            recursion so no sitemap is fetched twice.  Callers normally
            omit it.

    Returns:
        The page URLs found, in document order.  A sitemap that cannot be
        downloaded or parsed contributes no URLs instead of aborting the
        crawl; the problem is reported on stderr.
    """
    if seen is None:
        seen = set()
    # Mark the URL before fetching so that a failing or self-referencing
    # sitemap is never requested a second time.
    seen.add(url)

    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        root = ET.fromstring(resp.content)
    except (requests.RequestException, ET.ParseError) as exc:
        # Diagnostics go to stderr: stdout is reserved for the result so
        # that it stays machine-readable.
        print(f"{url}: {exc}", file=sys.stderr)
        return []

    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    urls: list[str] = []

    # A sitemap index lists other sitemaps rather than pages.
    child_sitemaps = root.findall("sm:sitemap", ns)
    if child_sitemaps:
        for child in child_sitemaps:
            child_url = _loc_text(child, ns)
            if child_url and child_url not in seen:
                urls.extend(fetch_sitemap_urls(child_url, seen))
        return urls

    # Regular sitemap: one <url><loc>…</loc></url> entry per page.
    for entry in root.findall("sm:url", ns):
        page_url = _loc_text(entry, ns)
        if page_url:
            urls.append(page_url)
            # Progress output, kept off stdout (see above).
            print(page_url, file=sys.stderr)
    return urls


def _loc_text(element, ns):
    """Return the stripped text of *element*'s ``<loc>`` child, or None.

    None is returned when the ``<loc>`` child is missing, has no text
    (``<loc/>``), or contains only whitespace.
    """
    loc = element.find("sm:loc", ns)
    if loc is None or loc.text is None:
        return None
    return loc.text.strip() or None
def main() -> int:
    """Command-line entry point: print a sitemap's URLs as a JSON array.

    Reads the sitemap URL from the first command-line argument and writes
    the collected URLs to stdout as indented JSON.

    Returns:
        The process exit status: 0 on success, 2 when the sitemap URL
        argument is missing.
    """
    if len(sys.argv) < 2:
        # Usage errors belong on stderr so stdout stays empty.
        print(f"Usage: {sys.argv[0]} sitemap_url", file=sys.stderr)
        return 2
    print(json.dumps(fetch_sitemap_urls(sys.argv[1]), indent=4))
    return 0


# Run only when executed as a script, so importing this module has no
# side effects (no network access, no sys.exit).
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment