Created
May 31, 2023 20:57
-
-
Save pkdavies/40836b343db5de3f69469f016f055abb to your computer and use it in GitHub Desktop.
Extract URLs from a sitemap.xml
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
from xml.etree import ElementTree as ET

import requests
# Browser User-Agent strings; one is picked at random for every request.
_WELL_KNOWN_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
)


def _local_name(tag):
    """Return an element tag with any ``{namespace}`` prefix removed."""
    return tag.rpartition("}")[2]


def _collect_sitemap_urls(sitemap_url, urls, seen, http, timeout):
    """Fetch one sitemap, append its URLs to *urls*, and follow nested sitemaps."""
    # A sitemap index may (directly or indirectly) reference itself; fetching
    # each sitemap URL at most once keeps the recursion finite.
    if sitemap_url in seen:
        return
    seen.add(sitemap_url)

    # Fetch the sitemap content
    headers = {'User-Agent': random.choice(_WELL_KNOWN_USER_AGENTS)}
    response = http.get(sitemap_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Parse the sitemap XML
    # NOTE(review): the document comes from a remote server; the Python docs
    # warn that xml.etree is not hardened against maliciously constructed XML.
    root = ET.fromstring(response.content)

    # <loc> elements that name a nested sitemap file. root.iter() visits them
    # again after their parent <sitemap>, and they must not be reported as
    # page URLs.
    index_locs = set()
    for elem in root.iter():
        name = _local_name(elem.tag)
        if name == "sitemap":
            loc = elem.find("{*}loc")
            if loc is None:
                continue
            index_locs.add(loc)
            nested_url = (loc.text or "").strip()
            if nested_url:
                _collect_sitemap_urls(nested_url, urls, seen, http, timeout)
        elif name == "loc" and elem not in index_locs:
            # An empty <loc/> has text None; skip it instead of appending None.
            page_url = (elem.text or "").strip()
            if page_url:
                urls.append(page_url)


def extract_urls(sitemap_url, urls=None, *, session=None, timeout=30):
    """Collect every ``<loc>`` URL reachable from a sitemap.

    The document at *sitemap_url* is downloaded and parsed as XML. Each
    ``<sitemap>`` entry (as found in a sitemap index) is followed recursively;
    every other ``<loc>`` element, in any namespace, contributes its
    whitespace-stripped text to the result. The ``<loc>`` of a ``<sitemap>``
    entry is only followed, never added to the result, and empty ``<loc>``
    elements are skipped. Each sitemap URL is fetched at most once per call.

    Args:
        sitemap_url: URL of the sitemap or sitemap index to start from.
        urls: Optional list to append to; a new list is created when omitted.
        session: Optional object with a ``get(url, headers=..., timeout=...)``
            method (for example a ``requests.Session``) used for all fetches.
            When omitted, ``requests.get`` is used.
        timeout: Value passed as ``timeout`` to every ``get`` call, so that a
            stalled server cannot block the crawl indefinitely.

    Returns:
        The list of collected URLs (the same object as *urls* when given).

    Raises:
        requests.HTTPError: from ``raise_for_status()`` on an error response.
        xml.etree.ElementTree.ParseError: if a response is not well-formed XML.
    """
    if urls is None:
        urls = []
    http = requests if session is None else session
    _collect_sitemap_urls(sitemap_url, urls, set(), http, timeout)
    return urls
# Replace with your sitemap URL
sitemap_url = "https://example.com/sitemap.xml"

# Crawl the sitemap (and any nested sitemaps) and gather every URL found.
urls = extract_urls(sitemap_url)

print("Extracted URLs:")
for url in urls:
    # NOTE(review): the loop body was absent in the source text; printing each
    # URL on its own line under the heading is the assumed intent - confirm.
    print(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment