# Source: GitHub Gist jkmartindale/8ca3a42d9ad6fd6849521c6e619fb723
# Author: @jkmartindale
# Created: April 20, 2022 20:07
"""Sitemap redirect checker
Scans a sitemap to list three groups of URLs:
- Insecure redirects (same or different origin)
- Secure external redirects (safe and typically out of scope)
- Regular URLs (same origin, redirects resolved, in scope)
"""
from bs4 import BeautifulSoup
import progressbar
import requests
import sys
from urllib.parse import urljoin, urlparse
# Group labels used as keys of the result buckets.
INSECURE = "insecure"
EXTERNAL = "external"
REGULAR = "regular"

# Seconds to wait for a server before giving up on a request, so a single
# unresponsive host cannot stall the whole scan.
REQUEST_TIMEOUT = 30


def classify_redirect(url, location, sitemap_origin):
    """Decide which report group a sitemap URL belongs to.

    Args:
        url: The URL that was requested.
        location: Value of the response's ``Location`` header, or ``None``
            (or an empty string) when the response was not a redirect.
        sitemap_origin: Network location (host[:port]) of the sitemap.

    Returns:
        A ``(group, value)`` tuple where ``group`` is one of ``INSECURE``,
        ``EXTERNAL`` or ``REGULAR`` and ``value`` is the string to record in
        that group: the requested URL for insecure/external redirects, the
        redirect destination for same-origin redirects, or ``url`` itself
        when there is no redirect.
    """
    if not location:
        return REGULAR, url

    parsed = urlparse(location)

    # Relative URLs are fine but we only care about the destination. A
    # relative Location is resolved against the URL that was requested,
    # not against the sitemap URL.
    if not parsed.netloc:
        return REGULAR, urljoin(url, location)

    # Note insecure redirects. urlparse lower-cases the scheme, so this is
    # an exact, case-insensitive comparison rather than a prefix test.
    if parsed.scheme != "https":
        return INSECURE, url

    # Note redirects off the origin (host names are case-insensitive).
    if parsed.netloc.lower() != sitemap_origin.lower():
        return EXTERNAL, url

    # Secure redirects to the same host should be resolved to the redirect.
    return REGULAR, location


def extract_urls(sitemap_xml):
    """Return the text of every ``<url><loc>`` entry in a sitemap document.

    Entries without a ``<loc>`` child, or with an empty one, are skipped, and
    whitespace surrounding the URL text is removed.
    """
    found = []
    for entry in BeautifulSoup(sitemap_xml, 'xml').find_all('url'):
        loc = entry.find('loc')
        if loc is None:
            continue
        text = loc.text.strip()
        if text:
            found.append(text)
    return found


def scan(session, urls, sitemap_origin, buckets):
    """Request each URL without following redirects and record its group.

    Results are added to ``buckets`` (a dict mapping group label to a set) as
    they are produced, so partial results survive an interruption. URLs that
    cannot be requested are reported on stdout and skipped.
    """
    for url in progressbar.progressbar(urls, redirect_stdout=True):
        try:
            response = session.get(
                url, allow_redirects=False, timeout=REQUEST_TIMEOUT
            )
        except requests.exceptions.MissingSchema:
            print(f"Invalid URL: '{url}'")
            continue
        except requests.exceptions.RequestException as error:
            # One unreachable page should not abort the scan and throw away
            # everything gathered so far.
            print(f"Request failed for '{url}': {error}")
            continue

        group, value = classify_redirect(
            url, response.headers.get('location'), sitemap_origin
        )
        buckets[group].add(value)


def main(argv=None):
    """Run the sitemap redirect checker.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``. The first argument is the sitemap URL.

    Exits with status 1 when no usable sitemap URL is given or the sitemap
    cannot be fetched.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        print("Provide a sitemap URL to examine")
        sys.exit(1)
    if len(argv) > 2:
        print("Ignoring everything after the first argument")

    sitemap_url = argv[1]
    sitemap_origin = urlparse(sitemap_url).netloc
    if not sitemap_origin:
        print("Provide a valid absolute URL for the sitemap")
        sys.exit(1)

    # WAF bypass: requests are sent with an empty User-Agent header.
    session = requests.Session()
    session.headers.update({
        'User-Agent': ''
    })

    try:
        sitemap = session.get(sitemap_url, timeout=REQUEST_TIMEOUT).text
    except requests.exceptions.RequestException as error:
        print(f"Could not fetch sitemap '{sitemap_url}': {error}")
        sys.exit(1)

    buckets = {INSECURE: set(), EXTERNAL: set(), REGULAR: set()}

    # If early termination, print the data already gathered
    try:
        scan(session, extract_urls(sitemap), sitemap_origin, buckets)
    except KeyboardInterrupt:
        pass

    print("\nInsecure redirects:", *sorted(buckets[INSECURE]), sep="\n")
    print("\nSecure external redirects:", *sorted(buckets[EXTERNAL]), sep="\n")
    print("\nRegular URLs:", *sorted(buckets[REGULAR]), sep="\n")


if __name__ == "__main__":
    main()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment