Created
April 20, 2022 20:07
-
-
Save jkmartindale/8ca3a42d9ad6fd6849521c6e619fb723 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Sitemap redirect checker | |
Scans a sitemap to list three groups of URLs:
- Insecure redirects (same or different origin)
- Secure external redirects (safe and typically out of scope)
- Regular URLs (same origin, redirects resolved, in scope)
"""
import sys
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
import progressbar
import requests
# Report groups returned by classify_redirect().
INSECURE = "insecure"
SECURE_EXTERNAL = "secure_external"
REGULAR = "regular"


def classify_redirect(url, redirect, sitemap_origin):
    """Decide which report group a sitemap URL belongs to.

    Args:
        url: The URL that was requested (taken from the sitemap).
        redirect: Value of the response's ``Location`` header, or ``None`` /
            empty when the response carried no such header.
        sitemap_origin: ``netloc`` of the sitemap URL; secure redirects that
            stay on this host are treated as regular, in-scope URLs.

    Returns:
        A ``(group, value)`` tuple. ``group`` is one of ``INSECURE``,
        ``SECURE_EXTERNAL`` or ``REGULAR``. ``value`` is the URL to record:
        the requested ``url`` for the two redirect groups, and the resolved
        destination (or ``url`` itself when there is no redirect) for
        regular URLs.
    """
    if not redirect:
        return REGULAR, url

    # A Location header may be a relative reference; resolve it against the
    # URL that was actually requested rather than against the sitemap URL.
    target = urljoin(url, redirect)

    # Relative URLs are fine but we only care about the destination
    if not urlparse(redirect).netloc:
        return REGULAR, target

    parsed_target = urlparse(target)
    # Compare the parsed scheme instead of a string prefix: urlparse
    # lower-cases the scheme, and a scheme-less "//host/path" redirect
    # inherits the scheme of the requesting URL through urljoin above.
    if parsed_target.scheme != "https":
        return INSECURE, url
    # Note redirects off the origin
    if parsed_target.netloc != sitemap_origin:
        return SECURE_EXTERNAL, url
    # Secure redirects to the same host should be resolved to the redirect
    return REGULAR, target


def main(argv=None):
    """Scan the sitemap named on the command line and print the URL groups.

    Args:
        argv: Argument list in ``sys.argv`` form (program name first).
            Defaults to ``sys.argv``.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("Provide a sitemap URL to examine")
        return
    if len(args) > 2:
        print("Ignoring everything after the first argument")

    sitemap_url = args[1]
    sitemap_origin = urlparse(sitemap_url).netloc
    if not sitemap_origin:
        print("Provide a valid absolute URL for the sitemap")
        return

    # WAF bypass: send an empty User-Agent instead of the library default
    session = requests.Session()
    session.headers.update({
        'User-Agent': ''
    })

    groups = {INSECURE: set(), SECURE_EXTERNAL: set(), REGULAR: set()}

    sitemap = session.get(sitemap_url).text
    # Skip <url> entries that have no <loc> child and trim whitespace that
    # surrounds the URL text inside the element.
    urls = [
        tag.loc.text.strip()
        for tag in BeautifulSoup(sitemap, 'xml')('url')
        if tag.loc is not None
    ]

    # Errors requests raises for a malformed URL before any network I/O.
    invalid_url_errors = (
        requests.exceptions.MissingSchema,
        requests.exceptions.InvalidSchema,
        requests.exceptions.InvalidURL,
    )

    try:
        for url in progressbar.progressbar(urls, redirect_stdout=True):
            try:
                # Redirects are not followed so the Location header of the
                # first response can be inspected directly.
                response = session.get(url, allow_redirects=False)
            except invalid_url_errors:
                print(f"Invalid URL: '{url}'")
                continue
            group, value = classify_redirect(
                url, response.headers.get('location'), sitemap_origin
            )
            groups[group].add(value)
    # If early termination, print the data already gathered
    except KeyboardInterrupt:
        pass

    print("\nInsecure redirects:", *sorted(groups[INSECURE]), sep="\n")
    print("\nSecure external redirects:", *sorted(groups[SECURE_EXTERNAL]), sep="\n")
    print("\nRegular URLs:", *sorted(groups[REGULAR]), sep="\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment