Skip to content

Instantly share code, notes, and snippets.

@pschichtel
Created May 26, 2017 16:14
Show Gist options
  • Save pschichtel/2216308fe22a580dbd90ce8114512095 to your computer and use it in GitHub Desktop.
Save pschichtel/2216308fe22a580dbd90ce8114512095 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
from __future__ import print_function
import sys, re, math
from requests import Session
# Present a desktop Firefox User-Agent on every request made by the session.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:48.0) Gecko/20100101 Firefox/48.0',
}

# Two positional arguments are required: the base URL and the first path.
if len(sys.argv) < 3:
    print("Usage: %s <url> <initial path>" % sys.argv[0])
    sys.exit(1)

base_url, initial_path = sys.argv[1:3]

# One shared session so cookies and headers persist across every hop.
s = Session()
s.verify = True
s.headers.update(headers)
def resolve_target(base, target):
    """Turn a redirect target into an absolute URL.

    Args:
        base: Scheme-and-host prefix (e.g. ``http://example.com``) that
            root-relative targets are appended to.
        target: The redirect target as sent by the server.

    Returns:
        ``target`` unchanged when it is already an http(s) URL, the target
        with the scheme of ``base`` when it is scheme-relative
        (``//host/path``), or ``base + target`` when it starts with a
        single ``/``.

    Raises:
        SystemExit: For any other (path-relative) target, which cannot be
            resolved here because the URL of the redirecting page is not
            passed in.
    """
    try:
        from urllib.parse import urljoin  # Python 3
    except ImportError:
        from urlparse import urljoin  # Python 2

    # URL schemes are case-insensitive, so compare on a lowered copy.
    if target.lower().startswith(('http:', 'https:')):
        return target
    # Must be checked before the single-slash case: '//host/path' names a
    # different host and only inherits the scheme from base.
    if target.startswith('//'):
        return urljoin(base, target)
    if target.startswith('/'):
        return base + target
    print('TODO relative redirects not implemented')
    sys.exit(1)
def detect_encoding_by_bom(response):
    """Guess the text encoding of a response body from its byte-order mark.

    Args:
        response: Object exposing the raw body as a byte string via
            ``.content``.

    Returns:
        The codec name ('utf_32_be', 'utf_32_le', 'utf_8', 'utf_16_be' or
        'utf_16_le') whose BOM the body starts with, or None when the body
        is empty/missing or starts with no recognised BOM.
    """
    # Order matters: the UTF-32 LE mark (FF FE 00 00) begins with the
    # UTF-16 LE mark (FF FE), so the 4-byte marks have to be tried first.
    bom_table = (
        (b'\x00\x00\xfe\xff', 'utf_32_be'),
        (b'\xff\xfe\x00\x00', 'utf_32_le'),
        (b'\xef\xbb\xbf', 'utf_8'),
        (b'\xfe\xff', 'utf_16_be'),
        (b'\xff\xfe', 'utf_16_le'),
    )
    content = response.content
    if not content:
        return None
    # Prefix comparison handles a body that is exactly one BOM long and
    # behaves the same on Python 2 and 3, unlike comparing indexed bytes
    # against integers.
    for bom, encoding in bom_table:
        if content.startswith(bom):
            return encoding
    return None
def with_encoding(response):
    """Set ``response.encoding`` from the body's byte-order mark.

    The attribute is assigned whatever ``detect_encoding_by_bom`` returns,
    including None when no BOM is present. The same response object is
    returned so calls can be chained.
    """
    bom_encoding = detect_encoding_by_bom(response)
    response.encoding = bom_encoding
    return response
def find_redirect_tag(response):
    """Return the ``content`` value of a ``<meta http-equiv="refresh">`` tag.

    Searches ``response.text`` case-insensitively for a meta tag whose
    ``http-equiv`` attribute comes first and is followed by ``content``.
    Returns the raw content string (e.g. ``"0; URL=/next"``), or None when
    no such tag is found.
    """
    pattern = r"<meta\s+http-equiv=(\"|')refresh\1\s+content=(\"|')(.*?)\2\s*/?>"
    found = re.search(pattern, response.text, re.I)
    return found.group(3) if found else None
def follow_redirects(base, response, max_redirects=30):
    """Follow header- and HTML-based redirects until a final page is reached.

    Each response is checked, in order, for a ``Location`` header, a
    ``Refresh`` header, and a ``<meta http-equiv="refresh">`` tag. The first
    one that yields a target URL is followed via ``get_site``.

    Args:
        base: Prefix handed to ``resolve_target`` for non-absolute targets.
        response: The response to start from.
        max_redirects: Maximum number of hops to follow before giving up.

    Returns:
        The first response that carries no redirect, or whose refresh value
        names no URL.

    Raises:
        RuntimeError: When more than ``max_redirects`` hops would be needed,
            which in practice indicates a redirect loop.
    """
    hops = 0
    # Iterative rather than recursive so a long chain cannot exhaust the
    # interpreter's recursion limit.
    while True:
        response = with_encoding(response)
        response_headers = response.headers
        if 'location' in response_headers:
            url = response_headers['location']
            print('Redirect via location header: %s' % url)
        else:
            if 'refresh' in response_headers:
                refresh = response_headers['refresh']
                print('Redirect via refresh header: %s' % refresh)
            else:
                refresh = find_redirect_tag(response)
                if refresh:
                    print('Redirect via refresh meta tag: %s' % refresh)
            if not refresh:
                return response
            # The "url" keyword is matched case-insensitively and may be
            # surrounded by whitespace; the URL itself may be quoted.
            url_match = re.search(r"URL\s*=\s*(.*)$", refresh, re.I)
            if not url_match:
                return response
            url = url_match.group(1).strip().strip('"\'')
        if hops >= max_redirects:
            raise RuntimeError(
                'Exceeded %d redirects; last target was %s' % (max_redirects, url)
            )
        hops += 1
        print("Following redirect to %s" % url)
        response = get_site(resolve_target(base, url), response)
def get_site(url, prev=None):
    """Fetch ``url`` with the shared session without auto-following redirects.

    Args:
        url: Absolute URL to request.
        prev: Optional response that led to this request; when given it is
            appended to the new response's ``history`` list.

    Returns:
        The response for ``url``.
    """
    new_response = s.get(url, allow_redirects=False)
    # A requests Response is falsy for 4xx/5xx statuses, so compare against
    # None; otherwise an error page that carried a refresh would be dropped
    # from the history.
    if prev is not None:
        new_response.history.append(prev)
    return new_response
# Script entry: start at base_url + initial_path and trace every redirect.
# Building the URL first keeps the message independent of % binding more
# tightly than +; the printed text is unchanged.
start_url = base_url + initial_path
print("Beginning trace to %s" % start_url)
follow_redirects(base_url, get_site(start_url))
print("Arrived at the site!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment