Created
May 26, 2017 16:14
-
-
Save pschichtel/2216308fe22a580dbd90ce8114512095 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from __future__ import print_function | |
import sys, re, math | |
from requests import Session | |
# Sent with every request so the script identifies itself as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:48.0) Gecko/20100101 Firefox/48.0'
}

# Two positional arguments are mandatory: the base URL and the initial path.
if len(sys.argv) < 3:
    print("Usage: %s <url> <initial path>" % sys.argv[0])
    sys.exit(1)

base_url, initial_path = sys.argv[1], sys.argv[2]

# A single Session object is reused for every request made by this script.
s = Session()
s.verify = True
s.headers.update(headers)
def resolve_target(base, target):
    """Turn a redirect target into an absolute URL.

    Args:
        base: Prefix that root-relative targets are appended to
            (for example ``http://example.com``).
        target: Redirect target as found in a header or meta tag.

    Returns:
        * ``target`` unchanged when it already starts with ``http:`` or
          ``https:`` (compared case-insensitively);
        * the scheme of ``base`` followed by ``target`` when the target is
          protocol-relative (``//host/path``);
        * ``base + target`` when the target is root-relative (``/path``).

    Any other target (a document-relative one) is not supported: a message
    is printed and the process exits with status 1.
    """
    if target.lower().startswith(('http:', 'https:')):
        return target
    if target.startswith('//'):
        # Protocol-relative: only the scheme is inherited. Prepending the
        # whole base would yield something like "http://host//other/path".
        scheme, separator, _ = base.partition('://')
        if not separator:
            scheme = 'http'
        return scheme + ':' + target
    if target.startswith('/'):
        return base + target
    print('TODO relative redirects not implemented')
    sys.exit(1)
def detect_encoding_by_bom(response):
    """Guess the body encoding of a response from its byte-order mark.

    Args:
        response: Object whose ``content`` attribute holds the raw body bytes.

    Returns:
        One of ``'utf_32_be'``, ``'utf_32_le'``, ``'utf_8'``, ``'utf_16_be'``
        or ``'utf_16_le'`` when the body starts with the matching BOM,
        otherwise ``None`` (also for an empty or missing body).
    """
    content = response.content
    if not content:
        return None

    # Order matters: the UTF-32 LE mark (FF FE 00 00) begins with the
    # UTF-16 LE mark (FF FE), so the 4-byte marks must be tried first.
    known_boms = (
        (b'\x00\x00\xfe\xff', 'utf_32_be'),
        (b'\xff\xfe\x00\x00', 'utf_32_le'),
        (b'\xef\xbb\xbf', 'utf_8'),
        (b'\xfe\xff', 'utf_16_be'),
        (b'\xff\xfe', 'utf_16_le'),
    )
    # Prefix comparison also matches a body that consists of nothing but the
    # BOM, and does not depend on what indexing a single byte yields.
    for bom, encoding in known_boms:
        if content.startswith(bom):
            return encoding
    return None
def with_encoding(response):
    """Set ``response.encoding`` from the body's byte-order mark.

    The attribute is assigned whatever ``detect_encoding_by_bom`` returns,
    which is ``None`` when no BOM is recognised. The same response object
    is returned so the call can be chained.
    """
    bom_encoding = detect_encoding_by_bom(response)
    response.encoding = bom_encoding
    return response
def find_redirect_tag(response):
    """Return the ``content`` value of a ``<meta http-equiv="refresh">`` tag.

    The search runs over ``response.text``, ignoring case. Only tags that
    list ``http-equiv`` before ``content`` and quote both attribute values
    are recognised. Returns ``None`` when no such tag is found.
    """
    refresh_tag = re.compile(
        r"<meta\s+http-equiv=(\"|')refresh\1\s+content=(\"|')(.*?)\2\s*/?>",
        re.I,
    )
    found = refresh_tag.search(response.text)
    return found.group(3) if found else None
def _refresh_target(refresh):
    """Extract the URL part of a Refresh value such as ``5; url=/next``.

    The ``url=`` keyword is matched case-insensitively; whitespace and
    quotes surrounding the URL are removed. Returns ``None`` when the value
    carries no (non-empty) URL.
    """
    found = re.search(r"url\s*=\s*(.*)$", refresh, re.I)
    if not found:
        return None
    return found.group(1).strip().strip('"\'').strip() or None


def follow_redirects(base, response):
    """Follow a chain of redirects until a response without one is reached.

    A redirect is taken, in this order of precedence, from the ``Location``
    header, the ``Refresh`` header, or a ``<meta http-equiv="refresh">``
    tag in the body. Every hop is reported on stdout.

    Args:
        base: Prefix handed to ``resolve_target`` for root-relative targets.
        response: The response to inspect.

    Returns:
        The first response in the chain that does not redirect any further.
    """
    response = with_encoding(response)

    if 'location' in response.headers:
        url = response.headers['location']
        print('Redirect via location header: %s' % url)
    else:
        if 'refresh' in response.headers:
            refresh = response.headers['refresh']
            print('Redirect via refresh header: %s' % refresh)
        else:
            refresh = find_redirect_tag(response)
            if refresh:
                print('Redirect via refresh meta tag: %s' % refresh)
        if not refresh:
            return response
        # A Refresh value without a URL (e.g. just "5") only reloads the
        # page, so there is nothing further to follow.
        url = _refresh_target(refresh)
        if not url:
            return response

    print("Following redirect to %s" % url)
    new_response = get_site(resolve_target(base, url), response)
    return follow_redirects(base, new_response)
def get_site(url, prev=None):
    """Fetch ``url`` with the module-level session, without auto-redirects.

    Args:
        url: Absolute URL to request.
        prev: Optional response that led to this request; when given it is
            appended to the new response's ``history`` list.

    Returns:
        The response object for ``url``.
    """
    new_response = s.get(url, allow_redirects=False)
    # Compare against None explicitly: a requests Response is falsy for
    # 4xx/5xx status codes, which would drop such a hop from the history.
    if prev is not None:
        new_response.history.append(prev)
    return new_response
# Script entry: start at base URL + initial path and follow every redirect.
start_url = base_url + initial_path
print("Beginning trace to %s" % start_url)
follow_redirects(base_url, get_site(start_url))
print("Arrived at the site!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment