Created
August 30, 2010 12:58
-
-
Save simonw/557365 to your computer and use it in GitHub Desktop.
Do a GET against a URL, timing out sensibly, following up to a max number of redirects and pulling down only the first X bytes of the body
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import httplib, urlparse, socket | |
def http_get_headers_and_truncated_body(
    url, max_redirects=5, body_length=2048, timeout=5,
    allowed_content_types=('text/html',), num_redirects_followed=0,
    redirect_chain=None
):
    """GET ``url`` and return its headers plus a truncated body.

    Parameters:
        url -- URL to fetch; 'http://' is prepended when no http/https
            scheme is present.
        max_redirects -- maximum number of 301/302 redirects to follow
            (0 disables redirect following).
        body_length -- maximum number of body bytes to read.
        timeout -- socket timeout in seconds passed to the connection.
        allowed_content_types -- the body is only read when the response
            is a 200 whose content type (parameters stripped) is in this
            collection.
        num_redirects_followed, redirect_chain -- bookkeeping used by the
            recursive redirect handling; callers normally leave these at
            their defaults.

    Returns a dict, either
        {'ok': True, 'headers': {}, 'content_type': '', 'status': 200,
         'redirected': False, 'url': '', 'body': '',
         'body_was_fetched': False, 'max_redirects_reached': False,
         'num_redirects_followed': 0, 'redirect_chain': []}
    or
        {'ok': False, 'error': 'error description'}
    """
    redirect_chain = redirect_chain or []

    # Compare case-insensitively so 'HTTP://host' is not double-prefixed.
    if not url.lower().startswith(('http://', 'https://')):
        url = 'http://' + url

    try:
        bits = urlparse.urlsplit(url)  # less weird than urlparse
    except ValueError:
        return {'ok': False, 'error': 'Invalid URL - could not be parsed'}

    # .hostname copes with userinfo ('user:pw@host') and bracketed IPv6
    # literals, both of which break a naive split(':').
    host = bits.hostname
    if not host:
        return {'ok': False, 'error': 'Invalid URL - no host name'}

    use_https = bits.scheme == 'https'
    try:
        port = bits.port
    except ValueError:
        # Non-numeric or out-of-range port: fall back to the default.
        port = None
    if port is None:
        port = 443 if use_https else 80

    # An empty path would produce an invalid request line.
    path = bits.path or '/'
    if bits.query:
        path += '?' + bits.query

    if use_https:
        connection_class = getattr(httplib, 'HTTPSConnection', None)
        if connection_class is None:
            return {
                'ok': False,
                'error': 'HTTPS is not supported by this Python build'
            }
    else:
        connection_class = httplib.HTTPConnection

    conn = connection_class(host, port=port, strict=True, timeout=timeout)
    try:
        # request() performs the connect, so it can time out or fail just
        # like getresponse(); keep both inside the same handler.
        try:
            conn.request('GET', path)
            response = conn.getresponse()
        except httplib.BadStatusLine:
            return {
                'ok': False,
                'error': 'Bad response - might not be a web server'
            }
        except socket.timeout:
            return {
                'ok': False,
                'error': 'Bad response - server timed out'
            }
        except (socket.error, httplib.HTTPException) as e:
            return {
                'ok': False,
                'error': 'Connection failed - %s' % (e,)
            }

        # It's a web server! Did it redirect us?
        headers = dict(response.getheaders())
        max_redirects_reached = False
        if response.status in (301, 302):
            if max_redirects and num_redirects_followed < max_redirects:
                location = headers.get('location')
                if not location:
                    return {
                        'ok': False,
                        'error': 'Status was %s but no location header' % (
                            response.status
                        )
                    }
                redirect_chain.append(url)
                # Release this socket before following the redirect so a
                # long chain does not hold one open connection per hop.
                conn.close()
                return http_get_headers_and_truncated_body(
                    # Location may be relative; resolve it against the
                    # URL that was just requested.
                    urlparse.urljoin(url, location),
                    max_redirects=max_redirects,
                    body_length=body_length,
                    timeout=timeout,
                    allowed_content_types=allowed_content_types,
                    num_redirects_followed=num_redirects_followed + 1,
                    redirect_chain=redirect_chain,
                )
            max_redirects_reached = True

        body = ''
        body_was_fetched = False
        # Drop parameters such as '; charset=utf-8' and any padding
        # around the media type before checking the allow-list.
        content_type = headers.get('content-type', '').split(';')[0].strip()

        # If it's a 200 and content type is valid, attempt to fetch the body
        if response.status == 200 and content_type in allowed_content_types:
            try:
                body = response.read(body_length)
                body_was_fetched = True
            except socket.timeout:
                # Best effort: headers are still useful, and the failure
                # is reported through body_was_fetched staying False.
                pass

        return {
            'ok': True,
            'headers': headers,
            'content_type': content_type,
            'status': response.status,
            'redirected': bool(num_redirects_followed),
            'url': url,
            'body': body,
            'body_was_fetched': body_was_fetched,
            'max_redirects_reached': max_redirects_reached,
            'num_redirects_followed': num_redirects_followed,
            'redirect_chain': redirect_chain,
        }
    finally:
        # close() is safe to call more than once; this covers every
        # return path above, including the error ones.
        conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment