Skip to content

Instantly share code, notes, and snippets.

@simonw
Created August 30, 2010 12:58
Show Gist options
  • Save simonw/557365 to your computer and use it in GitHub Desktop.
Save simonw/557365 to your computer and use it in GitHub Desktop.
Do a GET against a URL, timing out sensibly, following up to a maximum number of redirects, and pulling down only the first X bytes of the body.
import httplib, urlparse, socket
def http_get_headers_and_truncated_body(
    url, max_redirects=5, body_length=2048, timeout=5,
    allowed_content_types=('text/html',), num_redirects_followed=0,
    redirect_chain=None
):
    """GET ``url`` and return its headers plus a truncated body.

    Redirects are followed up to ``max_redirects`` times. The body is only
    read (at most ``body_length`` bytes) when the final status is 200 and
    the content type is one of ``allowed_content_types``.

    Args:
        url: URL to fetch. ``http://`` is prepended when no http/https
            scheme is present.
        max_redirects: Maximum number of redirects to follow; a falsy value
            disables redirect following.
        body_length: Maximum number of body bytes to read.
        timeout: Timeout passed to the httplib connection.
        allowed_content_types: Content types (without parameters) for which
            the body is fetched.
        num_redirects_followed: Number of redirects already followed before
            this call; counted against ``max_redirects``.
        redirect_chain: URLs already visited. A non-empty list passed in is
            appended to in place; ``None`` or an empty list starts a new one.

    Returns:
        On success::

            {'ok': True, 'headers': {}, 'content_type': '', 'status': 200,
             'redirected': False, 'url': '', 'body': '',
             'body_was_fetched': False, 'max_redirects_reached': False,
             'num_redirects_followed': 0, 'redirect_chain': []}

        On failure: ``{'ok': False, 'error': 'error description'}``.

    Raises:
        Exceptions other than ``httplib.BadStatusLine`` and
        ``socket.timeout`` raised while connecting, sending or reading are
        not caught here and propagate to the caller.
    """
    redirect_statuses = (301, 302, 303, 307, 308)
    redirect_chain = redirect_chain or []

    # Each pass through the loop performs one request; a followed redirect
    # updates ``url`` and loops again after the connection has been closed.
    while True:
        if not url.lower().startswith(('http://', 'https://')):
            url = 'http://' + url
        bits = urlparse.urlsplit(url)  # less weird than urlparse

        # hostname/port cope with IPv6 literals and user:pass@ prefixes,
        # which a plain split(':') on the netloc does not.
        host = bits.hostname
        if not host:
            return {
                'ok': False,
                'error': 'Invalid URL - no host name'
            }
        if bits.scheme == 'https':
            connection_class = httplib.HTTPSConnection
            default_port = 443
        else:
            connection_class = httplib.HTTPConnection
            default_port = 80
        try:
            port = bits.port
        except ValueError:
            # Unparseable port: fall back to the scheme default.
            port = None
        if port is None:
            port = default_port

        # An empty path would produce an invalid request line.
        path = bits.path or '/'
        if bits.query:
            path += '?' + bits.query

        conn = connection_class(
            host, port=port, strict=True, timeout=timeout
        )
        try:
            try:
                conn.request('GET', path)
                response = conn.getresponse()
            except httplib.BadStatusLine:
                return {
                    'ok': False,
                    'error': 'Bad response - might not be a web server'
                }
            except socket.timeout:
                return {
                    'ok': False,
                    'error': 'Bad response - server timed out'
                }

            # It's a web server! Did it redirect us?
            headers = dict(response.getheaders())
            max_redirects_reached = False
            if response.status in redirect_statuses:
                if max_redirects and num_redirects_followed < max_redirects:
                    location = headers.get('location')
                    if not location:
                        return {
                            'ok': False,
                            'error': 'Status was %s but no location header' % (
                                response.status
                            )
                        }
                    redirect_chain.append(url)
                    # Location may be relative; resolve it against the URL
                    # that issued the redirect.
                    url = urlparse.urljoin(url, location)
                    num_redirects_followed += 1
                    continue  # the finally clause closes this connection
                max_redirects_reached = True

            body = ''
            body_was_fetched = False
            content_type = headers.get('content-type', '').split(';')[0].strip()
            # If it's a 200 and content type is valid, attempt to fetch the body
            if response.status == 200 and content_type in allowed_content_types:
                try:
                    body = response.read(body_length)
                    body_was_fetched = True
                except socket.timeout:
                    # Best effort: still report the headers, with
                    # body_was_fetched left False.
                    pass

            return {
                'ok': True,
                'headers': headers,
                'content_type': content_type,
                'status': response.status,
                'redirected': bool(num_redirects_followed),
                'url': url,
                'body': body,
                'body_was_fetched': body_was_fetched,
                'max_redirects_reached': max_redirects_reached,
                'num_redirects_followed': num_redirects_followed,
                'redirect_chain': redirect_chain,
            }
        finally:
            conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment