simonw · August 30, 2010 12:58
diff --git a/http_get_headers_and_truncated_body.py b/http_get_headers_and_truncated_body.py
 import httplib, urlparse, socket

 def http_get_headers_and_truncated_body(
         url, max_redirects=5, body_length=2048, timeout=5,
         allowed_content_types = ('text/html',), num_redirects_followed=0,
         redirect_chain = None
     ):
     """GET ``url`` and return its headers plus at most ``body_length`` bytes.

     301/302 responses are followed (recursively) until ``max_redirects``
     hops have been made.  The body is only read when the final status is
     200 and the content type is one of ``allowed_content_types``.

     Arguments:
         url: URL to fetch; ``http://`` is assumed when no scheme is given.
         max_redirects: maximum number of redirects to follow (0 = none).
         body_length: maximum number of body bytes to read.
         timeout: socket timeout in seconds.
         allowed_content_types: content types whose body may be fetched
             (compared case-insensitively, parameters ignored).
         num_redirects_followed: internal recursion counter.
         redirect_chain: internal list of URLs visited so far; a non-empty
             list passed by the caller is appended to in place.

     Returns:
         On success a dict with the keys 'ok' (True), 'headers',
         'content_type', 'status', 'redirected', 'url', 'body',
         'body_was_fetched', 'max_redirects_reached',
         'num_redirects_followed' and 'redirect_chain'.
         On failure ``{'ok': False, 'error': 'error description'}``.
     """
     redirect_chain = redirect_chain or []
     if not url.lower().startswith(('http://', 'https://')):
         url = 'http://' + url
     bits = urlparse.urlsplit(url) # less weird than urlparse
     is_https = bits.scheme == 'https'

     # hostname/port cope with userinfo and IPv6 literals, which a plain
     # netloc.split(':') does not.
     host = bits.hostname
     if not host:
         return {
             'ok': False,
             'error': 'Invalid URL - no host name'
         }
     try:
         port = bits.port
     except ValueError:
         # Non-numeric port: fall back to the scheme default, as before.
         port = None
     if port is None:
         port = 443 if is_https else 80

     # An empty path would produce an invalid request line ("GET  HTTP/1.1").
     path = bits.path or '/'
     if bits.query:
         path += '?' + bits.query

     connection_class = httplib.HTTPConnection
     if is_https:
         connection_class = httplib.HTTPSConnection
     conn = connection_class(
         host, port=port, strict=True, timeout=timeout
     )

     next_url = None
     try:
         try:
             # request() opens the socket, so it can fail just like
             # getresponse() and must be guarded too.
             conn.request('GET', path)
             response = conn.getresponse()
         except httplib.BadStatusLine:
             return {
                 'ok': False,
                 'error': 'Bad response - might not be a web server'
             }
         except socket.timeout:
             return {
                 'ok': False,
                 'error': 'Bad response - server timed out'
             }
         except socket.error as e:
             # Covers refused connections, DNS failures and SSL errors.
             return {
                 'ok': False,
                 'error': 'Could not connect - %s' % (e,)
             }

         # It's a web server! Did it redirect us?
         headers = dict(response.getheaders())
         max_redirects_reached = False
         if response.status in (301, 302):
             if max_redirects and num_redirects_followed < max_redirects:
                 location = headers.get('location')
                 if not location:
                     return {
                         'ok': False,
                         'error': 'Status was %s but no location header' % (
                             response.status
                         )
                     }
                 # Location may be relative; resolve it against this URL.
                 next_url = urlparse.urljoin(url, location)
             else:
                 max_redirects_reached = True

         body = ''
         body_was_fetched = False
         content_type = headers.get('content-type', '').split(';')[0].strip()
         allowed = [ctype.lower() for ctype in allowed_content_types]
         # If it's a 200 and content type is valid, attempt to fetch the body
         if response.status == 200 and content_type.lower() in allowed:
             try:
                 body = response.read(body_length)
                 body_was_fetched = True
             except socket.timeout:
                 # Best effort: still report the headers without a body.
                 pass
     finally:
         # Always release the socket, including on errors and redirects.
         conn.close()

     if next_url is not None:
         # Recurse only after this hop's connection has been closed.
         redirect_chain.append(url)
         return http_get_headers_and_truncated_body(
             next_url,
             max_redirects=max_redirects, body_length=body_length,
             timeout=timeout,
             allowed_content_types=allowed_content_types,
             num_redirects_followed = num_redirects_followed + 1,
             redirect_chain = redirect_chain,
         )

     return {
         'ok': True,
         'headers': headers,
         'content_type': content_type,
         'status': response.status,
         'redirected': bool(num_redirects_followed),
         'url': url,
         'body': body,
         'body_was_fetched': body_was_fetched,
         'max_redirects_reached': max_redirects_reached,
         'num_redirects_followed': num_redirects_followed,
         'redirect_chain': redirect_chain,
     }
	import httplib, urlparse, socket

	def http_get_headers_and_truncated_body(
	        url, max_redirects=5, body_length=2048, timeout=5,
	        allowed_content_types = ('text/html',), num_redirects_followed=0,
	        redirect_chain = None
	    ):
	    """GET ``url`` and return its headers plus at most ``body_length`` bytes.

	    301/302 responses are followed (recursively) until ``max_redirects``
	    hops have been made.  The body is only read when the final status is
	    200 and the content type is one of ``allowed_content_types``.

	    Arguments:
	        url: URL to fetch; ``http://`` is assumed when no scheme is given.
	        max_redirects: maximum number of redirects to follow (0 = none).
	        body_length: maximum number of body bytes to read.
	        timeout: socket timeout in seconds.
	        allowed_content_types: content types whose body may be fetched
	            (compared case-insensitively, parameters ignored).
	        num_redirects_followed: internal recursion counter.
	        redirect_chain: internal list of URLs visited so far; a non-empty
	            list passed by the caller is appended to in place.

	    Returns:
	        On success a dict with the keys 'ok' (True), 'headers',
	        'content_type', 'status', 'redirected', 'url', 'body',
	        'body_was_fetched', 'max_redirects_reached',
	        'num_redirects_followed' and 'redirect_chain'.
	        On failure ``{'ok': False, 'error': 'error description'}``.
	    """
	    redirect_chain = redirect_chain or []
	    if not url.lower().startswith(('http://', 'https://')):
	        url = 'http://' + url
	    bits = urlparse.urlsplit(url) # less weird than urlparse
	    is_https = bits.scheme == 'https'

	    # hostname/port cope with userinfo and IPv6 literals, which a plain
	    # netloc.split(':') does not.
	    host = bits.hostname
	    if not host:
	        return {
	            'ok': False,
	            'error': 'Invalid URL - no host name'
	        }
	    try:
	        port = bits.port
	    except ValueError:
	        # Non-numeric port: fall back to the scheme default, as before.
	        port = None
	    if port is None:
	        port = 443 if is_https else 80

	    # An empty path would produce an invalid request line ("GET  HTTP/1.1").
	    path = bits.path or '/'
	    if bits.query:
	        path += '?' + bits.query

	    connection_class = httplib.HTTPConnection
	    if is_https:
	        connection_class = httplib.HTTPSConnection
	    conn = connection_class(
	        host, port=port, strict=True, timeout=timeout
	    )

	    next_url = None
	    try:
	        try:
	            # request() opens the socket, so it can fail just like
	            # getresponse() and must be guarded too.
	            conn.request('GET', path)
	            response = conn.getresponse()
	        except httplib.BadStatusLine:
	            return {
	                'ok': False,
	                'error': 'Bad response - might not be a web server'
	            }
	        except socket.timeout:
	            return {
	                'ok': False,
	                'error': 'Bad response - server timed out'
	            }
	        except socket.error as e:
	            # Covers refused connections, DNS failures and SSL errors.
	            return {
	                'ok': False,
	                'error': 'Could not connect - %s' % (e,)
	            }

	        # It's a web server! Did it redirect us?
	        headers = dict(response.getheaders())
	        max_redirects_reached = False
	        if response.status in (301, 302):
	            if max_redirects and num_redirects_followed < max_redirects:
	                location = headers.get('location')
	                if not location:
	                    return {
	                        'ok': False,
	                        'error': 'Status was %s but no location header' % (
	                            response.status
	                        )
	                    }
	                # Location may be relative; resolve it against this URL.
	                next_url = urlparse.urljoin(url, location)
	            else:
	                max_redirects_reached = True

	        body = ''
	        body_was_fetched = False
	        content_type = headers.get('content-type', '').split(';')[0].strip()
	        allowed = [ctype.lower() for ctype in allowed_content_types]
	        # If it's a 200 and content type is valid, attempt to fetch the body
	        if response.status == 200 and content_type.lower() in allowed:
	            try:
	                body = response.read(body_length)
	                body_was_fetched = True
	            except socket.timeout:
	                # Best effort: still report the headers without a body.
	                pass
	    finally:
	        # Always release the socket, including on errors and redirects.
	        conn.close()

	    if next_url is not None:
	        # Recurse only after this hop's connection has been closed.
	        redirect_chain.append(url)
	        return http_get_headers_and_truncated_body(
	            next_url,
	            max_redirects=max_redirects, body_length=body_length,
	            timeout=timeout,
	            allowed_content_types=allowed_content_types,
	            num_redirects_followed = num_redirects_followed + 1,
	            redirect_chain = redirect_chain,
	        )

	    return {
	        'ok': True,
	        'headers': headers,
	        'content_type': content_type,
	        'status': response.status,
	        'redirected': bool(num_redirects_followed),
	        'url': url,
	        'body': body,
	        'body_was_fetched': body_was_fetched,
	        'max_redirects_reached': max_redirects_reached,
	        'num_redirects_followed': num_redirects_followed,
	        'redirect_chain': redirect_chain,
	    }