Skip to content

Instantly share code, notes, and snippets.

@joshmarshall
Created May 1, 2011 18:37
Show Gist options
  • Save joshmarshall/950720 to your computer and use it in GitHub Desktop.
Save joshmarshall/950720 to your computer and use it in GitHub Desktop.
URL Checker
from httplib import HTTP, HTTPConnection
from urlparse import urlparse
def get_page(url):
    """Fetch *url* with a plain HTTP GET and return the response body.

    The host is taken from the URL's network-location part and the request
    target from its path and query string. The URL scheme is not inspected:
    the request is always made through ``HTTPConnection``.

    Connection and protocol errors raised by ``HTTPConnection`` propagate to
    the caller.
    """
    parsed = urlparse(url)
    # A URL such as "http://example.com" has an empty path, but the request
    # line still needs a target, so fall back to the site root.
    target = parsed[2] or '/'
    # Keep the query string; without it "page?id=3" would fetch plain "page".
    if parsed[4]:
        target = '%s?%s' % (target, parsed[4])
    conn = HTTPConnection(parsed[1])
    try:
        conn.request("GET", target)
        return conn.getresponse().read()
    finally:
        # Release the socket whether or not the request succeeded.
        conn.close()
def get_urls(text):
    """Return the distinct ``http://`` URLs found in *text*.

    A URL is taken to run from ``http://`` up to the next whitespace
    character, angle bracket or quote, which suits URLs embedded in HTML
    attributes and plain text alike. Each URL appears once in the result,
    in the order it was first seen.
    """
    import re
    # Triple-quoted raw string so the character class can hold both quote
    # characters without escaping; ``\s`` stops the match at whitespace.
    matches = re.findall(r'''http://[^\s<>"']+''', text)
    seen = set()
    unique = []
    for match in matches:
        if match not in seen:
            seen.add(match)
            unique.append(match)
    return unique
def check_url(url):
    """Issue an HTTP HEAD request for *url* and return the status code.

    Surrounding whitespace in *url* is ignored. The return value is the
    first element of the ``getreply()`` tuple, i.e. the status reported for
    the request. Socket-level errors propagate to the caller.
    """
    url = url.strip()
    parsed = urlparse(url)
    # An empty path would produce an invalid request line; use the root.
    target = parsed[2] or '/'
    # Keep the query string so the exact resource that was linked is checked.
    if parsed[4]:
        target = '%s?%s' % (target, parsed[4])
    request = HTTP(parsed[1])
    try:
        request.putrequest('HEAD', target)
        # The legacy HTTP class speaks HTTP/1.0 and adds no Host header on
        # its own; servers hosting several sites need it to pick the site.
        request.putheader('Host', parsed[1])
        request.endheaders()
        reply = request.getreply()
        return reply[0]
    finally:
        # Release the socket whether or not the request succeeded.
        request.close()
if __name__ == '__main__':
    # Script entry point: read a document (local file or remote page), pull
    # the http:// URLs out of it, HEAD each one and report the status codes.
    import sys
    import os

    # Exit with a usage message instead of an IndexError traceback when the
    # source argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <file-or-url>' % sys.argv[0])
    source = sys.argv[1]

    # A readable local path wins; anything else is treated as a URL.
    local_path = os.path.abspath(source)
    if os.access(local_path, os.R_OK):
        print('GETTING LOCAL FILE.')
        with open(local_path, 'r') as handle:
            data = handle.read()
    else:
        print('GETTING REMOTE FILE.')
        data = get_page(source)

    print('SEARCHING FOR URLS.')
    urls = get_urls(data)
    print('CHECKING %s URLS...' % len(urls))

    # Group the URLs by the status code (as a string) that each returned.
    codes = {}
    # URLs whose host could not be contacted at all, with the error raised.
    unreachable = []
    for url in urls:
        try:
            code = '%s' % check_url(url)
        except EnvironmentError as err:
            # DNS and socket failures are reported at the end so that one
            # dead host does not abort the remaining checks.
            unreachable.append((url, err))
            continue
        codes.setdefault(code, []).append(url)

    print('RESULTS:')
    print('========')
    # Sorted so the report order is stable from run to run.
    for code, paths in sorted(codes.items()):
        print('There were %s %ss.' % (len(paths), code))
        # Only client errors (4xx) get their URLs listed individually.
        if 399 < int(code) < 500:
            for path in paths:
                print('* %s' % path)
    if unreachable:
        print('There were %s unreachable URLs.' % len(unreachable))
        for url, err in unreachable:
            print('* %s (%s)' % (url, err))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment