Created April 23, 2014 15:40
Simple crawler to look for bad links on a site
#!/usr/bin/env python
import re
import socket
import sys
import urllib2
from urlparse import urlparse

socket.setdefaulttimeout(10)

domain = "spring.io"
excluded_domains = ["jira.spring.io", "forum.spring.io"]
levels = 0
level_limit = -1
def strip(link):
    # Drop a trailing slash so the same page is only recorded once
    if link.endswith("/"):
        return link[:-1]
    return link

def convert(link):
    # Normalize a raw href into an absolute http URL on this domain
    if link.startswith("http"):
        return strip(link)
    if link.startswith("//"):
        return "http:%s" % link
    if link.startswith("/"):
        return strip("http://%s%s" % (domain, link))
    return strip("http://%s" % link)
def scan_for_links(url, links, dead_links, paths):
    # Recursively crawl url, collecting every link seen in links,
    # (referrer, link) pairs for 404s in dead_links, and referrer lists in paths.
    global levels
    levels += 1
    if level_limit > 0 and levels > level_limit:
        # Depth limit reached; back out without crawling further
        levels -= 1
        return
    print "Crawling %s" % url
    try:
        page = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        # Let the caller record this page as a dead link
        levels -= 1
        raise e
    except urllib2.URLError as e:
        # Timeouts, DNS failures, etc. -- report and skip this page
        print "Broke while scanning %s" % url
        print e
        levels -= 1
        return
    html = page.read()

    # Collect hrefs we have not seen before
    local_links = []
    for link in re.findall(r'href="([\w:/\.-]+)"', html):
        converted_link = convert(link)
        if converted_link not in links:
            links.append(converted_link)
            local_links.append(converted_link)

    for link in local_links:
        if domain not in link:
            # Link leaves this domain, so don't crawl it
            continue
        if urlparse(link).netloc in excluded_domains:
            continue
        try:
            scan_for_links(link, links, dead_links, paths)
            if link in paths:
                paths[link].append(url)
            else:
                paths[link] = [url]
        except urllib2.HTTPError as e:
            print "Can't find link on %s" % link
            print e
            if e.code == 404:
                dead_links.append((url, link))
            # Keep scanning the remaining links on this page
    levels -= 1
if __name__ == "__main__":
    links = []
    dead_links = []
    paths = {}
    scan_for_links("http://spring.io", links, dead_links, paths)

    print "============ GOOD ======================"
    links.sort()
    for link in links:
        print link

    print "============ BAD ======================"
    for link in dead_links:
        print link
#print "============ PATHS ======================" | |
keys = paths.keys() | |
keys.sort() | |
for link in keys: | |
print link | |
for url in paths[link]: | |
print "\t\t\t\t\t\t\t%s" % url | |