Find broken links contained on pages within a domain's sitemap.xml
""" | |
usage: python broken_links.py somedomain.com > broken_links.list | |
somedomain.com/sitemap.xml will be grabbed and parsed | |
for each location entry, the page is fetched and all links on that page | |
make a HEAD request. The output is a summary of broken links and what entries or locations | |
contained them from sitemap.xml | |
this offeres a crude but effective way to spot essential pages across a domain. | |
""" | |
import sys

import requests
from lxml import html

# maps each checked link to its HTTP status and the pages that linked to it
LINK_CACHE = {}

def fetch_locations_from_sitemap(domain):
    # lxml's HTML parser is forgiving about the sitemap XML namespace,
    # so a plain //url/loc xpath finds each location entry
    sitemap_root = html.parse('http://' + domain + '/sitemap.xml')
    locations = sitemap_root.xpath('//url/loc')
    return locations

def check_pages_links(domain, url, page_root):
    for element, attribute, link, pos in page_root.iterlinks():
        # probably should make all urls absolute, etc.
        if link.startswith('/') and not link.startswith('//'):
            # if link is relative then fetch and ensure a 200
            link = 'http://' + domain + link
        if link in LINK_CACHE:
            LINK_CACHE[link]['pages'].append(url)
        else:
            if link.startswith('http'):
                # HEAD is enough to check the status without downloading the body
                try:
                    status_code = requests.head(link, timeout=10).status_code
                except requests.RequestException:
                    status_code = 'request failed'
                LINK_CACHE[link] = {
                    'pages': [url],
                    'status_code': status_code
                }

def broken_link_report():
    """
    Loop through and show broken links and the pages they are on.
    """
    for link, value in LINK_CACHE.items():
        if value['status_code'] != 200:
            print("** '{0}' is potentially broken ({1})".format(link, value['status_code']))
            for page in value['pages']:
                print("    - {0}".format(page))
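
# Example output (illustrative values, not from a real run):
#
#   ** 'http://somedomain.com/old-page' is potentially broken (404)
#       - http://somedomain.com/blog/some-post
#       - http://somedomain.com/about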

def find_broken_links(domain):
    locations = fetch_locations_from_sitemap(domain)
    for location in locations:
        url = location.text
        # fetch the page behind each sitemap location and check its links
        page_root = html.parse(url).getroot()
        check_pages_links(domain, url, page_root)
    broken_link_report()

if __name__ == "__main__":
    domain = sys.argv[1]
    print("Finding broken links using the sitemap from the following domain: {0}".format(domain))
    find_broken_links(domain)
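
The "probably should make all urls absolute" comment in check_pages_links hints at a more robust approach than prefixing the domain. A minimal sketch using Python's urllib.parse.urljoin (resolve_link is a hypothetical helper, not part of the original gist):

from urllib.parse import urljoin

def resolve_link(page_url, link):
    # urljoin resolves relative paths, protocol-relative //host/... links,
    # and leaves already-absolute URLs untouched
    return urljoin(page_url, link)

# e.g. resolve_link('http://somedomain.com/blog/', '../about')
# returns 'http://somedomain.com/about'

Resolving against the page URL rather than the domain root also handles links like 'about.html' that are relative to the current directory, which the startswith('/') check skips entirely.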