Skip to content

Instantly share code, notes, and snippets.

@miodeqqq
Created December 1, 2016 19:37
Show Gist options
  • Save miodeqqq/ec77957a76b55bf57195a4b68998f90f to your computer and use it in GitHub Desktop.
Save miodeqqq/ec77957a76b55bf57195a4b68998f90f to your computer and use it in GitHub Desktop.
Python BS4 sitemap validator - checks HTTP Response for all links inside <loc> .. </loc> tags
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import time
from time import sleep
def process_sitemap(s):
    """Extract the URLs listed in a sitemap document.

    Args:
        s: Sitemap markup (text) to parse.

    Returns:
        A list with the text of every ``<loc>`` element, in document order.
        Surrounding whitespace is removed and empty entries are skipped.
    """
    soup = BeautifulSoup(s, "lxml")
    # Pretty-printed sitemaps put newlines/indentation inside <loc>; strip it
    # so callers get a clean URL rather than one padded with whitespace.
    urls = (loc.text.strip() for loc in soup.find_all('loc'))
    return [url for url in urls if url]
def _sitemap_url_from_robots_line(line):
    """Return the sitemap URL referenced by one robots.txt line, or None."""
    entry = line.strip()
    # Match case-insensitively: the directive is normally spelled "Sitemap:".
    if not entry.endswith('.xml') or 'sitemap' not in entry.lower():
        return None
    field, colon, value = entry.partition(':')
    if colon and field.strip().lower() == 'sitemap' and value.strip():
        # Proper "Sitemap: <url>" directive (also works without the space).
        location = value.strip()
    else:
        # Fallback: treat the last whitespace-separated token as the location.
        location = entry.split()[-1]
    # Only add a scheme when the entry lacks one; blindly prefixing would turn
    # "http://host/sitemap.xml" into "http://http://host/sitemap.xml".
    if '://' not in location:
        location = 'http://' + location
    return location


def main(robots_url='http://www.yoursite.com/robots.txt', timeout=10):
    """Print the HTTP status code of every URL listed in a site's sitemaps.

    Fetches ``robots_url``, finds the lines that reference an ``.xml``
    sitemap, downloads each sitemap and requests every URL found in its
    ``<loc>`` tags, printing ``"<url> <status_code>"`` per URL.

    Args:
        robots_url: Address of the robots.txt file to start from.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.exceptions.RequestException: If robots.txt itself cannot be
            fetched. Failures on individual sitemaps or URLs are printed as
            ``Error --> ...`` and do not stop the run.
    """
    with requests.Session() as s:
        s.headers.update({'User-Agent': 'Mozilla/5.0'})
        robots = s.get(robots_url, timeout=timeout)
        for line in robots.text.splitlines():
            sitemap = _sitemap_url_from_robots_line(line)
            if sitemap is None:
                continue
            try:
                urls = process_sitemap(s.get(sitemap, timeout=timeout).text)
            except requests.exceptions.RequestException as e:
                # One unreachable sitemap must not abort the remaining ones.
                print('Error --> {}'.format(e))
                continue
            for url in urls:
                try:
                    status_code = s.get(url, timeout=timeout).status_code
                except requests.exceptions.RequestException as e:
                    # Covers connection errors, timeouts, bad URLs, redirects.
                    print('Error --> {}'.format(e))
                else:
                    print('{url} {status_code}'.format(
                        url=url,
                        status_code=status_code
                    ))
# Run the check only when executed as a script, so importing this module
# does not trigger network requests.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment