Skip to content

Instantly share code, notes, and snippets.

@mdrovdahl
Last active April 12, 2017 23:06
Show Gist options
  • Save mdrovdahl/e052257ee7ffec0fbd147a496fb6bcf5 to your computer and use it in GitHub Desktop.
Save mdrovdahl/e052257ee7ffec0fbd147a496fb6bcf5 to your computer and use it in GitHub Desktop.
Find missing dates in example.com/sitemap.xml
#!/usr/bin/env python3
import argparse
from datetime import datetime, date, timedelta

try:
    # Python 3 location of the URL helpers (matches the python3 shebang).
    from urllib.parse import urlparse, parse_qs
except ImportError:
    # Legacy Python 2 location, kept as a fallback.
    from urlparse import urlparse, parse_qs

import requests
import untangle
# Sample base URL; presumably illustrates the value expected for --url.
# NOTE(review): not referenced by any code visible in this file.
domain = "https://example.com/"
# Location of the sitemap index; main() appends it to the --url argument.
path = "/sitemap.xml"
def main():
    """Walk a sitemap index and print the dates that are missing from it.

    Fetches ``<--url>/sitemap.xml`` and reads every ``<sitemap><loc>`` entry,
    taking the entry's date from its ``yyyy``, ``mm`` and ``dd`` query
    parameters (e.g. ``https://example.com/sitemap.xml?yyyy=2017&mm=02&dd=25``).
    Consecutive entries are compared in document order; whenever an entry is
    more than one day older than the entry before it, every skipped day is
    printed as ``missing <date>``.

    The gap check only fires when dates decrease from one entry to the next,
    so the sitemap is expected to be listed newest first.

    Raises:
        requests.HTTPError: if the sitemap request returns an error status.
        KeyError: if an entry lacks one of the date query parameters.
        ValueError: if the parameters do not form a valid calendar date.
    """
    parser = argparse.ArgumentParser(description='walk a sitemap and look for gaps')
    parser.add_argument('-url', '--url', help='url', required=True)
    args = vars(parser.parse_args())

    # rudimentary evaluation of the supplied arguments
    base_url = args['url']
    if not base_url:
        return

    # Strip a trailing slash so "https://host/" does not become "https://host//sitemap.xml".
    response = requests.get(base_url.rstrip('/') + path)
    # Fail loudly on 4xx/5xx instead of handing an error page to the XML parser.
    response.raise_for_status()

    # walk the sitemap.xml, extract just the urls
    obj = untangle.parse(response.text)

    # hacky: earliest date we expect to find; seeds the first comparison
    previous = datetime.strptime("20080809", '%Y%m%d')
    for sitemap in obj.sitemapindex.sitemap:
        # Parse the query string once and pull the date components out of it.
        query = parse_qs(urlparse(sitemap.loc.cdata).query)
        # Build the date from integers: concatenating the raw strings for
        # strptime is ambiguous when month/day are not zero-padded.
        current = datetime(
            int(query["yyyy"][0]), int(query["mm"][0]), int(query["dd"][0]))
        for missing in _missing_days(previous, current):
            # str.format keeps the output identical on Python 2 and 3.
            print("missing {}".format(missing))
        previous = current


def _missing_days(newer, older):
    """Yield each day strictly between *older* and *newer*, newest first."""
    one_day = timedelta(days=1)
    day = newer - one_day
    while day > older:
        yield day
        day -= one_day
# Run the gap check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment