Last active
April 12, 2017 23:06
-
-
Save mdrovdahl/e052257ee7ffec0fbd147a496fb6bcf5 to your computer and use it in GitHub Desktop.
Find missing dates in example.com/sitemap.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import argparse
import sys
from datetime import datetime, date, timedelta

try:
    # Python 3 location of the URL-parsing helpers.
    from urllib.parse import urlparse, parse_qs
except ImportError:
    # Python 2 fallback (the module this script originally imported from).
    from urlparse import urlparse, parse_qs

import requests
import untangle
# Example site root; not referenced by the rest of this script as shown.
domain = 'https://example.com/'
# Location of the sitemap index, appended to the --url argument.
path = '/sitemap.xml'
def _sitemap_day(loc):
    """Return the day encoded in a sitemap URL's query string, or None.

    ``loc`` is expected to look like
    ``https://example.com/sitemap.xml?yyyy=2017&mm=02&dd=25``.  ``None`` is
    returned when any of the ``yyyy``/``mm``/``dd`` parameters is absent.
    A ``ValueError`` propagates when the values do not form a valid date.
    """
    # Parse the query string once instead of once per field.
    query = parse_qs(urlparse(loc).query)
    try:
        year, month, day = (int(query[key][0]) for key in ("yyyy", "mm", "dd"))
    except KeyError:
        return None
    # Building the datetime from integers avoids the ambiguity of
    # concatenating non-zero-padded values for a '%Y%m%d' parse.
    return datetime(year, month, day)


def _days_between(newer, older):
    """Yield every day strictly between ``newer`` and ``older``, newest first."""
    step = timedelta(days=1)
    day = newer - step
    while day > older:
        yield day
        day -= step


def main():
    """Find missing dates in example.com/sitemap.xml.

    Fetches ``<--url>/sitemap.xml``, reads the day encoded in each
    ``<sitemap><loc>`` entry of the sitemap index and prints one
    ``missing <date>`` line for every day absent between two consecutive
    entries.

    A gap is only detected when an entry is older than the one before it,
    i.e. the comparison works on a newest-first listing.

    Raises:
        SystemExit: via argparse when ``--url`` is missing or empty.
        requests.HTTPError: when the sitemap request returns an error status.
    """
    # hacky: earliest date we expect to find.  Used as the initial "previous"
    # value so that the first entry never reports a gap.
    previous = datetime.strptime("20080809", '%Y%m%d')

    parser = argparse.ArgumentParser(description='walk a sitemap and look for gaps')
    parser.add_argument('-url', '--url', help='url', required=True)
    args = vars(parser.parse_args())

    # rudimentary evaluation of the supplied arguments: an empty value is
    # rejected loudly instead of silently doing nothing.
    if not args['url']:
        parser.error('--url must not be empty')

    # build the url for the request; strip trailing slashes so that
    # "https://example.com/" does not turn into ".../​/sitemap.xml".
    index_url = args['url'].rstrip('/') + path

    # Bounded wait, and fail on HTTP errors rather than parsing an error page.
    response = requests.get(index_url, timeout=30)
    response.raise_for_status()

    # walk the sitemap.xml, extract just the urls
    index = untangle.parse(response.text)

    # work down to the urls eg: https://example.com/sitemap.xml?yyyy=2017&mm=02&dd=25
    for sitemap in index.sitemapindex.sitemap:
        loc = sitemap.loc.cdata.strip()
        current = _sitemap_day(loc)
        if current is None:
            # Entry carries no yyyy/mm/dd parameters: report it and move on
            # without disturbing the running comparison.
            sys.stderr.write("skipping undated sitemap entry: {}\n".format(loc))
            continue

        # Report every day of the gap, not just the first one.
        for missing in _days_between(previous, current):
            print("missing {}".format(missing))
        previous = current
# Run the gap check only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment