Skip to content

Instantly share code, notes, and snippets.

@saml
Last active December 18, 2015 17:09
Show Gist options
  • Select an option

  • Save saml/5816367 to your computer and use it in GitHub Desktop.

Select an option

Save saml/5816367 to your computer and use it in GitHub Desktop.
from lxml import etree
import requests
from dateutil.parser import parse as parsedate
import pytz
import sys
from cStringIO import StringIO
import gzip
import io
# XML namespace of the sitemaps.org 0.9 schema, written in the "{uri}" form
# that is prefixed to tag names in the element paths below.
NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'

# Descendant-search paths (".//") for the namespaced sitemap elements.
URL = './/' + NS + 'url'
LOC = './/' + NS + 'loc'
LASTMOD = './/' + NS + 'lastmod'

# Timezone applied to <lastmod> values that carry no timezone of their own.
DEFAULT_TZ = pytz.timezone('America/New_York')
def print_sitemap_urls(sitemap_url):
    """Download one sitemap document and print every page URL it lists.

    The sitemap URL itself is printed first, then the response body is
    parsed as XML and handed to ``print_sitemap_urls_etree``.  Both gzipped
    (``.xml.gz``) and plain XML bodies are accepted.

    Args:
        sitemap_url: URL of the sitemap document to fetch.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    print(sitemap_url)
    resp = requests.get(sitemap_url)
    # Fail with a clear HTTP error rather than feeding an error page to the
    # decompressor / XML parser.
    resp.raise_for_status()

    payload = resp.content
    # Only gunzip when the body really is gzip data (magic bytes 1f 8b);
    # uncompressed sitemaps would otherwise fail inside GzipFile.
    if payload[:2] == b'\x1f\x8b':
        source = gzip.GzipFile(fileobj=io.BytesIO(payload), mode='rb')
    else:
        source = io.BytesIO(payload)

    sitemap = etree.parse(source)
    print_sitemap_urls_etree(sitemap)
def print_sitemap_urls_etree(sitemap):
    """Print the <loc> of every <url> entry in a parsed sitemap.

    For each entry, a <lastmod> value (when present and non-empty) is parsed
    into a datetime; values without timezone information are interpreted in
    ``DEFAULT_TZ``.  The parsed timestamp is not printed -- parsing it only
    means a malformed date still raises, as it always has.

    Args:
        sitemap: Parsed sitemap document (or element) supporting
            ``iterfind``.
    """
    for url in sitemap.iterfind(URL):
        loc = url.find(LOC)
        # An entry without a usable <loc> has nothing to print; skip it
        # instead of crashing on ``None.text`` or printing "None".
        if loc is None or not loc.text:
            continue

        lastmod_el = url.find(LASTMOD)
        if lastmod_el is not None and lastmod_el.text:
            lastmod = parsedate(lastmod_el.text)
            if not lastmod.tzinfo:
                # pytz zones must be attached with localize();
                # replace(tzinfo=...) would apply the zone's LMT offset
                # rather than the correct EST/EDT offset.
                lastmod = DEFAULT_TZ.localize(lastmod)

        # Strip surrounding whitespace, matching how main() treats the
        # <loc> values of a sitemap index.
        print(loc.text.strip())
def main(index_url):
    """Print every page URL reachable from a sitemap or sitemap index.

    The given location is printed and parsed.  If its root element is a
    ``sitemapindex``, each referenced sitemap is downloaded and listed via
    ``print_sitemap_urls``; otherwise the document is treated as a sitemap
    itself and its URLs are listed directly.

    Args:
        index_url: Location of the sitemap or sitemap index, in any form
            ``etree.parse`` accepts as a source.
    """
    print(index_url)
    sitemap_index = etree.parse(index_url)

    if sitemap_index.getroot().tag.endswith('sitemapindex'):
        for loc in sitemap_index.iterfind(LOC):
            # Ignore empty <loc/> elements rather than crashing on
            # ``None.strip()``.
            sitemap_url = (loc.text or '').strip()
            if sitemap_url:
                print_sitemap_urls(sitemap_url)
    else:
        # The document is already parsed, so list its URLs directly.
        # print_sitemap_urls() expects a URL string to download, not a tree.
        print_sitemap_urls_etree(sitemap_index)
if __name__ == '__main__':
    # The sitemap (or sitemap index) location is the first command-line
    # argument; exit with a usage message instead of an IndexError
    # traceback when it is missing.  Extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit('usage: %s SITEMAP_OR_INDEX_URL' % sys.argv[0])
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment