Skip to content

Instantly share code, notes, and snippets.

@saml
Created February 19, 2013 19:01
Show Gist options
  • Save saml/4988805 to your computer and use it in GitHub Desktop.
Save saml/4988805 to your computer and use it in GitHub Desktop.
Extract <loc> URLs from sitemap.xml
import sys
import os
import argparse
import gzip
import requests
try:
from lxml import etree
except ImportError:
import xml.etree.cElementTree as etree
# ElementPath expression matching every <loc> element, at any depth, in the
# sitemaps.org 0.9 namespace (used by both <urlset> and <sitemapindex> files).
LOC = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'


class Sitemap(object):
    """Read ``<loc>`` URLs out of sitemap files and download their targets.

    Downloads are written into ``workdir``; the directory is created on the
    first download if it does not exist yet.
    """

    def __init__(self, workdir):
        """Remember the directory that downloaded files are written to.

        :param workdir: path of the download directory.
        """
        self.workdir = workdir

    def download(self, url):
        """Fetch *url* and stream the response body into the work directory.

        The local file is named after the last path segment of *url*.

        :param url: address to fetch.
        :returns: path of the file that was written.
        :raises requests.HTTPError: if the server answers with a 4xx/5xx
            status; nothing is written in that case.
        """
        filename = os.path.basename(url)
        filepath = os.path.join(self.workdir, filename)

        # EAFP: create the work directory if needed; only re-raise when the
        # failure was something other than "it already exists".
        try:
            os.makedirs(self.workdir)
        except OSError:
            if not os.path.isdir(self.workdir):
                raise

        response = requests.get(url, stream=True)
        try:
            # Without this check an error page would be saved as the sitemap.
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=4048):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        finally:
            # A streamed response holds its connection until closed.
            response.close()
        return filepath

    def urls(self, sitemap_path):
        """Yield the text of every ``<loc>`` element in a sitemap file.

        Surrounding whitespace is stripped and empty ``<loc>`` elements are
        skipped, so every yielded value is a non-empty string.

        :param sitemap_path: path of a local sitemap XML file.
        """
        # Binary mode lets the XML parser honour the encoding named in the
        # XML declaration instead of decoding with the locale's encoding.
        with open(sitemap_path, 'rb') as f:
            sitemap = etree.parse(f)
        for loc in sitemap.iterfind(LOC):
            text = (loc.text or '').strip()
            if text:
                yield text
def main():
    """Command-line entry point.

    ``download`` fetches every URL listed in the given sitemap files into the
    work directory and prints each local path; ``urls`` only prints the URLs.
    Any other command is rejected by argparse with a usage error (exit
    status 2) instead of silently doing nothing.
    """
    parser = argparse.ArgumentParser(
        description='downloads sitemap and prints all urls')
    parser.add_argument(
        'command',
        choices=('download', 'urls'),
        help='download|urls')
    parser.add_argument(
        'sitemaps',
        nargs="+",
        help='main sitemap files. each file should have <loc> to gzip files.')
    parser.add_argument(
        '--workdir',
        help='temporary work directory [%(default)s]',
        default=os.path.abspath('./tmp'))
    args = parser.parse_args()

    # Both commands walk the same URLs; they differ only in what is printed.
    sitemap = Sitemap(args.workdir)
    for sitemap_path in args.sitemaps:
        for url in sitemap.urls(sitemap_path):
            if args.command == 'download':
                print(sitemap.download(url))
            else:
                print(url)


if __name__ == '__main__':
    main()
@lessandro
Copy link

Check results

E302:16:1:expected 2 blank lines, found 1
W293:19:1:blank line contains whitespace
E303:38:1:too many blank lines (3)
E501:39:80:line too long (89 > 79 characters)
E501:41:80:line too long (117 > 79 characters)
E501:42:80:line too long (117 > 79 characters)
W292:58:11:no newline at end of file

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment