Skip to content

Instantly share code, notes, and snippets.

@bmcculley
Forked from chrisguitarguy/xml.py
Last active January 10, 2022 02:52
Show Gist options
  • Save bmcculley/e3d269e149ad3b7c9e6a1f19e7f487d9 to your computer and use it in GitHub Desktop.
Save bmcculley/e3d269e149ad3b7c9e6a1f19e7f487d9 to your computer and use it in GitHub Desktop.
Parse an XML sitemap with Python, requests and BeautifulSoup4 (works with python3)
from argparse import ArgumentParser
import requests
from bs4 import BeautifulSoup
def _tag_text(entry, name, default='N/A'):
    """Return the stripped text of the first *name* tag under *entry*.

    Falls back to *default* when the tag is missing, has no single string
    child (``.string`` is ``None``), or contains only whitespace.
    """
    tag = entry.find(name)
    if tag is None or tag.string is None:
        return default
    # Convert to a plain ``str`` so the value does not keep a reference to
    # the parse tree, and strip so embedded newlines/indentation from the
    # XML cannot break the tab-separated, one-row-per-line output.
    text = str(tag.string).strip()
    return text or default


def parse_sitemap(url, timeout=30):
    """Download an XML sitemap and extract one row per ``<url>`` entry.

    Args:
        url: Address of the sitemap to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without one the request may block indefinitely.

    Returns:
        A list of ``[loc, priority, changefreq, lastmod]`` lists of plain
        strings, with ``'N/A'`` substituted for any optional value that is
        absent or empty. Returns ``False`` when the response status is not
        200 or when no usable ``<url>`` entry is found.

    Raises:
        requests.RequestException: propagated unchanged from
            ``requests.get`` (connection failure, timeout, invalid URL).
    """
    resp = requests.get(url, timeout=timeout)

    # we didn't get a valid response, bail
    if resp.status_code != 200:
        return False

    # parse the raw bytes with the XML parser
    soup = BeautifulSoup(resp.content, 'xml')

    # find all the <url> tags in the document
    entries = soup.find_all('url')

    # no urls? bail
    if not entries:
        return False

    rows = []
    for entry in entries:
        loc = _tag_text(entry, 'loc', default=None)
        # An entry without a location carries nothing usable; skip it
        # instead of crashing on a missing tag.
        if loc is None:
            continue
        rows.append([
            loc,
            _tag_text(entry, 'priority'),
            _tag_text(entry, 'changefreq'),
            _tag_text(entry, 'lastmod'),
        ])

    # Keep the "False means nothing to report" contract even when every
    # entry had to be skipped.
    return rows or False
if __name__ == '__main__':
    # Command-line entry point: fetch the sitemap given by --url and write
    # one tab-separated row per sitemap entry to the --output file.
    options = ArgumentParser()
    # --url is required: without it parse_sitemap would be called with None.
    options.add_argument('-u', '--url', action='store', dest='url', required=True, help='URL of the XML sitemap to parse')
    options.add_argument('-o', '--output', action='store', dest='out', default='out.txt', help='Where you would like to save the data')
    args = options.parse_args()

    urls = parse_sitemap(args.url)
    if not urls:
        # Stop here: continuing would try to iterate over False. SystemExit
        # with a string prints it to stderr and exits with status 1.
        raise SystemExit('There was an error!')

    # Open in text mode with an explicit encoding and write str values;
    # encoding each field to bytes cannot be joined with a str separator
    # on Python 3.
    with open(args.out, 'w', encoding='utf-8') as out:
        for row in urls:
            # Guard against None fields (empty tags) so join never fails.
            fields = ['N/A' if field is None else str(field) for field in row]
            out.write('\t'.join(fields) + '\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment