Skip to content

Instantly share code, notes, and snippets.

@daskol
Last active July 11, 2019 22:16
Show Gist options
  • Save daskol/ae71b6af8a43f419914b25521ee26eb9 to your computer and use it in GitHub Desktop.
Save daskol/ae71b6af8a43f419914b25521ee26eb9 to your computer and use it in GitHub Desktop.
Simple script to generate sitemap for a specified domain.
#!/usr/bin/env python3
# encoding: utf8
# filename: gensitemap.py
"""Simple script to generate sitemap for a specified domain. It traverse all
HTML files from a given root directory and build URLs.
"""
import logging
import xml.etree.ElementTree as etree
from argparse import ArgumentParser, Namespace
from datetime import datetime
from os import walk
from os.path import join, getmtime, getsize, realpath, relpath
# XML namespace placed on the root <urlset> element of the generated sitemap.
NAMESPACE = 'http://www.sitemaps.org/schemas/sitemap/0.9'

# Command-line interface; the module docstring doubles as the help epilog.
parser = ArgumentParser(epilog=__doc__)

# Optional flag: host name inserted into every generated URL.
parser.add_argument(
    '-d', '--domain',
    help='domain to use in urls',
    default='daskol.xyz',
)

# Required positional: directory that is scanned for HTML files.
parser.add_argument(
    'rootdir',
    metavar='ROOT-DIRECTORY',
    help='content directory',
)

# Optional positional: where the sitemap is written.
parser.add_argument(
    'sitemap',
    nargs='?',
    metavar='SITEMAP',
    help='output sitemap',
    default='sitemap.xml',
)
def main(args: Namespace):
    """Build a sitemap for every HTML file found under a root directory.

    The directory tree rooted at ``args.rootdir`` is walked recursively. For
    each file whose name ends with ``.html`` a ``<url>`` element is appended
    to the ``<urlset>`` root with three children:

    * ``<loc>`` -- ``https://<domain>/<path relative to rootdir>``;
    * ``<lastmod>`` -- the file modification date as ``YYYY-MM-DD``;
    * ``<priority>`` -- ``1`` for ``index.html`` files and ``0.5`` otherwise.

    The resulting document is written to ``args.sitemap`` as UTF-8 with an
    XML declaration. Progress is reported through the root logger, which is
    configured here at INFO level.

    Args:
        args: Parsed command-line arguments; the attributes ``rootdir``,
            ``domain`` and ``sitemap`` are read.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    logging.info('create root element')
    urlset = etree.Element('urlset', {'xmlns': NAMESPACE})

    logging.info('walk directory %s', realpath(args.rootdir))
    for curdir, _, filenames in walk(args.rootdir):
        for filename in filenames:
            if not filename.endswith('.html'):
                continue

            filepath = join(curdir, filename)
            logging.info('processing file %s', filepath)

            # Path as it should appear in the URL, i.e. relative to the root.
            path = relpath(filepath, args.rootdir)

            # Stat the file through its real location: `path` is relative to
            # the root directory and resolves only if that is also the
            # current working directory.
            date = datetime.fromtimestamp(getmtime(filepath)).date()
            rank = 1 if filename == 'index.html' else 0.5

            url = etree.SubElement(urlset, 'url')

            loc = etree.SubElement(url, 'loc')
            loc.text = f'https://{args.domain}/{path}'

            # The sitemap protocol expects W3C date format: year-month-day.
            lastmod = etree.SubElement(url, 'lastmod')
            lastmod.text = date.strftime('%Y-%m-%d')

            priority = etree.SubElement(url, 'priority')
            priority.text = str(rank)

    logging.info('write XML document to %s', args.sitemap)
    sitemap = etree.ElementTree(urlset)
    sitemap.write(args.sitemap, xml_declaration=True, encoding='utf-8')

    logging.info('size of sitemap file is %s bytes', getsize(args.sitemap))
    logging.info('done.')
# Script entry point: parse the command line, then generate the sitemap.
if __name__ == '__main__':
    cli_args = parser.parse_args()
    main(cli_args)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment