Last active
July 11, 2019 22:16
-
-
Save daskol/ae71b6af8a43f419914b25521ee26eb9 to your computer and use it in GitHub Desktop.
Simple script to generate sitemap for a specified domain.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # encoding: utf8 | |
| # filename: gensitemap.py | |
| """Simple script to generate sitemap for a specified domain. It traverses all | |
| HTML files from a given root directory and builds URLs. | |
| """ | |
| import logging | |
| import xml.etree.ElementTree as etree | |
| from argparse import ArgumentParser, Namespace | |
| from datetime import datetime | |
| from os import walk | |
| from os.path import join, getmtime, getsize, realpath, relpath | |
# XML namespace placed on the root <urlset> element of the sitemap.
NAMESPACE = 'http://www.sitemaps.org/schemas/sitemap/0.9'

# Command-line interface; the module docstring is shown as the help epilog.
parser = ArgumentParser(epilog=__doc__)

# Optional domain used as the host part of every generated URL.
parser.add_argument(
    '-d',
    '--domain',
    help='domain to use in urls',
    default='daskol.xyz',
)

# Required positional: directory that is scanned for HTML files.
parser.add_argument(
    'rootdir',
    metavar='ROOT-DIRECTORY',
    help='content directory',
)

# Optional positional: where the sitemap is written.
parser.add_argument(
    'sitemap',
    nargs='?',
    metavar='SITEMAP',
    help='output sitemap',
    default='sitemap.xml',
)
def main(args: Namespace):
    """Generate a sitemap covering every HTML file below ``args.rootdir``.

    The root directory is walked recursively. For each ``*.html`` file one
    ``<url>`` element is appended to the ``<urlset>`` root, holding:

    * ``<loc>`` -- ``https://<domain>/<path relative to rootdir>``;
    * ``<lastmod>`` -- the file's modification date as ``YYYY-MM-DD``;
    * ``<priority>`` -- ``1`` for ``index.html`` files, ``0.5`` otherwise.

    The document is then written to ``args.sitemap`` as UTF-8 XML with an
    XML declaration.

    Args:
        args: Parsed command-line options; the attributes ``domain``,
            ``rootdir`` and ``sitemap`` are read.

    Raises:
        OSError: If a file cannot be stat'ed or the sitemap cannot be
            written.
    """
    # Local import: only needed here to normalise path separators in URLs.
    from os import sep

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    logging.info('create root element')
    urlset = etree.Element('urlset', {'xmlns': NAMESPACE})

    logging.info('walk directory %s', realpath(args.rootdir))
    for curdir, dirnames, filenames in walk(args.rootdir):
        # Sort in place so walk() descends in a stable order and the
        # generated sitemap is reproducible between runs.
        dirnames.sort()

        for filename in sorted(filenames):
            if not filename.endswith('.html'):
                continue

            filepath = join(curdir, filename)
            logging.info('processing file %s', filepath)

            # URL paths always use forward slashes, whatever the OS uses.
            path = relpath(filepath, args.rootdir).replace(sep, '/')

            # Stat the real file location: the root-relative path only
            # resolves when the current directory happens to be rootdir.
            date = datetime.fromtimestamp(getmtime(filepath)).date()
            rank = 1 if filename == 'index.html' else 0.5

            url = etree.SubElement(urlset, 'url')
            loc = etree.SubElement(url, 'loc')
            loc.text = f'https://{args.domain}/{path}'
            lastmod = etree.SubElement(url, 'lastmod')
            # Sitemap protocol expects W3C date format: YYYY-MM-DD.
            lastmod.text = date.isoformat()
            priority = etree.SubElement(url, 'priority')
            priority.text = str(rank)

    logging.info('write XML document to %s', args.sitemap)
    sitemap = etree.ElementTree(urlset)
    sitemap.write(args.sitemap, xml_declaration=True, encoding='utf-8')

    logging.info('size of sitemap file is %s bytes', getsize(args.sitemap))
    logging.info('done.')
# Script entry point: parse the command line, then build the sitemap.
if __name__ == '__main__':
    cli_args = parser.parse_args()
    main(cli_args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment