@PandaWhoCodes
Created January 27, 2020 18:40
Sitemap Generator
# crawler.py
import re
import urllib.request
from urllib.parse import urlsplit, urlunsplit, urljoin, urlparse


class Crawler:
    def __init__(self, url, exclude=None, no_verbose=False):
        self.url = self.normalize(url)
        self.host = urlparse(self.url).netloc
        self.exclude = exclude
        self.no_verbose = no_verbose
        self.found_links = []
        self.visited_links = [self.url]

    def start(self):
        """Crawl from the root URL and return every internal link found."""
        self.crawl(self.url)
        return self.found_links

    def crawl(self, url):
        if not self.no_verbose:
            print("Parsing " + url)
        try:
            response = urllib.request.urlopen(url)
            page = response.read().decode("utf-8", errors="ignore")
            # Capture the href value of every anchor tag on the page.
            pattern = '<a [^>]*href=[\'"](.*?)[\'"].*?>'
            found_links = re.findall(pattern, page)
        except Exception:
            # Unreachable or unparsable page: skip it and carry on.
            found_links = []

        links = []
        for link in found_links:
            # Keep only http(s)/relative links that stay on the same host.
            if self.is_url(link) and self.is_internal(link):
                self.add_url(link, links, self.exclude)
                self.add_url(link, self.found_links, self.exclude)

        # Recurse into pages that have not been visited yet.
        for link in links:
            if link not in self.visited_links:
                link = self.normalize(link)
                self.visited_links.append(link)
                self.crawl(urljoin(self.url, link))

    def add_url(self, link, link_list, exclude_pattern=None):
        link = self.normalize(link)
        if link:
            not_in_list = link not in link_list
            excluded = False
            if exclude_pattern:
                excluded = bool(re.search(exclude_pattern, link))
            if not_in_list and not excluded:
                link_list.append(link)

    def normalize(self, url):
        # Drop the fragment so "/page#section" and "/page" count as the same URL.
        scheme, netloc, path, qs, _anchor = urlsplit(url)
        return urlunsplit((scheme, netloc, path, qs, ""))

    def is_internal(self, url):
        host = urlparse(url).netloc
        return host == self.host or host == ""

    def is_url(self, url):
        scheme, _netloc, _path, _qs, _anchor = urlsplit(url)
        return url != "" and scheme in ("http", "https", "")
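A minimal sketch of driving the Crawler class directly from Python, assuming the class above is saved as crawler.py (the import in the CLI script below implies that filename); the URL and exclude pattern here are only illustrative:

from crawler import Crawler

# Illustrative values: any reachable site root and any regex will do.
crawler = Crawler("https://www.focusinfotech.com", exclude="okay/lol", no_verbose=True)
for link in crawler.start():  # internal links discovered on the site
    print(link)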
# CLI script (a separate file in the gist; imports the Crawler class from crawler.py)
import argparse

from crawler import Crawler

# Define the command-line parameters.
parser = argparse.ArgumentParser(description="Sitemap generator")
parser.add_argument('--url', action="store", default="", help="Root URL to crawl, for example https://www.focusinfotech.com")
parser.add_argument('--exclude', action="store", default="", help="Regex pattern to exclude. For example 'okay/lol' will exclude https://www.focusinfotech.com/symbol/okay/lol")
parser.add_argument('--no-verbose', action="store_true", default=False, help="Suppress the per-page progress output")
parser.add_argument('--output', action="store", default="sitemap.xml", help="File path for the output; if the file exists it will be overwritten")

# Parse the parameters.
args = parser.parse_args()
url = args.url.rstrip("/")

# Initialize the crawler and collect the internal links.
crawler = Crawler(url, exclude=args.exclude, no_verbose=args.no_verbose)
links = crawler.start()

# Write the links into the sitemap file.
with open(args.output, "w") as file:
    file.write('<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    for link in links:
        file.write("\n\t<url>\n\t\t<loc>{0}{1}/</loc>\n\t</url>".format(url, link))
    file.write("\n</urlset>")
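To generate a sitemap from the command line, save the script above as its own file next to crawler.py (the gist does not name it; generate_sitemap.py is assumed here) and run, for example:

python generate_sitemap.py --url https://www.focusinfotech.com --exclude "okay/lol" --output sitemap.xml

Unless --no-verbose is passed, the crawler prints "Parsing <url>" for each page it fetches, and the resulting sitemap.xml contains one <url>/<loc> entry per internal link it discovered.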