Sitemap Generator
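A small sitemap generator in two Python files: crawler.py, which recursively walks a site and collects its internal links, and a command-line script that writes those links out as sitemap.xml.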
# crawler.py
import re
import urllib.request
from urllib.parse import urljoin, urlparse, urlsplit, urlunsplit


class Crawler:
    def __init__(self, url, exclude=None, no_verbose=False):
        self.url = self.normalize(url)
        self.host = urlparse(self.url).netloc
        self.exclude = exclude
        self.no_verbose = no_verbose
        self.found_links = []
        self.visited_links = [self.url]

    def start(self):
        self.crawl(self.url)
        return self.found_links

    def crawl(self, url):
        if not self.no_verbose:
            print("Parsing " + url)
        try:
            response = urllib.request.urlopen(url)
            # Decode the body; str() on the raw bytes would yield a "b'...'" literal.
            page = response.read().decode("utf-8", errors="ignore")
            # Capture the href value of every anchor tag.
            pattern = r'<a [^>]*href=["\'](.*?)["\']'
            found_links = re.findall(pattern, page)
        except Exception:
            found_links = []

        links = []
        for link in found_links:
            if self.is_url(link) and self.is_internal(link):
                self.add_url(link, links, self.exclude)
                self.add_url(link, self.found_links, self.exclude)

        for link in links:
            # Resolve relative links against the current page and normalize
            # before the visited check, so one page isn't crawled twice
            # under two spellings.
            link = self.normalize(urljoin(url, link))
            if link not in self.visited_links:
                self.visited_links.append(link)
                self.crawl(link)

    def add_url(self, link, link_list, exclude_pattern=None):
        link = self.normalize(link)
        if link:
            not_in_list = link not in link_list
            excluded = False
            if exclude_pattern:
                excluded = bool(re.search(exclude_pattern, link))
            if not_in_list and not excluded:
                link_list.append(link)

    def normalize(self, url):
        # Strip the fragment so "page" and "page#section" count as one URL.
        scheme, netloc, path, query, _fragment = urlsplit(url)
        return urlunsplit((scheme, netloc, path, query, ""))

    def is_internal(self, url):
        host = urlparse(url).netloc
        return host == self.host or host == ""

    def is_url(self, url):
        scheme = urlsplit(url).scheme
        return url != "" and scheme in ("http", "https", "")
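For a quick check without the command-line wrapper, the Crawler class can also be driven directly. A minimal sketch, assuming crawler.py is on the import path (example.com is a placeholder):

from crawler import Crawler

# Crawl a site and list the internal links that were found.
crawler = Crawler("https://example.com", exclude=None, no_verbose=False)
links = crawler.start()
print(len(links), "internal links found")
for link in links:
    print(link)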
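The second file is the command-line entry point: it parses the arguments, runs the crawler, and writes the collected links out as an XML sitemap.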
# main.py (filename assumed; the gist does not show it)
import argparse
from urllib.parse import urljoin

from crawler import Crawler

# initializing parameters
parser = argparse.ArgumentParser(description="Sitemap generator")
parser.add_argument("--url", action="store", default="",
                    help="Root URL to crawl, for example https://www.focusinfotech.com")
parser.add_argument("--exclude", action="store", default="",
                    help="Regex pattern to exclude; for example 'okay/lol' excludes "
                         "https://www.focusinfotech.com/symbol/okay/lol")
parser.add_argument("--no-verbose", action="store_true", default=False,
                    help="Suppress per-page progress output")
parser.add_argument("--output", action="store", default="sitemap.xml",
                    help="File path for output; if the file exists it will be overwritten")

# parsing parameters
args = parser.parse_args()
url = args.url.rstrip("/")

# initializing the crawler and fetching the links
crawler = Crawler(url, exclude=args.exclude, no_verbose=args.no_verbose)
links = crawler.start()

# writing the sitemap
with open(args.output, "w") as file:
    file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    file.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    for link in links:
        # Collected links may be absolute, root-relative, or page-relative;
        # resolve each one against the root URL. Whitespace inside <loc>
        # would corrupt the URL, so the value is written inline.
        loc = urljoin(url + "/", link)
        file.write("\n\t<url>\n\t\t<loc>{0}</loc>\n\t</url>".format(loc))
    file.write("\n</urlset>\n")
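Assuming the entry point is saved as main.py (the name is an assumption, as above), a typical run looks like this and produces one <url>/<loc> entry per crawled page:

python main.py --url https://www.focusinfotech.com --exclude "okay/lol" --output sitemap.xml

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	<url>
		<loc>https://www.focusinfotech.com/careers</loc>
	</url>
	...
</urlset>

The /careers path is illustrative; the actual entries depend on the links found during the crawl.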