Sitemap Generator
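A small sitemap generator in two Python files: crawler.py, which recursively walks a site and collects its internal links, and a command-line script that writes those links out as sitemap.xml.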
# crawler.py
import re
import urllib.request
from urllib.parse import urljoin, urlparse, urlsplit, urlunsplit


class Crawler:
    def __init__(self, url, exclude=None, no_verbose=False):
        self.url = self.normalize(url)
        self.host = urlparse(self.url).netloc
        self.exclude = exclude
        self.no_verbose = no_verbose
        self.found_links = []
        self.visited_links = [self.url]

    def start(self):
        self.crawl(self.url)
        return self.found_links

    def crawl(self, url):
        if not self.no_verbose:
            print("Parsing " + url)
        try:
            response = urllib.request.urlopen(url)
            # Decode the body; str() on the raw bytes would yield a "b'...'" literal.
            page = response.read().decode("utf-8", errors="ignore")
            # Capture the href value of every anchor tag.
            pattern = r'<a [^>]*href=["\'](.*?)["\']'
            found_links = re.findall(pattern, page)
        except Exception:
            found_links = []

        links = []
        for link in found_links:
            if self.is_url(link) and self.is_internal(link):
                self.add_url(link, links, self.exclude)
                self.add_url(link, self.found_links, self.exclude)

        for link in links:
            # Resolve relative links against the current page and normalize
            # before the visited check, so one page isn't crawled twice
            # under two spellings.
            link = self.normalize(urljoin(url, link))
            if link not in self.visited_links:
                self.visited_links.append(link)
                self.crawl(link)

    def add_url(self, link, link_list, exclude_pattern=None):
        link = self.normalize(link)
        if link:
            not_in_list = link not in link_list
            excluded = False
            if exclude_pattern:
                excluded = bool(re.search(exclude_pattern, link))
            if not_in_list and not excluded:
                link_list.append(link)

    def normalize(self, url):
        # Strip the fragment so "page" and "page#section" count as one URL.
        scheme, netloc, path, query, _fragment = urlsplit(url)
        return urlunsplit((scheme, netloc, path, query, ""))

    def is_internal(self, url):
        host = urlparse(url).netloc
        return host == self.host or host == ""

    def is_url(self, url):
        scheme = urlsplit(url).scheme
        return url != "" and scheme in ("http", "https", "")
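For a quick check without the command-line wrapper, the Crawler class can also be driven directly. A minimal sketch, assuming crawler.py is on the import path (example.com is a placeholder):

from crawler import Crawler

# Crawl a site and list the internal links that were found.
crawler = Crawler("https://example.com", exclude=None, no_verbose=False)
links = crawler.start()
print(len(links), "internal links found")
for link in links:
    print(link)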
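The second file is the command-line entry point: it parses the arguments, runs the crawler, and writes the collected links out as an XML sitemap.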
# main.py (filename assumed; the gist does not show it)
import argparse
from urllib.parse import urljoin

from crawler import Crawler

# initializing parameters
parser = argparse.ArgumentParser(description="Sitemap generator")
parser.add_argument("--url", action="store", default="",
                    help="Root URL to crawl, for example https://www.focusinfotech.com")
parser.add_argument("--exclude", action="store", default="",
                    help="Regex pattern to exclude; for example 'okay/lol' excludes "
                         "https://www.focusinfotech.com/symbol/okay/lol")
parser.add_argument("--no-verbose", action="store_true", default=False,
                    help="Suppress per-page progress output")
parser.add_argument("--output", action="store", default="sitemap.xml",
                    help="File path for output; if the file exists it will be overwritten")

# parsing parameters
args = parser.parse_args()
url = args.url.rstrip("/")

# initializing the crawler and fetching the links
crawler = Crawler(url, exclude=args.exclude, no_verbose=args.no_verbose)
links = crawler.start()

# writing the sitemap
with open(args.output, "w") as file:
    file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    file.write('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    for link in links:
        # Collected links may be absolute, root-relative, or page-relative;
        # resolve each one against the root URL. Whitespace inside <loc>
        # would corrupt the URL, so the value is written inline.
        loc = urljoin(url + "/", link)
        file.write("\n\t<url>\n\t\t<loc>{0}</loc>\n\t</url>".format(loc))
    file.write("\n</urlset>\n")
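Assuming the entry point is saved as main.py (the name is an assumption, as above), a typical run looks like this and produces one <url>/<loc> entry per crawled page:

python main.py --url https://www.focusinfotech.com --exclude "okay/lol" --output sitemap.xml

<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
	<url>
		<loc>https://www.focusinfotech.com/careers</loc>
	</url>
	...
</urlset>

The /careers path is illustrative; the actual entries depend on the links found during the crawl.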