Skip to content

Instantly share code, notes, and snippets.

@devzorg
Created December 18, 2016 16:20
Show Gist options
  • Save devzorg/a8d09a5f50bb579e48a23fbfd6cff52c to your computer and use it in GitHub Desktop.
Save devzorg/a8d09a5f50bb579e48a23fbfd6cff52c to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# use it with argument - url to sitemap, e.g. http://example.com/sitemap.xml
# it just prints out errors; use `> filename.log` to log to a file
# Py2/Py3 compatibility shim.
# Bug fix: in Python 3, HTTPError and URLError live in urllib.error, not
# urllib.request — the original fallback itself raised ImportError on Py3.
try:
    from urllib2 import urlopen, Request, HTTPError, URLError
except ImportError:
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError, URLError
import signal
import threading
from bs4 import BeautifulSoup
import time
import sys

# Event set by the SIGINT handler so worker threads stop promptly;
# created in main() once threading is set up.
shutdown_event = None
GAME_OVER = 1
# HTTP status codes treated as errors and reported to stdout.
error_list = [401, 403, 404, 500, 502, 503]
# URLs collected from the sitemap (filled by readSiteMap).
pages = []
# Number of URLs handed to each crawler thread.
bulk_size = 10
# Extra crawler threads allowed beyond those already running.
threads = 3
def build_request(url, data=None, headers=None):
    """Build a urllib Request for *url* carrying the crawler's User-Agent.

    Bug fix: the original used a mutable default argument (``headers={}``),
    which is shared across all calls — any header set in one call leaked
    into every later call, and the caller's dict was mutated. Use ``None``
    as the sentinel and copy the caller's mapping instead.
    """
    headers = dict(headers) if headers else {}
    headers['User-Agent'] = 'Page Cache Warmer by MageDev'
    return Request(url, data=data, headers=headers)
def ctrl_c(signum, frame):
    """SIGINT handler: tell every crawler thread to stop, then exit."""
    global shutdown_event
    # Each Crawler checks this event at the top of its loop and bails out.
    shutdown_event.set()
    # SystemExit unwinds the main thread cleanly.
    raise SystemExit('\nCancelling...')
def readSiteMap(url):
    """Fetch a sitemap and return the URLs found in its <loc> tags.

    Appends into the module-level ``pages`` list and returns it (matching
    the original behavior of accumulating into the global).

    Bug fixes vs. original:
    - ``except HTTPError, URLError:`` was Py2-only syntax and never caught
      URLError at all (it merely bound the HTTPError instance to the name
      ``URLError``); use a proper exception tuple with ``as``.
    - ``sitemap.findNext("loc")`` returned the *next* tag, so the first URL
      in the sitemap was always skipped; each tag's own text is the URL.
    - The response handle leaked if parsing raised; close it in ``finally``.
    """
    try:
        request = build_request(url)
        f = urlopen(request, timeout=3)
        try:
            xml = f.read()
        finally:
            f.close()
        soup = BeautifulSoup(xml, "html.parser")
        urlTags = soup.find_all("loc")
        print("The number of url tags in sitemap: %d" % len(urlTags))
        for loc in urlTags:
            pages.append(loc.text)
    except (HTTPError, URLError) as e:
        # URLError (DNS failure, refused connection, ...) has no .code;
        # fall back to its .reason rather than crashing.
        print("%s %s" % (getattr(e, 'code', getattr(e, 'reason', 'error')), url))
    return pages
class Crawler(threading.Thread):
    """Worker thread that fetches a batch of URLs and prints error statuses."""

    def __init__(self, binarySemaphore, urls):
        # binarySemaphore is kept for interface compatibility; it is never
        # acquired anywhere in this script.
        self.binarySemaphore = binarySemaphore
        self.urls = urls
        threading.Thread.__init__(self)

    def run(self):
        try:
            self.crawlUrls(self.urls)
        except IOError:
            print("IOError")

    def crawlUrls(self, links):
        """Fetch each link; report [thread-id] status url for codes in error_list.

        Bug fixes vs. original:
        - ``except HTTPError, URLError:`` was Py2-only and never caught
          URLError (it bound the HTTPError instance to the name ``URLError``).
        - ``URLError.code`` crashed for genuine URLErrors, which carry no
          HTTP code; such failures are now reported and skipped.
        - The response is closed even if reading ``.code`` raises.
        """
        for link in links:
            # Cooperative shutdown requested by the SIGINT handler.
            if shutdown_event.is_set():
                return GAME_OVER
            try:
                request = build_request(link)
                f = urlopen(request)
                try:
                    status_code = f.code
                finally:
                    f.close()
            except (HTTPError, URLError) as e:
                status_code = getattr(e, 'code', None)
                if status_code is None:
                    # Network-level failure (no HTTP status): report and move on
                    # instead of crashing on int() below.
                    print('[%s] %s %s' % (self.ident, getattr(e, 'reason', e), link))
                    continue
            if int(status_code) in error_list:
                print('[%s] %s %s' % (self.ident, status_code, link))
        return GAME_OVER
def main():
    """Read the sitemap URL from argv and crawl every page in threaded batches.

    Bug fixes vs. original:
    - ``sys.argv[1]`` raised IndexError when no argument was given; check
      ``len(sys.argv)`` first.
    - ``SystemExit(...)`` was instantiated but never raised (twice), so the
      script carried on after both "fatal" checks.
    - The batching loop incremented ``i`` *before* slicing, so the first
      ``bulk_size`` URLs were never crawled; iterate with a stepped range.
    - The KeyboardInterrupt handler called the undefined name ``print_``.
    """
    global shutdown_event, pages, threads
    if len(sys.argv) < 2 or not sys.argv[1]:
        raise SystemExit("No arguments. First argument should be sitemap.xml "
                         "file. e.g. http://example.com/sitemap.xml\n")
    try:
        binarySemaphore = threading.Semaphore(1)
        shutdown_event = threading.Event()
        # Allow `threads` crawler threads on top of whatever is already running.
        threads = threading.active_count() + threads
        signal.signal(signal.SIGINT, ctrl_c)
        urls = readSiteMap(sys.argv[1])
        if not urls:
            raise SystemExit('No urls was retrieved from sitemap')
        for i in range(0, len(urls), bulk_size):
            Crawler(binarySemaphore, urls[i:i + bulk_size]).start()
            # Throttle: wait until a worker slot frees up before spawning more.
            while threading.active_count() >= threads:
                time.sleep(.01)
        print(GAME_OVER)
    except KeyboardInterrupt:
        print('\nKeyboardInterrupt')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment