Skip to content

Instantly share code, notes, and snippets.

@devzorg
Created December 18, 2016 16:20
Show Gist options
  • Save devzorg/a8d09a5f50bb579e48a23fbfd6cff52c to your computer and use it in GitHub Desktop.
Save devzorg/a8d09a5f50bb579e48a23fbfd6cff52c to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# use it with argument - url to sitemap, e.g. http://example.com/sitemap.xml
# it just prints out errors; use `> filename.log` to log to a file
# Py2/Py3 compatibility shim.
# Bug fix: in Python 3, HTTPError and URLError live in urllib.error, not
# urllib.request — the original fallback itself raised ImportError on Py3.
try:
    from urllib2 import urlopen, Request, HTTPError, URLError
except ImportError:
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError, URLError
import signal
import threading
from bs4 import BeautifulSoup
import time
import sys

# Event set by the SIGINT handler so worker threads stop promptly;
# created in main() once threading is set up.
shutdown_event = None
GAME_OVER = 1
# HTTP status codes treated as errors and reported to stdout.
error_list = [401, 403, 404, 500, 502, 503]
# URLs collected from the sitemap (filled by readSiteMap).
pages = []
# Number of URLs handed to each crawler thread.
bulk_size = 10
# Extra crawler threads allowed beyond those already running.
threads = 3
def build_request(url, data=None, headers=None):
    """Build a urllib Request for *url* carrying the crawler's User-Agent.

    Bug fix: the original used a mutable default argument (``headers={}``),
    which is shared across all calls — any header set in one call leaked
    into every later call, and the caller's dict was mutated. Use ``None``
    as the sentinel and copy the caller's mapping instead.
    """
    headers = dict(headers) if headers else {}
    headers['User-Agent'] = 'Page Cache Warmer by MageDev'
    return Request(url, data=data, headers=headers)
def ctrl_c(signum, frame):
    """SIGINT handler: tell every crawler thread to stop, then exit."""
    global shutdown_event
    # Each Crawler checks this event at the top of its loop and bails out.
    shutdown_event.set()
    # SystemExit unwinds the main thread cleanly.
    raise SystemExit('\nCancelling...')
def readSiteMap(url):
    """Fetch a sitemap and return the URLs found in its <loc> tags.

    Appends into the module-level ``pages`` list and returns it (matching
    the original behavior of accumulating into the global).

    Bug fixes vs. original:
    - ``except HTTPError, URLError:`` was Py2-only syntax and never caught
      URLError at all (it merely bound the HTTPError instance to the name
      ``URLError``); use a proper exception tuple with ``as``.
    - ``sitemap.findNext("loc")`` returned the *next* tag, so the first URL
      in the sitemap was always skipped; each tag's own text is the URL.
    - The response handle leaked if parsing raised; close it in ``finally``.
    """
    try:
        request = build_request(url)
        f = urlopen(request, timeout=3)
        try:
            xml = f.read()
        finally:
            f.close()
        soup = BeautifulSoup(xml, "html.parser")
        urlTags = soup.find_all("loc")
        print("The number of url tags in sitemap: %d" % len(urlTags))
        for loc in urlTags:
            pages.append(loc.text)
    except (HTTPError, URLError) as e:
        # URLError (DNS failure, refused connection, ...) has no .code;
        # fall back to its .reason rather than crashing.
        print("%s %s" % (getattr(e, 'code', getattr(e, 'reason', 'error')), url))
    return pages
class Crawler(threading.Thread):
    """Worker thread that fetches a batch of URLs and prints error statuses."""

    def __init__(self, binarySemaphore, urls):
        # binarySemaphore is kept for interface compatibility; it is never
        # acquired anywhere in this script.
        self.binarySemaphore = binarySemaphore
        self.urls = urls
        threading.Thread.__init__(self)

    def run(self):
        try:
            self.crawlUrls(self.urls)
        except IOError:
            print("IOError")

    def crawlUrls(self, links):
        """Fetch each link; report [thread-id] status url for codes in error_list.

        Bug fixes vs. original:
        - ``except HTTPError, URLError:`` was Py2-only and never caught
          URLError (it bound the HTTPError instance to the name ``URLError``).
        - ``URLError.code`` crashed for genuine URLErrors, which carry no
          HTTP code; such failures are now reported and skipped.
        - The response is closed even if reading ``.code`` raises.
        """
        for link in links:
            # Cooperative shutdown requested by the SIGINT handler.
            if shutdown_event.is_set():
                return GAME_OVER
            try:
                request = build_request(link)
                f = urlopen(request)
                try:
                    status_code = f.code
                finally:
                    f.close()
            except (HTTPError, URLError) as e:
                status_code = getattr(e, 'code', None)
                if status_code is None:
                    # Network-level failure (no HTTP status): report and move on
                    # instead of crashing on int() below.
                    print('[%s] %s %s' % (self.ident, getattr(e, 'reason', e), link))
                    continue
            if int(status_code) in error_list:
                print('[%s] %s %s' % (self.ident, status_code, link))
        return GAME_OVER
def main():
    """Read the sitemap URL from argv and crawl every page in threaded batches.

    Bug fixes vs. original:
    - ``sys.argv[1]`` raised IndexError when no argument was given; check
      ``len(sys.argv)`` first.
    - ``SystemExit(...)`` was instantiated but never raised (twice), so the
      script carried on after both "fatal" checks.
    - The batching loop incremented ``i`` *before* slicing, so the first
      ``bulk_size`` URLs were never crawled; iterate with a stepped range.
    - The KeyboardInterrupt handler called the undefined name ``print_``.
    """
    global shutdown_event, pages, threads
    if len(sys.argv) < 2 or not sys.argv[1]:
        raise SystemExit("No arguments. First argument should be sitemap.xml "
                         "file. e.g. http://example.com/sitemap.xml\n")
    try:
        binarySemaphore = threading.Semaphore(1)
        shutdown_event = threading.Event()
        # Allow `threads` crawler threads on top of whatever is already running.
        threads = threading.active_count() + threads
        signal.signal(signal.SIGINT, ctrl_c)
        urls = readSiteMap(sys.argv[1])
        if not urls:
            raise SystemExit('No urls was retrieved from sitemap')
        for i in range(0, len(urls), bulk_size):
            Crawler(binarySemaphore, urls[i:i + bulk_size]).start()
            # Throttle: wait until a worker slot frees up before spawning more.
            while threading.active_count() >= threads:
                time.sleep(.01)
        print(GAME_OVER)
    except KeyboardInterrupt:
        print('\nKeyboardInterrupt')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment