Created
December 18, 2016 16:20
-
-
Save devzorg/a8d09a5f50bb579e48a23fbfd6cff52c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# use it with argument - url to sitemap, e.g. http://example.com/sitemap.xml | |
# it just prints out errors, use > fielname.log to logging into the file | |
# Py2/Py3 compatible imports. On Python 3 the error classes are defined in
# urllib.error, not urllib.request; importing them from urllib.request only
# works via an undocumented internal re-export, so import them from their
# documented home.
try:
    from urllib2 import urlopen, Request, HTTPError, URLError
except ImportError:
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError, URLError
import signal
import threading
from bs4 import BeautifulSoup
import time
import sys
# Set to a threading.Event by main(); crawler threads poll it to stop early
# when SIGINT is received (see ctrl_c()).
shutdown_event = None
# Sentinel value returned by Crawler.crawlUrls() and printed by main() on completion.
GAME_OVER = 1
# error codes to detect
error_list = [401,403, 404, 500, 502, 503]
# Accumulates every <loc> url pulled from the sitemap (filled by readSiteMap()).
pages = []
# Number of urls handed to each Crawler thread.
bulk_size = 10
# Extra crawler threads allowed on top of whatever is active at startup
# (main() rebinds this to an absolute thread-count ceiling).
threads = 3
def build_request(url, data=None, headers=None):
    """Build a urllib Request carrying the warmer's User-Agent header.

    Args:
        url: target url string.
        data: optional request body (None -> GET).
        headers: optional dict of extra headers; mutated in place to add
            the User-Agent, matching the original behavior for callers
            that pass their own dict.

    Returns:
        A Request object for `url`.
    """
    # The original used `headers={}` — a mutable default that is created
    # once and then mutated on every call, so all default-argument calls
    # shared (and polluted) one dict. Default to None and allocate fresh.
    if headers is None:
        headers = {}
    headers['User-Agent'] = 'Page Cache Warmer by MageDev'
    return Request(url, data=data, headers=headers)
def ctrl_c(signum, frame):
    """SIGINT handler: flag every crawler thread to stop, then abort.

    Registered via signal.signal() in main(); `signum` and `frame` are the
    standard handler arguments and are unused here.
    """
    global shutdown_event
    # Crawler.crawlUrls() checks this event between requests and bails out.
    shutdown_event.set()
    raise SystemExit('\nCancelling...')
def readSiteMap(url):
    """Fetch a sitemap XML document and collect every <loc> url.

    Appends each <loc> tag's text to the module-level `pages` list and
    returns that list. On a fetch failure the error is printed and
    whatever was collected so far is returned.

    Args:
        url: address of the sitemap, e.g. http://example.com/sitemap.xml
    """
    try:
        request = build_request(url)
        f = urlopen(request, timeout=3)
        try:
            xml = f.read()
        finally:
            # close even if read() fails; the original leaked the handle
            # on any exception after urlopen()
            f.close()
        soup = BeautifulSoup(xml, "html.parser")
        urlTags = soup.find_all("loc")
        print("The number of url tags in sitemap: ", str(len(urlTags)))
        for tag in urlTags:
            # Read each tag's own text. The original called
            # tag.findNext("loc") on every element, which skipped the
            # first <loc> in the sitemap and re-ran the search twice.
            if tag.text:
                pages.append(tag.text)
    except (HTTPError, URLError) as e:
        # Py2 `except HTTPError, URLError:` bound the exception object to
        # the *name* URLError and then read `.code` — which plain URLError
        # instances don't even have. HTTPError has .code; URLError only
        # has .reason.
        print(getattr(e, 'code', getattr(e, 'reason', e)), url)
    return pages
class Crawler(threading.Thread):
    """Worker thread that fetches a batch of urls and reports bad statuses."""

    def __init__(self, binarySemaphore, urls):
        # binarySemaphore is stored for interface compatibility with the
        # caller; it is never acquired by this class.
        self.binarySemaphore = binarySemaphore
        self.urls = urls
        threading.Thread.__init__(self)

    def run(self):
        try:
            self.crawlUrls(self.urls)
        except IOError:
            print("IOError")

    def crawlUrls(self, links):
        """Request each link; print thread id, status and url for any
        status code found in the global error_list.

        Returns GAME_OVER when the batch is done or shutdown_event is set.
        """
        for link in links:
            # isSet() is the deprecated alias of is_set()
            if shutdown_event.is_set():
                return GAME_OVER
            try:
                request = build_request(link)
                f = urlopen(request)
                try:
                    status_code = f.code
                finally:
                    # the original never closed the handle on error paths
                    f.close()
            except HTTPError as e:
                # HTTPError carries the HTTP status. The original's
                # `except HTTPError, URLError:` bound the exception to the
                # name URLError and read `.code` off it — wrong for both
                # classes.
                status_code = e.code
            except URLError:
                # connection-level failure: no HTTP status to classify,
                # report and move on instead of crashing on .code
                print('[' + str(self.ident) + ']', 'URLError', link)
                continue
            if int(status_code) in error_list:
                print('[' + str(self.ident) + ']', str(status_code), link)
        return GAME_OVER
def main():
    """Entry point: read the sitemap url from argv[1], fan the collected
    urls out to Crawler threads in bulk_size chunks, and throttle thread
    creation so at most `threads` extra workers run at once.
    """
    global shutdown_event, pages, threads
    # The original indexed sys.argv[1] unconditionally (IndexError when no
    # argument was given) and built SystemExit objects without raising them.
    if len(sys.argv) < 2 or not sys.argv[1]:
        raise SystemExit("No arguments. First argument should be sitemap.xml file. e.g. http://example.com/sitemap.xml\n")
    try:
        binarySemaphore = threading.Semaphore(1)
        shutdown_event = threading.Event()
        # turn the relative `threads` count into an absolute ceiling
        threads = threading.active_count() + threads
        signal.signal(signal.SIGINT, ctrl_c)
        urls = readSiteMap(sys.argv[1])
        if not urls:
            raise SystemExit('No urls was retrieved from sitemap')
        i = 0
        while i < len(urls):
            # Slice first, then advance. The original incremented i BEFORE
            # slicing, so urls[0:bulk_size] — the first chunk — was never
            # crawled at all.
            Crawler(binarySemaphore, urls[i:i + bulk_size]).start()
            i += bulk_size
            # throttle: wait until a worker slot frees up
            while threading.active_count() >= threads:
                time.sleep(.01)
        print(GAME_OVER)
    except KeyboardInterrupt:
        # the original called print_(), which is undefined (NameError)
        print('\nKeyboardInterrupt')
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment