hn-support · May 21, 2025 03:37 · tdgroot · Sep 23, 2019 · hongxy · May 6, 2020
diff --git a/cache-warmer.py b/cache-warmer.py
 #!/usr/bin/env python
 """
 Warm the caches of your website by crawling each page defined in sitemap.xml.
 To use, download this file and make it executable. Then run:
 ./cache-warmer.py --threads 4 --file /data/web/public/sitemap.xml -v
 """
 import argparse
 import multiprocessing.pool as mpool
 import os.path
 import re
 import sys
 import time
 import requests
 import subprocess


 # Crawl outcomes shared across the module: callback() appends to this list
 # and make_results() reads it to build the final report.
 results = []
 # Timestamp taken when the module is loaded; make_results() reports the
 # elapsed time relative to it.
 start = time.time()


 def parse_options():
    """Parse and validate the command line arguments.

    Returns:
        The argparse namespace with ``threads`` (int, default 10), ``file``
        (path of an existing sitemap file) and ``verbose`` (bool).

    Exits through ``parser.error`` (usage message, status 2) when fewer than
    one thread is requested or when the sitemap file does not exist.
    """
    parser = argparse.ArgumentParser(description="""Cache crawler based on a sitemap.xml file""")
    parser.add_argument('-t', '--threads', help='How many threads to use', default=10, required=False, type=int)
    parser.add_argument('-f', '--file', help='The sitemap xml file', required=True, type=str)
    parser.add_argument('-v', '--verbose', help='Be more verbose', action='store_true', default=False)

    args = parser.parse_args()
    # A thread pool cannot be created with fewer than one worker; reject the
    # value here with a usage message instead of failing later with a traceback.
    if args.threads < 1:
        parser.error('Number of threads must be at least 1, got %d' % args.threads)
    if not os.path.isfile(args.file):
        parser.error('Could not find sitemap file %s' % args.file)
    return args


 def crawl_url(url, verbose=False, timeout=60):
    """Request a single URL so the server builds (warms) its cached copy.

    Args:
        url: Address to fetch.
        verbose: When true, print the URL before requesting it.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the worker indefinitely.

    Returns:
        A dict with the keys 'exit' (0 when ``response.ok`` is true,
        otherwise 1), 'out' (the response text, or the error description
        when the request itself failed) and 'url'.
    """
    if verbose:
        print("Crawling {}".format(url))
    try:
        response = requests.get(url, headers={"user-agent": "SitemapCacheWarmer"}, timeout=timeout)
    except requests.exceptions.RequestException as exc:
        # Connection errors, timeouts and invalid URLs are reported as a
        # failed crawl instead of escaping the worker, where the failure
        # would never reach the results list.
        return {'exit': 1, 'out': str(exc), 'url': url}
    return {'exit': 0 if response.ok else 1, 'out': response.text, 'url': url}


 def make_results():
    """Print the crawl report and return the process exit status.

    Reads the module-level ``results`` list and ``start`` timestamp. Each
    entry whose 'exit' value is not 0 is printed with its URL and output,
    followed by a separator line, and then a summary line is printed.

    Returns:
        0 when no entry failed, otherwise 1.
    """
    elapsed = format(time.time() - start, '.4f')
    failures = [entry for entry in results if not entry['exit'] == 0]

    for entry in failures:
        print("Errors detected in %s:\n%s\n" % (entry['url'], entry['out']))
        print("=" * 50)

    if failures:
        print("%d Errors detected! - done in %ss" % (len(failures), elapsed))
        return 1

    print("All DONE! - All urls are warmed! - done in %s " % elapsed)
    return 0


 def get_sitemap_urls(filename):
    """Extract the URLs listed in the <loc> elements of a sitemap file.

    Args:
        filename: Path of the sitemap XML file.

    Returns:
        A list of URL strings in document order, with surrounding
        whitespace removed and XML entities (for example ``&amp;``) decoded.
        Empty <loc> elements are skipped.
    """
    # Function-scope imports: these names are not imported at module level.
    import io
    from xml.sax.saxutils import unescape

    # The sitemap protocol specifies UTF-8, so do not rely on the locale's
    # default encoding.
    with io.open(filename, encoding='utf-8') as fh:
        content = fh.read()

    # DOTALL lets a <loc> value that is wrapped across lines still match.
    raw_locations = re.findall(r'<loc>(.*?)</loc>', content, re.DOTALL)

    # URLs in a sitemap are entity-escaped; requesting them undecoded would
    # hit a different address (e.g. "?a=1&amp;b=2" instead of "?a=1&b=2").
    quotes = {'&apos;': "'", '&quot;': '"'}
    stripped = (raw.strip() for raw in raw_locations)
    return [unescape(location, quotes) for location in stripped if location]


 def callback(output):
    """Store one crawl outcome in the module-level ``results`` list.

    Passed as the ``callback`` argument of ``pool.apply_async`` in ``main``.
    """
    results.extend([output])


 def main():
    """Parse the options, crawl every sitemap URL on a thread pool, report.

    Terminates the process through ``sys.exit`` with the status returned by
    ``make_results`` (0 when every URL was crawled successfully, else 1).
    """
    args = parse_options()
    sitemap_urls = get_sitemap_urls(args.file)

    if args.verbose:
        print("Crawling {} urls with {} threads\n[Please Wait!]".format(len(sitemap_urls), args.threads))
        print("=" * 50)

    pool = mpool.ThreadPool(args.threads)
    pending = []
    for url in sitemap_urls:
        # Forward the verbose flag; without it crawl_url never prints the
        # URL it is working on.
        async_result = pool.apply_async(
            crawl_url, args=(url,), kwds={'verbose': args.verbose}, callback=callback)
        pending.append((url, async_result))
    pool.close()
    pool.join()

    # The success callback is not invoked for a task that raised, so such a
    # URL would be missing from the report and the run could be declared
    # clean. Collect those failures explicitly.
    for url, async_result in pending:
        try:
            async_result.get()
        except Exception as exc:
            results.append({'exit': 1, 'out': repr(exc), 'url': url})

    sys.exit(make_results())


 # Start the crawl only when the file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	"""
	Warm the caches of your website by crawling each page defined in sitemap.xml.
	To use, download this file and make it executable. Then run:
	./cache-warmer.py --threads 4 --file /data/web/public/sitemap.xml -v
	"""
	import argparse
	import multiprocessing.pool as mpool
	import os.path
	import re
	import sys
	import time
	import requests
	import subprocess


	# Crawl outcomes shared across the module: callback() appends to this list
	# and make_results() reads it to build the final report.
	results = []
	# Timestamp taken when the module is loaded; make_results() reports the
	# elapsed time relative to it.
	start = time.time()


	def parse_options():
	    """Parse and validate the command line arguments.

	    Returns:
	        The argparse namespace with ``threads`` (int, default 10), ``file``
	        (path of an existing sitemap file) and ``verbose`` (bool).

	    Exits through ``parser.error`` (usage message, status 2) when fewer than
	    one thread is requested or when the sitemap file does not exist.
	    """
	    parser = argparse.ArgumentParser(description="""Cache crawler based on a sitemap.xml file""")
	    parser.add_argument('-t', '--threads', help='How many threads to use', default=10, required=False, type=int)
	    parser.add_argument('-f', '--file', help='The sitemap xml file', required=True, type=str)
	    parser.add_argument('-v', '--verbose', help='Be more verbose', action='store_true', default=False)

	    args = parser.parse_args()
	    # A thread pool cannot be created with fewer than one worker; reject the
	    # value here with a usage message instead of failing later with a traceback.
	    if args.threads < 1:
	        parser.error('Number of threads must be at least 1, got %d' % args.threads)
	    if not os.path.isfile(args.file):
	        parser.error('Could not find sitemap file %s' % args.file)
	    return args


	def crawl_url(url, verbose=False, timeout=60):
	    """Request a single URL so the server builds (warms) its cached copy.

	    Args:
	        url: Address to fetch.
	        verbose: When true, print the URL before requesting it.
	        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
	            Without it a stalled server would block the worker indefinitely.

	    Returns:
	        A dict with the keys 'exit' (0 when ``response.ok`` is true,
	        otherwise 1), 'out' (the response text, or the error description
	        when the request itself failed) and 'url'.
	    """
	    if verbose:
	        print("Crawling {}".format(url))
	    try:
	        response = requests.get(url, headers={"user-agent": "SitemapCacheWarmer"}, timeout=timeout)
	    except requests.exceptions.RequestException as exc:
	        # Connection errors, timeouts and invalid URLs are reported as a
	        # failed crawl instead of escaping the worker, where the failure
	        # would never reach the results list.
	        return {'exit': 1, 'out': str(exc), 'url': url}
	    return {'exit': 0 if response.ok else 1, 'out': response.text, 'url': url}


	def make_results():
	    """Print the crawl report and return the process exit status.

	    Reads the module-level ``results`` list and ``start`` timestamp. Each
	    entry whose 'exit' value is not 0 is printed with its URL and output,
	    followed by a separator line, and then a summary line is printed.

	    Returns:
	        0 when no entry failed, otherwise 1.
	    """
	    errcount = 0
	    exec_time = format(time.time() - start, '.4f')
	    for item in results:
	        if item['exit'] == 0:
	            continue
	        errcount += 1
	        print("Errors detected in %s:\n%s\n" % (item['url'], item['out']))
	        print("=" * 50)
	    if errcount == 0:
	        print("All DONE! - All urls are warmed! - done in %s " % exec_time)
	        return 0
	    print("%d Errors detected! - done in %ss" % (errcount, exec_time))
	    return 1


	def get_sitemap_urls(filename):
	    """Extract the URLs listed in the <loc> elements of a sitemap file.

	    Args:
	        filename: Path of the sitemap XML file.

	    Returns:
	        A list of URL strings in document order, with surrounding
	        whitespace removed and XML entities (for example ``&amp;``) decoded.
	        Empty <loc> elements are skipped.
	    """
	    # Function-scope imports: these names are not imported at module level.
	    import io
	    from xml.sax.saxutils import unescape

	    # The sitemap protocol specifies UTF-8, so do not rely on the locale's
	    # default encoding.
	    with io.open(filename, encoding='utf-8') as fh:
	        content = fh.read()

	    # DOTALL lets a <loc> value that is wrapped across lines still match.
	    raw_locations = re.findall(r'<loc>(.*?)</loc>', content, re.DOTALL)

	    # URLs in a sitemap are entity-escaped; requesting them undecoded would
	    # hit a different address (e.g. "?a=1&amp;b=2" instead of "?a=1&b=2").
	    quotes = {'&apos;': "'", '&quot;': '"'}
	    stripped = (raw.strip() for raw in raw_locations)
	    return [unescape(location, quotes) for location in stripped if location]


	def callback(output):
	    """Store one crawl outcome in the module-level ``results`` list.

	    Passed as the ``callback`` argument of ``pool.apply_async`` in ``main``.
	    """
	    results.append(output)


	def main():
	    """Parse the options, crawl every sitemap URL on a thread pool, report.

	    Terminates the process through ``sys.exit`` with the status returned by
	    ``make_results`` (0 when every URL was crawled successfully, else 1).
	    """
	    args = parse_options()
	    sitemap_urls = get_sitemap_urls(args.file)

	    if args.verbose:
	        print("Crawling {} urls with {} threads\n[Please Wait!]".format(len(sitemap_urls), args.threads))
	        print("=" * 50)

	    pool = mpool.ThreadPool(args.threads)
	    pending = []
	    for url in sitemap_urls:
	        # Forward the verbose flag; without it crawl_url never prints the
	        # URL it is working on.
	        async_result = pool.apply_async(
	            crawl_url, args=(url,), kwds={'verbose': args.verbose}, callback=callback)
	        pending.append((url, async_result))
	    pool.close()
	    pool.join()

	    # The success callback is not invoked for a task that raised, so such a
	    # URL would be missing from the report and the run could be declared
	    # clean. Collect those failures explicitly.
	    for url, async_result in pending:
	        try:
	            async_result.get()
	        except Exception as exc:
	            results.append({'exit': 1, 'out': repr(exc), 'url': url})

	    sys.exit(make_results())


	# Start the crawl only when the file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()
No results found