shriphani · December 16, 2015 03:09
diff --git a/nabble_heritrix_setup.py b/nabble_heritrix_setup.py
 #!/usr/bin/env python

 '''
 Our nabble crawl contains a ton of 503s. Poll heritrix and set up new jobs
 '''

 import argparse
 import daemon
 import os
 import sys
 import time


 # Directory under which build_new_job_dir creates each new job directory.
 HERITRIX_JOB_DIRECTORIES = '/bos/tmp19/spalakod/clueweb12pp/jobs/nabble/'
 # Job directory names are this prefix followed by the iteration number.
 NABBLE_JOB_DIR_PREFIX = 'nabble-index-pages-subforums-'
 # Default iteration number; the __main__ block replaces it with the command-line value.
 ITERATION = 5 # 1 through 4 are taken.

 # Rebind stderr so later writes to sys.stderr are appended to 'nabble-err'
 # (a relative path, so it resolves against the working directory).
 sys.stderr = open('nabble-err', 'a+')

 class CrawlNotCompleteError(Exception):
 	'''
 	Signals that a crawl has not finished yet.
 	'''

 class SeedsReportNotFoundError(Exception):
 	'''
 	Raised when no seeds-report.txt turns up anywhere under the
 	job directory that was walked.
 	'''

 def safe_mkdir(path):
 	'''
 	Create path, including missing parents, and tolerate it already existing.

 	Any OSError other than EEXIST is re-raised.
 	'''
 	# errno is not among the file's top-level imports, so bring it in here;
 	# without it the EEXIST check below dies with a NameError.
 	import errno

 	try:
 		os.makedirs(path)
 	except OSError as exception:
 		if exception.errno != errno.EEXIST:
 			raise

 def get_seeds_report(nabble_job_dir):
 	'''
 	Walk nabble_job_dir and return the path of the first
 	seeds-report.txt encountered.

 	Raises SeedsReportNotFoundError when the walk finishes without
 	finding one.
 	'''
 	report_name = 'seeds-report.txt'

 	for current_dir, _subdirs, filenames in os.walk(nabble_job_dir):
 		if report_name not in filenames:
 			continue
 		return os.path.join(current_dir, report_name)

 	raise SeedsReportNotFoundError

 def get_503_list(nabble_job_dir):
 	'''
 	Lazily yield every seed whose status code in seeds-report.txt is 503.

 	The first line of the report is treated as a header and skipped. Only
 	the first three whitespace-separated fields of a row (status code,
 	crawl state, seed) are used, so rows carrying extra trailing fields do
 	not abort the scan, and blank or truncated rows are ignored.

 	Because this is a generator, the report is located and opened on the
 	first iteration rather than at call time; SeedsReportNotFoundError
 	from get_seeds_report therefore surfaces while iterating.
 	'''
 	report_path = get_seeds_report(nabble_job_dir)

 	with open(report_path, 'r') as report_handle:
 		for line_number, row in enumerate(report_handle):
 			if line_number == 0:
 				continue # first line is header

 			fields = row.split()
 			if len(fields) < 3:
 				continue # blank or truncated row: nothing to classify

 			status, _crawl_state, seed = fields[:3]
 			if status == '503':
 				yield seed

 def crawl_complete(nabble_job_dir):
 	'''
 	Return True when every seed row in seeds-report.txt is marked CRAWLED,
 	False as soon as one row carries any other crawl state.

 	The first line is treated as a header and skipped. Only the second
 	whitespace-separated field of a row is inspected, so blank rows and
 	rows with extra trailing fields no longer raise ValueError. A report
 	with no seed rows counts as complete.

 	Raises SeedsReportNotFoundError (via get_seeds_report) when the job
 	directory holds no report.
 	'''
 	report_path = get_seeds_report(nabble_job_dir)

 	with open(report_path, 'r') as report_handle:
 		for line_number, row in enumerate(report_handle):
 			if line_number == 0:
 				continue # header row

 			fields = row.split()
 			if len(fields) < 2:
 				continue # blank or truncated row: no crawl state to check

 			if fields[1] != 'CRAWLED':
 				return False

 	return True

 def build_new_job_dir(iteration, crawl_config_file, seeds_list):
 	'''
 	Create the job directory for this iteration and populate it.

 	The directory name is NABBLE_JOB_DIR_PREFIX with iteration appended,
 	placed under HERITRIX_JOB_DIRECTORIES. crawl_config_file is copied in
 	as crawler-beans.cxml and every item of seeds_list is written on its
 	own line of seeds.txt.

 	Returns the absolute job directory. If seeds_list turned out to be
 	empty the process exits with status 0 instead of returning.
 	'''
 	job_dir = os.path.join(
 		HERITRIX_JOB_DIRECTORIES,
 		NABBLE_JOB_DIR_PREFIX + str(iteration)
 	)
 	safe_mkdir(job_dir)

 	config_copy_path = os.path.join(job_dir, 'crawler-beans.cxml')
 	seeds_path = os.path.join(job_dir, 'seeds.txt')

 	seed_count = 0

 	with open(crawl_config_file, 'r') as config_in:
 		with open(config_copy_path, 'w+') as config_out:
 			with open(seeds_path, 'w+') as seeds_out:
 				config_out.write(config_in.read())

 				for seed_count, seed in enumerate(seeds_list, 1):
 					seeds_out.write(seed + '\n')

 	if not seed_count:
 		sys.exit(0)# can quit now

 	return job_dir

 def add_path_to_heritrix(job_path):
 	'''
 	Ask the Heritrix engine at https://localhost:8443/engine to add the
 	job directory job_path, by shelling out to curl with the form data
 	action=add&addpath=<job_path>.

 	The command runs through os.system; its exit status is discarded.
 	'''
 	command = 'curl -v -d "action=add&addpath=%s" -k -u admin:admin --anyauth --location https://localhost:8443/engine' % (job_path,)
 	os.system(command)

 def build_heritrix_job(job_path):
 	'''
 	Send action=build to the Heritrix job named after the last path
 	component of job_path, by shelling out to curl.

 	The command runs through os.system; its exit status is discarded.
 	'''
 	job_url = 'https://localhost:8443/engine/job/%s' % (os.path.basename(job_path),)
 	command = 'curl -v -d "action=build" -k -u admin:admin --anyauth --location ' + job_url
 	os.system(command)

 def launch_heritrix_job(job_path):
 	'''
 	Send action=launch to the Heritrix job named after the last path
 	component of job_path, by shelling out to curl.

 	The command runs through os.system; its exit status is discarded.
 	'''
 	job_url = 'https://localhost:8443/engine/job/%s' % (os.path.basename(job_path),)
 	command = 'curl -v -d "action=launch" -k -u admin:admin --anyauth --location ' + job_url
 	os.system(command)



 if __name__ == '__main__':

 	def parse_cmdline_args():
 		'''
 		Parse the three positional arguments: the job directory to watch
 		first, the crawl configuration file copied into each new job, and
 		the iteration number to start from.
 		'''
 		parser = argparse.ArgumentParser()

 		parser.add_argument(
 			'current_nabble_job_dir',
 			metavar = 'current-nabble-job-dir',
 			help = 'Current nabble job directory'
 		)

 		parser.add_argument(
 			'crawl_config_file',
 			metavar = 'crawl-config-file',
 			help = 'location of crawl configuration file'
 		)

 		parser.add_argument(
 			'iteration',
 			help = 'what iteration to kick off with'
 		)

 		return parser.parse_args()

 	parsed = parse_cmdline_args()

 	current_nabble_job_dir = parsed.current_nabble_job_dir
 	ITERATION = int(parsed.iteration)

 	# NOTE: daemonising is disabled, so the loop below runs in the foreground.
 	#with daemon.DaemonContext():
 	while True:
 		try:
 			# Re-check the watched job every 5 minutes until all seeds are CRAWLED.
 			while not crawl_complete(current_nabble_job_dir):
 				# Parenthesised so the line parses on Python 2 and Python 3 alike.
 				print('Not COMPLETE')
 				time.sleep(5 * 60)

 			# Lazy generator: the report is only read inside build_new_job_dir.
 			seeds_list = get_503_list(current_nabble_job_dir)

 			# build_new_job_dir exits the process when there are no 503 seeds left.
 			new_job_path = build_new_job_dir(ITERATION, parsed.crawl_config_file, seeds_list)
 			add_path_to_heritrix(new_job_path)
 			build_heritrix_job(new_job_path)
 			launch_heritrix_job(new_job_path)

 			# From now on watch the job that was just launched.
 			current_nabble_job_dir = new_job_path
 			ITERATION += 1

 		except CrawlNotCompleteError:
 			# Deliberate best-effort: fall through to the sleep and try again.
 			pass

 		except SeedsReportNotFoundError:
 			# No seeds-report.txt under the watched job yet; retry after the sleep.
 			pass

 		time.sleep(15 * 60) # sleep for 15 mins
	#!/usr/bin/env python

	'''
	Our nabble crawl contains a ton of 503s. Poll heritrix and set up new jobs
	'''

	import argparse
	import daemon
	import os
	import sys
	import time


	# Directory under which build_new_job_dir creates each new job directory.
	HERITRIX_JOB_DIRECTORIES = '/bos/tmp19/spalakod/clueweb12pp/jobs/nabble/'
	# Job directory names are this prefix followed by the iteration number.
	NABBLE_JOB_DIR_PREFIX = 'nabble-index-pages-subforums-'
	# Default iteration number; the __main__ block replaces it with the command-line value.
	ITERATION = 5 # 1 through 4 are taken.

	# Rebind stderr so later writes to sys.stderr are appended to 'nabble-err'
	# (a relative path, so it resolves against the working directory).
	sys.stderr = open('nabble-err', 'a+')

	class CrawlNotCompleteError(Exception):
		'''
		Signals that a crawl has not finished yet.
		'''

	class SeedsReportNotFoundError(Exception):
		'''
		Raised when no seeds-report.txt turns up anywhere under the
		job directory that was walked.
		'''

	def safe_mkdir(path):
		'''
		Create path, including missing parents, and tolerate it already existing.

		Any OSError other than EEXIST is re-raised.
		'''
		# errno is not among the file's top-level imports, so bring it in here;
		# without it the EEXIST check below dies with a NameError.
		import errno

		try:
			os.makedirs(path)
		except OSError as exception:
			if exception.errno != errno.EEXIST:
				raise

	def get_seeds_report(nabble_job_dir):
		'''
		Walk nabble_job_dir and return the path of the first
		seeds-report.txt encountered.

		Raises SeedsReportNotFoundError when the walk finishes without
		finding one.
		'''
		for root, dirs, files in os.walk(nabble_job_dir):
			if 'seeds-report.txt' in files:
				return os.path.join(root, 'seeds-report.txt')

		raise SeedsReportNotFoundError

	def get_503_list(nabble_job_dir):
		'''
		Lazily yield every seed whose status code in seeds-report.txt is 503.

		The first line of the report is treated as a header and skipped. Only
		the first three whitespace-separated fields of a row (status code,
		crawl state, seed) are used, so rows carrying extra trailing fields do
		not abort the scan, and blank or truncated rows are ignored.

		Because this is a generator, the report is located and opened on the
		first iteration rather than at call time; SeedsReportNotFoundError
		from get_seeds_report therefore surfaces while iterating.
		'''
		report_path = get_seeds_report(nabble_job_dir)

		with open(report_path, 'r') as report_handle:
			for line_number, row in enumerate(report_handle):
				if line_number == 0:
					continue # first line is header

				fields = row.split()
				if len(fields) < 3:
					continue # blank or truncated row: nothing to classify

				status, _crawl_state, seed = fields[:3]
				if status == '503':
					yield seed

	def crawl_complete(nabble_job_dir):
		'''
		Return True when every seed row in seeds-report.txt is marked CRAWLED,
		False as soon as one row carries any other crawl state.

		The first line is treated as a header and skipped. Only the second
		whitespace-separated field of a row is inspected, so blank rows and
		rows with extra trailing fields do not raise ValueError. A report
		with no seed rows counts as complete.

		Raises SeedsReportNotFoundError (via get_seeds_report) when the job
		directory holds no report.
		'''
		report_path = get_seeds_report(nabble_job_dir)

		with open(report_path, 'r') as report_handle:
			for line_number, row in enumerate(report_handle):
				if line_number == 0:
					continue # header row

				fields = row.split()
				if len(fields) < 2:
					continue # blank or truncated row: no crawl state to check

				if fields[1] != 'CRAWLED':
					return False

		return True

	def build_new_job_dir(iteration, crawl_config_file, seeds_list):
		'''
		Create the job directory for this iteration and populate it.

		The directory name is NABBLE_JOB_DIR_PREFIX with iteration appended,
		placed under HERITRIX_JOB_DIRECTORIES. crawl_config_file is copied in
		as crawler-beans.cxml and every item of seeds_list is written on its
		own line of seeds.txt.

		Returns the absolute job directory. If seeds_list turned out to be
		empty the process exits with status 0 instead of returning.
		'''
		dirname = ''.join([NABBLE_JOB_DIR_PREFIX, str(iteration)])
		abs_dirname = os.path.join(HERITRIX_JOB_DIRECTORIES, dirname)

		safe_mkdir(abs_dirname)

		dest_crawl_config_file = os.path.join(abs_dirname, 'crawler-beans.cxml')
		dest_seeds_file = os.path.join(abs_dirname, 'seeds.txt')

		seeds_added = 0

		with open(crawl_config_file, 'r') as crawl_config_file_handle, open(dest_crawl_config_file, 'w+') as dest_crawl_config_file_handle, open(dest_seeds_file, 'w+') as dest_seeds_file_handle:
			dest_crawl_config_file_handle.write(crawl_config_file_handle.read())

			for seed in seeds_list:
				dest_seeds_file_handle.write(seed + '\n')
				seeds_added += 1

		if seeds_added == 0:
			sys.exit(0)# can quit now

		return abs_dirname

	def add_path_to_heritrix(job_path):
		'''
		Ask the Heritrix engine at https://localhost:8443/engine to add the
		job directory job_path, by shelling out to curl with the form data
		action=add&addpath=<job_path>.

		The command runs through os.system; its exit status is discarded.
		'''
		os.system('curl -v -d "action=add&addpath=%(path)s" -k -u admin:admin --anyauth --location https://localhost:8443/engine' % {
				'path' : job_path
			}
		)

	def build_heritrix_job(job_path):
		'''
		Send action=build to the Heritrix job named after the last path
		component of job_path, by shelling out to curl.

		The command runs through os.system; its exit status is discarded.
		'''
		job_name = os.path.basename(job_path)

		os.system('curl -v -d "action=build" -k -u admin:admin --anyauth --location https://localhost:8443/engine/job/%(job_name)s' % {
				'job_name' : job_name
			}
		)

	def launch_heritrix_job(job_path):
		'''
		Send action=launch to the Heritrix job named after the last path
		component of job_path, by shelling out to curl.

		The command runs through os.system; its exit status is discarded.
		'''
		job_name = os.path.basename(job_path)

		os.system('curl -v -d "action=launch" -k -u admin:admin --anyauth --location https://localhost:8443/engine/job/%(job_name)s' % {
				'job_name' : job_name
			}
		)



	if __name__ == '__main__':

		def parse_cmdline_args():
			'''
			Parse the three positional arguments: the job directory to watch
			first, the crawl configuration file copied into each new job, and
			the iteration number to start from.
			'''
			parser = argparse.ArgumentParser()

			parser.add_argument(
				'current_nabble_job_dir',
				metavar = 'current-nabble-job-dir',
				help = 'Current nabble job directory'
			)

			parser.add_argument(
				'crawl_config_file',
				metavar = 'crawl-config-file',
				help = 'location of crawl configuration file'
			)

			parser.add_argument(
				'iteration',
				help = 'what iteration to kick off with'
			)

			return parser.parse_args()

		parsed = parse_cmdline_args()

		current_nabble_job_dir = parsed.current_nabble_job_dir
		ITERATION = int(parsed.iteration)

		# NOTE: daemonising is disabled, so the loop below runs in the foreground.
		#with daemon.DaemonContext():
		while True:
			try:
				# Re-check the watched job every 5 minutes until all seeds are CRAWLED.
				while not crawl_complete(current_nabble_job_dir):
					# Parenthesised so the line parses on Python 2 and Python 3 alike.
					print('Not COMPLETE')
					time.sleep(5 * 60)

				# Lazy generator: the report is only read inside build_new_job_dir.
				seeds_list = get_503_list(current_nabble_job_dir)

				# build_new_job_dir exits the process when there are no 503 seeds left.
				new_job_path = build_new_job_dir(ITERATION, parsed.crawl_config_file, seeds_list)
				add_path_to_heritrix(new_job_path)
				build_heritrix_job(new_job_path)
				launch_heritrix_job(new_job_path)

				# From now on watch the job that was just launched.
				current_nabble_job_dir = new_job_path
				ITERATION += 1

			except CrawlNotCompleteError:
				# Deliberate best-effort: fall through to the sleep and try again.
				pass

			except SeedsReportNotFoundError:
				# No seeds-report.txt under the watched job yet; retry after the sleep.
				pass

			time.sleep(15 * 60) # sleep for 15 mins