Skip to content

Instantly share code, notes, and snippets.

@shriphani
Last active December 16, 2015 03:09
Show Gist options
  • Save shriphani/5368123 to your computer and use it in GitHub Desktop.
Save shriphani/5368123 to your computer and use it in GitHub Desktop.
Performs a full crawl of Nabble, iteratively re-crawling the sites we previously timed out on.
#!/usr/bin/env python
'''
Our nabble crawl contains a ton of 503s. Poll heritrix and set up new jobs
'''
import argparse
import errno
import os
import sys
import time

import daemon
# Directory under which each new Heritrix job directory is created.
HERITRIX_JOB_DIRECTORIES = '/bos/tmp19/spalakod/clueweb12pp/jobs/nabble/'
# A job directory's name is this prefix followed by the iteration number.
NABBLE_JOB_DIR_PREFIX = 'nabble-index-pages-subforums-'
ITERATION = 5 # 1 through 4 are already taken; overwritten from the command line when run as a script
# Import-time side effect: everything written to stderr from here on is
# appended to the file 'nabble-err' in the current working directory.
sys.stderr = open('nabble-err', 'a+')
class CrawlNotCompleteError(Exception):
    '''
    Presumably signals that a crawl has not finished yet. Nothing in this
    file raises it; the main polling loop only catches and ignores it.
    '''
class SeedsReportNotFoundError(Exception):
    '''
    Raised when a full walk of a job directory turns up no
    seeds-report.txt file.
    '''
def safe_mkdir(path):
    '''
    Create the directory `path`, including any missing parents, and do
    nothing if the directory is already there.

    Raises OSError for every other failure, including the case where
    `path` exists but is not a directory.
    '''
    try:
        os.makedirs(path)
    except OSError as exception:
        # EEXIST is also reported when a regular file occupies the path, so
        # only swallow it when a directory really exists there.
        if exception.errno != errno.EEXIST or not os.path.isdir(path):
            raise
def get_seeds_report(nabble_job_dir):
    '''
    Walk nabble_job_dir and return the path of the first seeds-report.txt
    encountered.

    Raises SeedsReportNotFoundError when the walk finds no such file.
    '''
    report_name = 'seeds-report.txt'
    for current_dir, _subdirs, filenames in os.walk(nabble_job_dir):
        if report_name in filenames:
            return os.path.join(current_dir, report_name)
    raise SeedsReportNotFoundError


def _parse_seed_record(report_line):
    '''
    Split one seeds-report row into (status, crawled_or_not, seed).

    Returns None for rows with fewer than three whitespace-separated
    fields (e.g. blank lines). Extra trailing fields, such as a redirect
    target, are ignored instead of breaking the unpacking.
    '''
    fields = report_line.split()
    if len(fields) < 3:
        return None
    return fields[0], fields[1], fields[2]


def get_503_list(nabble_job_dir):
    '''
    Yield the seed of every row in the job's seeds-report.txt whose
    status code is 503.

    This is a generator: the report is located and opened only once
    iteration starts, so SeedsReportNotFoundError surfaces at that point.
    '''
    with open(get_seeds_report(nabble_job_dir), 'r') as report_handle:
        next(report_handle, None)  # first line is header
        for report_line in report_handle:
            record = _parse_seed_record(report_line)
            if record is not None and record[0] == '503':
                yield record[2]


def crawl_complete(nabble_job_dir):
    '''
    Return True when every row of the job's seeds-report.txt is marked
    CRAWLED, and False as soon as one row is not.

    A report holding only the header line counts as complete.
    Raises SeedsReportNotFoundError when the job has no report yet.
    '''
    with open(get_seeds_report(nabble_job_dir), 'r') as report_handle:
        next(report_handle, None)  # first line is header
        for report_line in report_handle:
            record = _parse_seed_record(report_line)
            if record is None:
                continue  # malformed or blank row carries no crawl state
            if record[1] != 'CRAWLED':
                return False
    return True
def build_new_job_dir(iteration, crawl_config_file, seeds_list):
    '''
    Create the job directory for the given iteration and populate it.

    The directory is HERITRIX_JOB_DIRECTORIES joined with
    NABBLE_JOB_DIR_PREFIX + str(iteration). crawl_config_file is copied
    into it as crawler-beans.cxml and every seed from seeds_list is
    written, one per line, to seeds.txt.

    Returns the absolute path of the job directory. If seeds_list yields
    nothing, the process exits with status 0 instead of returning.
    '''
    job_dir = os.path.join(
        HERITRIX_JOB_DIRECTORIES,
        NABBLE_JOB_DIR_PREFIX + str(iteration)
    )
    safe_mkdir(job_dir)

    config_destination = os.path.join(job_dir, 'crawler-beans.cxml')
    seeds_destination = os.path.join(job_dir, 'seeds.txt')

    with open(crawl_config_file, 'r') as config_in, \
            open(config_destination, 'w+') as config_out, \
            open(seeds_destination, 'w+') as seeds_out:
        config_out.write(config_in.read())
        seed_count = 0
        for seed_count, seed in enumerate(seeds_list, 1):
            seeds_out.write(seed + '\n')

    if not seed_count:
        sys.exit(0)  # nothing left to re-crawl, can quit now
    return job_dir
def add_path_to_heritrix(job_path):
    '''
    Register the job directory job_path with the Heritrix engine.

    Shells out to curl, sending action=add and addpath=job_path to
    https://localhost:8443/engine. The exit status of the command is
    not checked.
    '''
    command = 'curl -v -d "action=add&addpath=%(path)s" -k -u admin:admin --anyauth --location https://localhost:8443/engine' % {
        'path' : job_path
    }
    os.system(command)
def build_heritrix_job(job_path):
    '''
    Ask the Heritrix engine to build the job stored at job_path.

    The job name is the last path component of job_path. Shells out to
    curl, sending action=build to the job's URL under
    https://localhost:8443/engine/job/. The exit status of the command is
    not checked.
    '''
    command = 'curl -v -d "action=build" -k -u admin:admin --anyauth --location https://localhost:8443/engine/job/%(job_name)s' % {
        'job_name' : os.path.basename(job_path)
    }
    os.system(command)
def launch_heritrix_job(job_path):
    '''
    Ask the Heritrix engine to launch the job stored at job_path.

    The job name is the last path component of job_path. Shells out to
    curl, sending action=launch to the job's URL under
    https://localhost:8443/engine/job/. The exit status of the command is
    not checked.
    '''
    command = 'curl -v -d "action=launch" -k -u admin:admin --anyauth --location https://localhost:8443/engine/job/%(job_name)s' % {
        'job_name' : os.path.basename(job_path)
    }
    os.system(command)
if __name__ == '__main__':

    def parse_cmdline_args():
        '''
        Parse the three positional arguments: the job directory to start
        polling, the crawl configuration file to copy into each new job,
        and the iteration number to give the first new job.
        '''
        parser = argparse.ArgumentParser()
        parser.add_argument(
            'current_nabble_job_dir',
            metavar='current-nabble-job-dir',
            help='Current nabble job directory'
        )
        parser.add_argument(
            'crawl_config_file',
            metavar='crawl-config-file',
            help='location of crawl configuration file'
        )
        parser.add_argument(
            'iteration',
            help='what iteration to kick off with'
        )
        return parser.parse_args()

    parsed = parse_cmdline_args()
    current_nabble_job_dir = parsed.current_nabble_job_dir
    ITERATION = int(parsed.iteration)

    # NOTE: running under daemon.DaemonContext() is currently disabled.
    while True:
        try:
            # Poll the running job every 5 minutes until its seeds report
            # shows every seed as crawled.
            while not crawl_complete(current_nabble_job_dir):
                print('Not COMPLETE')
                time.sleep(5 * 60)

            # Re-seed a fresh job with everything that came back 503, then
            # register, build and launch it.
            pending_seeds = get_503_list(current_nabble_job_dir)
            next_job_dir = build_new_job_dir(
                ITERATION, parsed.crawl_config_file, pending_seeds
            )
            add_path_to_heritrix(next_job_dir)
            build_heritrix_job(next_job_dir)
            launch_heritrix_job(next_job_dir)

            current_nabble_job_dir = next_job_dir
            ITERATION += 1
        except (CrawlNotCompleteError, SeedsReportNotFoundError):
            # Deliberately ignored: wait and try again on the next pass.
            pass
        time.sleep(15 * 60)  # sleep for 15 minutes between passes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment