Skip to content

Instantly share code, notes, and snippets.

@shriphani
Created April 24, 2013 18:05
Show Gist options
  • Save shriphani/5454168 to your computer and use it in GitHub Desktop.
Save shriphani/5454168 to your computer and use it in GitHub Desktop.
ygroups heritrix crawl setup
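'''
Repeatedly pause and unpause the ygroups Heritrix crawl job so that the
download proceeds in short bursts: each cycle waits until the crawl log
has grown by PER_INTERVAL_SIZE lines, pauses the job, sleeps for
INTERVAL_TIME seconds, and then unpauses it.
'''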
import argparse
import os
import subprocess
import sys
import time
# Seconds the job stays paused before it is unpaused (passed to time.sleep).
INTERVAL_TIME = 5*60
# Number of new crawl.log lines to wait for before the job is paused.
PER_INTERVAL_SIZE = 100 # number of links we want to grab between intervals
# File that receives the output of the curl pause / unpause requests.
HERITRIX_OUTPUT_FILE = 'heritrix.output.log'
def read_crawl_log(ygroups_job_dir):
    '''
    Count the lines currently present in the job's crawl log.

    ygroups_job_dir -- job directory; the log is read from
                       latest/logs/crawl.log beneath it.

    Returns the number of lines in the log as an int. Raises IOError
    (OSError on Python 3) if the log cannot be opened.
    '''
    crawl_log_path = os.path.join(ygroups_job_dir, 'latest/logs/crawl.log')
    # Binary mode: only newlines are counted, so there is no reason to
    # decode the content (and risk a decode error on odd bytes).
    with open(crawl_log_path, 'rb') as crawl_log_handle:
        # Single-argument print() behaves the same on Python 2 and 3.
        print('Opened crawl log file: %s' % crawl_log_path)
        return sum(1 for _ in crawl_log_handle)
def _send_job_action(job_name, action):
    '''
    POST an action ('pause' or 'unpause') to the Heritrix job endpoint
    using curl.

    curl's stdout and stderr are both written to HERITRIX_OUTPUT_FILE,
    replacing its previous content. Returns curl's exit status, or None
    when curl could not be started.
    '''
    command = [
        'curl', '-v',
        '-d', 'action=%s' % action,
        '-k',
        '-u', 'admin:admin',
        '--anyauth',
        '--location',
        'https://localhost:8443/engine/job/%s' % job_name,
    ]
    # An argument list (no shell) avoids both the bash-only '&>' redirect,
    # which /bin/sh may parse as "run in background, then truncate file",
    # and any shell interpretation of the job name.
    with open(HERITRIX_OUTPUT_FILE, 'wb') as output_handle:
        try:
            status = subprocess.call(
                command,
                stdout=output_handle,
                stderr=subprocess.STDOUT
            )
        except OSError as error:
            # Keep the old best-effort behaviour: report and carry on.
            sys.stderr.write('Could not run curl: %s\n' % error)
            return None
    if status != 0:
        sys.stderr.write(
            'curl %s request exited with status %d\n' % (action, status)
        )
    return status


def pause_and_unpause_ygroups(ygroups_job_dir, poll_interval=1.0):
    '''
    Run one download burst followed by one pause.

    Waits until the crawl log has grown by at least PER_INTERVAL_SIZE
    lines, asks Heritrix to pause the job, sleeps for INTERVAL_TIME
    seconds and then asks Heritrix to unpause the job.

    ygroups_job_dir -- job directory; its last path component is used as
                       the Heritrix job name.
    poll_interval   -- seconds to wait between two reads of the crawl
                       log while waiting for it to grow.
    '''
    # abspath() drops a trailing slash, which would otherwise make
    # basename() return an empty job name.
    job_name = os.path.basename(os.path.abspath(ygroups_job_dir))

    start_num_lines = read_crawl_log(ygroups_job_dir)
    end_num_lines = start_num_lines

    # The doubled space reproduces the output of the former
    # "print 'text ', value" statements; single-argument print() works on
    # both Python 2 and Python 3.
    print('Kick off with:  %s' % start_num_lines)
    sys.stdout.flush()

    while end_num_lines - start_num_lines < PER_INTERVAL_SIZE:
        print('Currently downloaded:  %s' % end_num_lines)
        sys.stdout.flush()
        # Do not spin: every poll re-reads the whole crawl log.
        time.sleep(poll_interval)
        end_num_lines = read_crawl_log(ygroups_job_dir)

    # shot over. time to pause
    _send_job_action(job_name, 'pause')

    time.sleep(INTERVAL_TIME)

    # slept long enough, unpause
    _send_job_action(job_name, 'unpause')
if __name__ == '__main__':

    def parse_cmdline_args():
        '''
        Parse the command line: one positional argument, the ygroups job
        directory. Returns the argparse namespace.
        '''
        cli = argparse.ArgumentParser()
        cli.add_argument(
            'ygroups_job_dir',
            help='YGroups job dir',
            metavar='ygroups-job-dir'
        )
        return cli.parse_args()

    job_dir = parse_cmdline_args().ygroups_job_dir

    # Keep cycling between download bursts and pauses until the process
    # is killed.
    while True:
        pause_and_unpause_ygroups(job_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment