Created
April 24, 2013 18:05
-
-
Save shriphani/5454168 to your computer and use it in GitHub Desktop.
ygroups heritrix crawl setup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Repeatedly pause and unpause the Heritrix ygroups crawl job so that the
download proceeds in short bursts separated by idle intervals.
'''
import argparse
import os
import subprocess
import sys
import time
# Seconds to keep the crawl paused before unpausing it again (5 minutes).
INTERVAL_TIME = 60 * 5

# Number of new crawl-log lines (i.e. links grabbed) to wait for before
# the crawl is paused.
PER_INTERVAL_SIZE = 100

# File that receives the output of the curl calls made to Heritrix.
HERITRIX_OUTPUT_FILE = 'heritrix.output.log'
def read_crawl_log(ygroups_job_dir):
    '''
    Count the lines currently present in the job's crawl log.

    Args:
        ygroups_job_dir: job directory; the log is read from
            ``latest/logs/crawl.log`` beneath it.

    Returns:
        The number of lines in the crawl log (0 for an empty file).

    Raises:
        IOError / OSError: if the crawl log cannot be opened.
    '''
    crawl_log_path = os.path.join(ygroups_job_dir, 'latest/logs/crawl.log')

    # Binary mode: only newlines are counted, so there is no reason to
    # decode the content (and no way for odd bytes in a logged URI to
    # trigger a decode error).
    with open(crawl_log_path, 'rb') as crawl_log_handle:
        # Single-argument print() behaves the same on Python 2 and 3.
        print('Opened crawl log file: ' + crawl_log_path)
        return sum(1 for _ in crawl_log_handle)
def _heritrix_job_name(ygroups_job_dir):
    '''Return the job name: the last path component of the job directory.'''
    # normpath drops any trailing slash first; without it
    # basename('jobs/ygroups/') is '' and the request URL has no job name.
    return os.path.basename(os.path.normpath(ygroups_job_dir))


def _post_job_action(job_name, action):
    '''
    POST ``action`` to the job's engine URL with curl.

    curl's stdout and stderr are written to HERITRIX_OUTPUT_FILE, replacing
    its previous content. Returns curl's exit status, or -1 if curl could
    not be started.
    '''
    command = [
        'curl', '-v',
        '-d', 'action=%s' % action,
        '-k',
        '-u', 'admin:admin',
        '--anyauth',
        '--location',
        'https://localhost:8443/engine/job/%s' % job_name,
    ]
    # An argument list (no shell) avoids two problems of the old
    # os.system() call: the job name is no longer parsed by a shell, and
    # the bash-only '&>' redirect is gone. A POSIX /bin/sh reads
    # 'cmd &> file' as "run cmd in the background, then truncate file".
    with open(HERITRIX_OUTPUT_FILE, 'w') as output_handle:
        try:
            status = subprocess.call(
                command, stdout=output_handle, stderr=subprocess.STDOUT
            )
        except OSError as error:
            # Best effort, like os.system(): report and carry on.
            print('Could not run curl: %s' % error)
            sys.stdout.flush()
            return -1
    if status != 0:
        print('curl exited with status %d for action=%s' % (status, action))
        sys.stdout.flush()
    return status


def pause_and_unpause_ygroups(ygroups_job_dir, poll_interval=1.0):
    '''
    Let the crawl run for one burst, then pause it for INTERVAL_TIME seconds.

    Waits until the crawl log has grown by at least PER_INTERVAL_SIZE lines
    since this call started, sends "action=pause" to the job, sleeps
    INTERVAL_TIME seconds and finally sends "action=unpause".

    Args:
        ygroups_job_dir: job directory; its last path component is used as
            the job name in the request URL.
        poll_interval: seconds to sleep between two reads of the crawl log
            while waiting for the burst to complete. Keeps the wait from
            re-reading the whole log in a tight loop.

    Raises:
        IOError / OSError: if the crawl log cannot be opened.
    '''
    job_name = _heritrix_job_name(ygroups_job_dir)

    start_num_lines = read_crawl_log(ygroups_job_dir)
    end_num_lines = start_num_lines

    # Two spaces after the colon reproduce the old `print a, b` output.
    print('Kick off with:  %d' % start_num_lines)
    sys.stdout.flush()

    while end_num_lines - start_num_lines < PER_INTERVAL_SIZE:
        print('Currently downloaded:  %d' % end_num_lines)
        sys.stdout.flush()
        time.sleep(poll_interval)
        end_num_lines = read_crawl_log(ygroups_job_dir)

    # shot over. time to pause
    _post_job_action(job_name, 'pause')

    time.sleep(INTERVAL_TIME)

    # slept long enough, unpause
    _post_job_action(job_name, 'unpause')
if __name__ == '__main__':
    # Command line: one positional argument naming the job directory.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        'ygroups_job_dir',
        metavar='ygroups-job-dir',
        help='YGroups job dir',
    )
    job_dir = cli.parse_args().ygroups_job_dir

    # Alternate crawl bursts and pauses until the process is stopped.
    while True:
        pause_and_unpause_ygroups(job_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment