# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @angusdev
# Created September 18, 2024 20:03
# Show Gist options
# Save angusdev/6ae0ca7066d073711fb0620ef4b9debd to your computer and use it in GitHub Desktop.
import re
from datetime import datetime
import pandas as pd
import argparse
def parse_log_files(file_names):
    """Collect MYJOB start/end records from log files, grouped by job id.

    A line is recognised when it contains
    ``YYYY-MM-DD HH:MM:SS Thread-<n> MYJOB started|ended <job id> <job name>``;
    every other line is ignored.

    Args:
        file_names: Iterable of paths of the log files to read.

    Returns:
        A dict keyed by job id (kept as a string). Each value has the form
        ``{'job_name': str, 'details': {file_name: entry}}``. ``job_name`` is
        taken from the first record seen for that job id. ``entry`` always
        holds ``thread_name`` and, depending on the records seen in that
        file, ``start_time`` (a ``datetime``) and/or ``duration`` (seconds as
        a float, or ``None`` for an ``ended`` record that has no matching
        ``started`` record in the same file).

    Raises:
        OSError: Propagated from ``open`` when a file cannot be read.
    """
    job_pattern = re.compile(
        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (Thread-\d+) MYJOB (started|ended) (\d+) (.+)'
    )
    job_data = {}
    for file_name in file_names:
        with open(file_name, 'r') as file:
            for line in file:
                match = job_pattern.search(line)
                if not match:
                    continue
                raw_time, thread_name, status, job_id, job_name = match.groups()
                timestamp = datetime.strptime(raw_time, '%Y-%m-%d %H:%M:%S')
                job = job_data.setdefault(
                    job_id, {'job_name': job_name, 'details': {}}
                )
                details = job['details']
                if status == 'started':
                    details[file_name] = {
                        'start_time': timestamp,
                        'thread_name': thread_name,
                    }
                    continue
                # status == 'ended' (the pattern admits nothing else).
                entry = details.get(file_name)
                # The entry may exist without 'start_time' when an earlier
                # orphan 'ended' record created it; treat that as "no start"
                # instead of failing with KeyError.
                if entry is not None and 'start_time' in entry:
                    elapsed = timestamp - entry['start_time']
                    entry['duration'] = elapsed.total_seconds()
                else:
                    details[file_name] = {
                        'thread_name': thread_name,
                        'duration': None,
                    }
    return job_data
def generate_excel(job_data, output_file):
    """Write one spreadsheet row per job to an Excel file.

    Each row carries ``JOB_ID`` and ``JOB_NAME`` plus, for every log file the
    job appeared in, a ``THREAD_NAME(<file>)`` and a ``DURATION(<file>)``
    column. A missing duration (job never ended in that file) is written as
    an empty cell.

    Args:
        job_data: Mapping of job id to
            ``{'job_name': ..., 'details': {file_name: {'thread_name': ...,
            'duration': ...}}}``, as produced by ``parse_log_files``.
        output_file: Destination passed straight to ``DataFrame.to_excel``.
    """
    job_list = []
    for job_id, job in job_data.items():
        job_entry = {'JOB_ID': job_id, 'JOB_NAME': job['job_name']}
        for file_name, file_details in job['details'].items():
            job_entry[f'THREAD_NAME({file_name})'] = file_details['thread_name']
            # 'duration' is absent when only a 'started' record was seen.
            job_entry[f'DURATION({file_name})'] = file_details.get('duration')
        # Append once per job, after all of its per-file columns are filled.
        job_list.append(job_entry)

    df = pd.DataFrame(job_list)
    df.to_excel(output_file, index=False)
def main(argv=None):
    """Command-line entry point: parse log files and write the Excel report.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the behaviour of a
            plain ``main()`` call.
    """
    parser = argparse.ArgumentParser(description='Parse log files and output job durations to an Excel file.')
    parser.add_argument('input_files', nargs='+', help='Input log file names')
    parser.add_argument('--output', default='job_durations.xlsx', help='Output Excel file name')
    args = parser.parse_args(argv)

    job_data = parse_log_files(args.input_files)
    generate_excel(job_data, args.output)
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment