Created
June 19, 2025 02:32
-
-
Save nathanshipley/e69daba2d808e5f68ef3e8b8a3a44825 to your computer and use it in GitHub Desktop.
Split a large text file into smaller files of a fixed number of lines, and name each one after the date range seen in it.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import os | |
import re | |
from datetime import datetime | |
def parse_date_from_line(line):
    """
    Pull the calendar date out of a bracketed timestamp at the start of a line.

    A matching line begins with a stamp such as "[2/22/2025 12:12 PM] s2k:".
    Only the "M/D/YYYY" portion is used; it is returned re-formatted as the
    string "YYYY-MM-DD".

    Returns None when the line does not start with such a stamp, or when the
    captured digits do not form a real calendar date (for example month 13).
    """
    stamp = re.match(
        r'\[(\d{1,2}/\d{1,2}/\d{4})\s+\d{1,2}:\d{2}\s+[AP]M\]',
        line,
    )
    if stamp is None:
        return None

    raw_date = stamp.group(1)  # e.g. "2/22/2025"
    try:
        return datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")
    except ValueError:
        # The digits fit the pattern but are not a valid month/day/year.
        return None
def split_by_line_count_with_date(input_path, lines_per_file=20000):
    """
    Split a text file into consecutive parts of at most ``lines_per_file``
    lines, written into the same directory as the input file.

    Each part is first written to "<base>_temp_part<N><ext>" and, once
    complete, renamed to "<base>_part<N>_<first>_to_<last><ext>", where
    <first> and <last> are the first and last dates that
    parse_date_from_line() recognised in that part (in file order; they are
    not sorted). A part in which no date was recognised is named
    "<base>_part<N>_nodateinfo<ext>". An existing file with the final name is
    replaced. An empty input still produces a single empty "nodateinfo" part.

    If an error interrupts the split, parts that were already completed are
    kept and the part in progress is left behind under its temp name.

    Args:
        input_path: Path of the text file to split; it is read as UTF-8.
        lines_per_file: Maximum number of lines per output part. Must be a
            positive integer.

    Raises:
        ValueError: If ``lines_per_file`` is less than 1 (also raised, as
            UnicodeDecodeError, when the input is not valid UTF-8).
        OSError: If the input cannot be opened, or a part cannot be written
            or renamed.
    """
    if lines_per_file < 1:
        raise ValueError(
            f"lines_per_file must be a positive integer, got {lines_per_file!r}"
        )

    base_name, ext = os.path.splitext(os.path.basename(input_path))
    output_dir = os.path.dirname(os.path.abspath(input_path))

    file_counter = 1

    # The input is opened before any output so that a missing or unreadable
    # input never leaves a stray temp file behind.
    with open(input_path, 'r', encoding='utf-8') as infile:
        lines = iter(infile)
        # One-line lookahead: a new part is started only when another line
        # really exists, so an input whose length is an exact multiple of
        # lines_per_file does not get a trailing empty part.
        pending = next(lines, None)

        while True:
            date_from = None
            date_to = None
            temp_filename = os.path.join(
                output_dir, f"{base_name}_temp_part{file_counter}{ext}"
            )

            # The with-block guarantees the handle is closed even if reading
            # or writing fails part-way through.
            with open(temp_filename, 'w', encoding='utf-8') as current_out:
                lines_in_part = 0
                while pending is not None and lines_in_part < lines_per_file:
                    current_out.write(pending)
                    lines_in_part += 1

                    # Track the first and last date seen in this part.
                    parsed_date = parse_date_from_line(pending)
                    if parsed_date:
                        if date_from is None:
                            date_from = parsed_date
                        date_to = parsed_date

                    pending = next(lines, None)

            # Build the final name from the date range collected for the part.
            if date_from and date_to:
                new_name = f"{base_name}_part{file_counter}_{date_from}_to_{date_to}{ext}"
            else:
                new_name = f"{base_name}_part{file_counter}_nodateinfo{ext}"
            os.replace(temp_filename, os.path.join(output_dir, new_name))

            if pending is None:
                break
            file_counter += 1

    print(f"Split complete: {file_counter} files created.")
def main(args=None):
    """
    Command-line entry point: split the single file named on the command line.

    Args:
        args: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``.

    Returns:
        The process exit status: 0 after a successful split, 1 when the
        argument count is wrong or the named input file does not exist.
    """
    if args is None:
        args = sys.argv[1:]

    # Diagnostics go to stderr so they are never mixed into regular output.
    if len(args) != 1:
        print("Usage: python split_by_lines_with_dates.py input.txt", file=sys.stderr)
        return 1

    input_file = args[0]
    if not os.path.isfile(input_file):
        print(f"Error: File not found: {input_file}", file=sys.stderr)
        return 1

    split_by_line_count_with_date(input_file)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment