Skip to content

Instantly share code, notes, and snippets.

@nathanshipley
Created June 19, 2025 02:32
Show Gist options
  • Save nathanshipley/e69daba2d808e5f68ef3e8b8a3a44825 to your computer and use it in GitHub Desktop.
Save nathanshipley/e69daba2d808e5f68ef3e8b8a3a44825 to your computer and use it in GitHub Desktop.
Split a large text file into smaller parts and name each part after the date range found in it.
#!/usr/bin/env python3
import sys
import os
import re
from datetime import datetime
def parse_date_from_line(line):
    """Return the date of a leading chat timestamp as ``"YYYY-MM-DD"``.

    The line must *start* with a stamp shaped like ``[2/22/2025 12:12 PM]``
    (month/day/four-digit year, then a 12-hour time with an upper-case
    AM/PM marker). Only the date portion is used.

    Returns ``None`` when the line does not begin with such a stamp, or
    when the captured month/day/year cannot be parsed as a real date.
    """
    stamp = re.match(r'\[(\d{1,2}/\d{1,2}/\d{4})\s+\d{1,2}:\d{2}\s+[AP]M\]', line)

    iso_date = None
    if stamp is not None:
        raw_date = stamp.group(1)  # e.g. "2/22/2025"
        try:
            iso_date = datetime.strptime(raw_date, "%m/%d/%Y").strftime("%Y-%m-%d")
        except ValueError:
            # Looked like a date but is not a valid one (e.g. month 13).
            iso_date = None
    return iso_date
def _part_filename(base_name, ext, part_number, date_from, date_to):
    """Build the final file name for one part from the dates seen in it."""
    if date_from and date_to:
        return f"{base_name}_part{part_number}_{date_from}_to_{date_to}{ext}"
    return f"{base_name}_part{part_number}_nodateinfo{ext}"


def split_by_line_count_with_date(input_path, lines_per_file=20000):
    """Split a UTF-8 text file into parts of at most ``lines_per_file`` lines.

    Parts are written into the directory that contains ``input_path``.
    While a part is being filled it is named ``<base>_temp_part<N><ext>``;
    when it is complete it is renamed to
    ``<base>_part<N>_<first>_to_<last><ext>``, where ``<first>`` and
    ``<last>`` are the first and last dates that ``parse_date_from_line``
    recognised in that part, or to ``<base>_part<N>_nodateinfo<ext>`` when
    no line in the part carried a date. Existing files with the same final
    name are overwritten. An empty input still produces a single empty
    ``nodateinfo`` part.

    Args:
        input_path: Path of the text file to split.
        lines_per_file: Maximum number of lines per part; must be >= 1.

    Raises:
        ValueError: If ``lines_per_file`` is less than 1.
        OSError: If the input cannot be read or a part cannot be written
            or renamed. The unfinished temporary part is removed first;
            parts that were already completed are left in place.
    """
    if lines_per_file < 1:
        raise ValueError(
            f"lines_per_file must be at least 1, got {lines_per_file!r}"
        )

    base_name, ext = os.path.splitext(os.path.basename(input_path))
    output_dir = os.path.dirname(os.path.abspath(input_path))

    file_counter = 1
    lines_in_part = 0
    date_from = None
    date_to = None
    temp_filename = os.path.join(
        output_dir, f"{base_name}_temp_part{file_counter}{ext}"
    )
    # newline='' on both ends disables newline translation so every part
    # keeps the exact line endings of the input (CRLF stays CRLF).
    current_out = open(temp_filename, 'w', encoding='utf-8', newline='')
    try:
        with open(input_path, 'r', encoding='utf-8', newline='') as infile:
            for line in infile:
                # The current part is full: finalize it and start the next
                # one. Checking *before* writing means an input whose length
                # is an exact multiple of lines_per_file does not leave a
                # trailing empty part.
                if lines_in_part == lines_per_file:
                    current_out.close()
                    final_name = _part_filename(
                        base_name, ext, file_counter, date_from, date_to
                    )
                    os.replace(
                        temp_filename, os.path.join(output_dir, final_name)
                    )

                    file_counter += 1
                    lines_in_part = 0
                    date_from = None
                    date_to = None
                    temp_filename = os.path.join(
                        output_dir, f"{base_name}_temp_part{file_counter}{ext}"
                    )
                    current_out = open(
                        temp_filename, 'w', encoding='utf-8', newline=''
                    )

                current_out.write(line)
                lines_in_part += 1

                # Track the first and last dated line of this part.
                parsed_date = parse_date_from_line(line)
                if parsed_date:
                    if date_from is None:
                        date_from = parsed_date
                    date_to = parsed_date

        # Finalize the last (possibly partial or empty) part.
        current_out.close()
        final_name = _part_filename(
            base_name, ext, file_counter, date_from, date_to
        )
        os.replace(temp_filename, os.path.join(output_dir, final_name))
    except BaseException:
        # Do not leak the handle or leave a half-written temp file behind.
        current_out.close()
        try:
            os.remove(temp_filename)
        except OSError:
            # Best effort: the temp file may already be renamed or missing.
            pass
        raise

    print(f"Split complete: {file_counter} files created.")
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the file to split.
    if len(sys.argv) != 2:
        print("Usage: python split_by_lines_with_dates.py input.txt")
        sys.exit(1)

    input_file = sys.argv[1]
    if os.path.isfile(input_file):
        # Split with the default chunk size, writing parts next to the input.
        split_by_line_count_with_date(input_file)
    else:
        print(f"Error: File not found: {input_file}")
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment