Created
March 6, 2025 15:36
-
-
Save simonw/25ef58e568511ff2e63e9747fb3515e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# requires-python = ">=3.12"
# dependencies = [
#     "click",
# ]
# ///
import json
import os
import re
import sys

import click
# A line consisting solely of five or more underscores separates the body
# from the trailing metadata section.
_DELIMITER_RE = re.compile(r'\n_{5,}\n')


def _parse_metadata(metadata_text):
    """Turn "key value" lines into a dict, folding indented continuation lines.

    A line that does not start with a space and contains at least two
    whitespace-separated fields starts a new entry: the first field is the
    key, the remainder is the value. A non-blank line that starts with a
    space is appended to the most recent entry, joined with ``',\\n'``.
    Any other line (blank, or a lone word with no value) is ignored.
    """
    metadata = {}
    current_key = None
    for line in metadata_text.split('\n'):
        is_continuation = line.startswith(' ')
        fields = line.split(None, 1)
        if len(fields) == 2 and not is_continuation:
            current_key, value = fields
            metadata[current_key] = value.strip()
        elif current_key and is_continuation and line.strip():
            # The continuation text is stripped, so its original indentation
            # is not kept; entries are separated by a comma and a newline.
            metadata[current_key] += ',\n' + line.strip()
    return metadata


def parse_file(file_path):
    """Parse a text file with content and metadata separated by a line of 5+ underscores.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A dict that always contains ``"content"`` and ``"file_path"``.
        When the file has exactly one delimiter line, the metadata entries
        found after it are included as additional keys (``"content"`` and
        ``"file_path"`` take precedence over metadata keys of the same name).
        When the file has no delimiter, or more than one, the whole stripped
        file text is returned as ``"content"`` with no metadata.
        If anything goes wrong, the error is reported on stderr and the dict
        has an empty ``"content"`` plus an ``"error"`` message instead of the
        exception propagating.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        sections = _DELIMITER_RE.split(content)
        if len(sections) != 2:
            # No unambiguous delimiter: treat the whole file as content.
            return {"content": content.strip(), "file_path": file_path}

        body, metadata_text = (section.strip() for section in sections)
        metadata = _parse_metadata(metadata_text)
        metadata["content"] = body
        metadata["file_path"] = file_path
        return metadata
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort a
        # whole directory run, so report it and return an error record.
        print(f"Error processing {file_path}: {e}", file=sys.stderr)
        return {"content": "", "file_path": file_path, "error": str(e)}
@click.command()
@click.argument('directory', type=click.Path(exists=True, file_okay=False, dir_okay=True))
def process_files(directory):
    """
    Process text files in a directory recursively and output JSON objects for each file.
    Each JSON object contains metadata from the file and its content.
    """
    # Flatten the recursive walk into a single lazy stream of file paths.
    # Every file is included; the extension is deliberately not checked.
    all_paths = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    )
    for path in all_paths:
        # Emit one JSON document per line.
        print(json.dumps(parse_file(path)))
# Script entry point: click parses the command line and invokes the command.
# `sys` is imported with the other top-of-file imports rather than here, so
# that functions using it also work when this module is imported.
if __name__ == '__main__':
    process_files()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Created with Claude: https://claude.ai/share/425edd95-87a9-4657-bd26-50921fb85508
Run like this: