Skip to content

Instantly share code, notes, and snippets.

@simonw
Created March 6, 2025 15:36
Show Gist options
  • Save simonw/25ef58e568511ff2e63e9747fb3515e4 to your computer and use it in GitHub Desktop.
Save simonw/25ef58e568511ff2e63e9747fb3515e4 to your computer and use it in GitHub Desktop.
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "click",
# ]
# ///
import os
import json
import re
import click
def _parse_metadata(metadata_text):
    """Parse the metadata section into a dict mapping keys to string values.

    A line that does not start with a space and contains at least two
    whitespace-separated tokens starts a new entry: the first token is the
    key, the remainder of the line is the value.  A non-blank line that
    starts with a space continues the most recent entry; its stripped text
    is appended to that entry's value after a ``',\\n'`` separator.  Any
    other line (blank, or a lone token with no value) is ignored.
    """
    metadata = {}
    current_key = None
    for line in metadata_text.split('\n'):
        indented = line.startswith(' ')
        fields = line.split(None, 1)
        if len(fields) == 2 and not indented:
            # New "key value" pair.
            current_key, value = fields
            metadata[current_key] = value.strip()
        elif current_key and indented and line.strip():
            # Continuation of the previous value (indentation is dropped).
            metadata[current_key] += ',\n' + line.strip()
    return metadata


def parse_file(file_path):
    """Parse a text file with content and metadata separated by a line of 5+ underscores.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        A dict that always has ``"content"`` and ``"file_path"`` keys:

        * If the file splits into exactly two parts around the delimiter
          line, the dict holds the parsed metadata entries plus
          ``"content"`` (the stripped text before the delimiter).
          ``"content"`` and ``"file_path"`` are assigned last, so they
          override metadata entries with the same names.
        * If the delimiter is absent or occurs more than once, the whole
          stripped file is returned as ``"content"`` with no metadata.
        * If the file cannot be read or decoded, the error is reported on
          stderr and the dict has an empty ``"content"`` plus an
          ``"error"`` key holding the exception message.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        # Imported locally so this handler also works when the module is
        # imported rather than run as a script (the file only imports sys
        # inside its ``__main__`` guard).
        import sys

        print(f"Error processing {file_path}: {e}", file=sys.stderr)
        return {"content": "", "file_path": file_path, "error": str(e)}

    # Split on a line consisting of 5 or more underscores.
    sections = re.split(r'\n_{5,}\n', content)
    if len(sections) != 2:
        # No single clear delimiter: treat the whole file as content.
        return {"content": content.strip(), "file_path": file_path}

    main_content, metadata_text = (section.strip() for section in sections)
    metadata = _parse_metadata(metadata_text)
    metadata["content"] = main_content
    metadata["file_path"] = file_path
    return metadata
@click.command()
@click.argument(
    'directory',
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
)
def process_files(directory):
    """
    Process text files in a directory recursively and output JSON objects for each file.
    Each JSON object contains metadata from the file and its content.
    """
    # NOTE: click uses the docstring above as the command's --help text.
    for dirpath, _subdirs, filenames in os.walk(directory):
        # Every file is parsed, whatever its extension.  The generator is
        # lazy, so each file is parsed immediately before its line is printed.
        records = (
            parse_file(os.path.join(dirpath, filename))
            for filename in filenames
        )
        for record in records:
            # One JSON object per output line (newline-delimited JSON).
            print(json.dumps(record))
# Script entry point: runs the click command only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    # Binds ``sys`` at module level for the duration of the script run.
    import sys

    process_files()
@simonw
Copy link
Author

simonw commented Mar 6, 2025

Created with Claude: https://claude.ai/share/425edd95-87a9-4657-bd26-50921fb85508

Run like this:

git clone https://github.com/Lyrics/lyrics-database
cd lyrics-database
uv run parse.py lyrics-database | sqlite-utils insert lyrics.db lyrics - --nl --alter

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment