Skip to content

Instantly share code, notes, and snippets.

@imaurer
Created May 21, 2025 14:01
Show Gist options
  • Save imaurer/cd4bed5b66f6561b05b0a16dfa11c2e0 to your computer and use it in GitHub Desktop.
Save imaurer/cd4bed5b66f6561b05b0a16dfa11c2e0 to your computer and use it in GitHub Desktop.
Fix ChatGPT hyphens and spaces
#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "typer",
# ]
# ///
"""
Clean Text: Unicode Hyphen and Space Normalizer
This utility normalizes "bad" Unicode hyphens and non-breaking spaces in text files.
Purpose:
- Converts various Unicode hyphens (U+00AD, U+2010-U+2015, U+2212) to standard ASCII hyphens (-)
- Converts various non-breaking spaces (U+00A0, U+202F, U+2007) to standard spaces
- Scans directories recursively for specified file types
- Reports only files that were actually changed
- Shows detailed statistics about replaced characters
Usage:
clean_text.py [OPTIONS] PATHS...
PATHS: One or more directories to search for files
Options:
-e, --ext TEXT File extensions to process (defaults to py, md, markdown)
--help Show help message and exit
Examples:
# Process Python and Markdown files in multiple directories
clean_text.py ./docs/ ./src/ ./tests/
# Process specific file types in a single directory
clean_text.py ./content/ -e html -e txt -e rst
Exit Status:
0 - Success (files processed without errors)
1 - Error (invalid path or file operations failed)
Author: Ian Maurer
Version: 0.1
License: MIT
"""
import re
from pathlib import Path
from typing import List
import typer
# Characters folded to an ASCII hyphen: soft hyphen (U+00AD), the U+2010-U+2015
# hyphen/dash range, and the minus sign (U+2212).
_HYPHEN_PATTERN = re.compile(r"[\u00AD\u2010-\u2015\u2212]")
# Characters folded to a plain space: no-break space (U+00A0), narrow no-break
# space (U+202F), and figure space (U+2007).
_SPACE_PATTERN = re.compile(r"[\u00A0\u202F\u2007]")


def _normalize_text(content: str) -> tuple[str, int, int]:
    """Return *content* with bad hyphens and spaces replaced, plus both counts.

    The result is ``(new_content, hyphen_count, space_count)``.
    """
    content, hyphen_count = _HYPHEN_PATTERN.subn("-", content)
    content, space_count = _SPACE_PATTERN.subn(" ", content)
    return content, hyphen_count, space_count


def main(
    paths: List[str] = typer.Argument(..., help="Directories to search for files"),
    extensions: List[str] = typer.Option(
        ["py", "md", "markdown"], "--ext", "-e", help="File extensions to process"
    ),
):
    """
    Clean up bad hyphens and non-breaking spaces in text files.

    Replaces Unicode hyphens with standard ASCII hyphens.
    Replaces non-breaking spaces with regular spaces.
    Only reports files that were actually changed.
    Exits with status 1 if a path is missing or a file cannot be read or written.
    """
    # Accept both "py" and ".py": Path.suffix always carries exactly one leading
    # dot, so a user-supplied ".py" must not become "..py".
    valid_exts = {f".{ext.lstrip('.')}" for ext in extensions}

    total_files_changed = 0
    # Failures do not abort the run (best effort), but they are remembered so the
    # process can finish with a non-zero exit status.
    had_errors = False

    for path_str in paths:
        path = Path(path_str)
        if not path.exists():
            typer.echo(f"Warning: Path '{path}' does not exist. Skipping.", err=True)
            had_errors = True
            continue

        # Globbing a regular file yields nothing, so a file named directly on the
        # command line is treated as its own single candidate.
        candidates = [path] if path.is_file() else path.glob("**/*")

        for file_path in candidates:
            if not (file_path.is_file() and file_path.suffix in valid_exts):
                continue

            try:
                # newline="" turns off newline translation so the file's existing
                # line endings (LF or CRLF) survive the read/write round trip.
                with open(file_path, "r", encoding="utf-8", newline="") as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                typer.echo(f"Error reading {file_path}: {e}", err=True)
                had_errors = True
                continue

            new_content, hyphen_count, space_count = _normalize_text(content)
            total_changes = hyphen_count + space_count
            if not total_changes:
                # Nothing to fix: leave the file untouched and unreported.
                continue

            try:
                with open(file_path, "w", encoding="utf-8", newline="") as f:
                    f.write(new_content)
            except (OSError, UnicodeError) as e:
                typer.echo(f"Error writing {file_path}: {e}", err=True)
                had_errors = True
                continue

            typer.echo(
                f"Modified: {file_path} ({total_changes} replacements: {hyphen_count} hyphens, {space_count} spaces)"
            )
            total_files_changed += 1

    # Print summary
    if total_files_changed > 0:
        typer.echo(f"\nTotal files modified: {total_files_changed}")
    else:
        typer.echo("No files were modified.")

    if had_errors:
        raise typer.Exit(code=1)
# Script entry point: only runs when the file is executed directly, handing
# main to typer so it is invoked as the command-line command.
if __name__ == "__main__":
    typer.run(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment