Created
May 21, 2025 14:01
-
-
Save imaurer/cd4bed5b66f6561b05b0a16dfa11c2e0 to your computer and use it in GitHub Desktop.
Fix ChatGPT hyphens and spaces
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env -S uv --quiet run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "typer",
# ]
# ///
"""
Clean Text: Unicode Hyphen and Space Normalizer

This utility normalizes "bad" Unicode hyphens and non-breaking spaces in text files.

Purpose:
- Converts various Unicode hyphens (U+00AD, U+2010-U+2015, U+2212) to standard ASCII hyphens (-)
- Converts various non-breaking spaces (U+00A0, U+202F, U+2007) to standard spaces
- Scans directories recursively for specified file types
- Reports only files that were actually changed
- Shows detailed statistics about replaced characters

Usage:
    clean_text.py [OPTIONS] PATHS...

    PATHS: One or more directories to search for files

Options:
    -e, --ext TEXT  File extensions to process (defaults to py, md, markdown)
    --help          Show help message and exit

Examples:
    # Process Python and Markdown files in multiple directories
    clean_text.py ./docs/ ./src/ ./tests/

    # Process specific file types in a single directory
    clean_text.py ./content/ -e html -e txt -e rst

Exit Status:
    0 - Success (files processed without errors)
    1 - Error (invalid path or file operations failed)

Author: Ian Maurer
Version: 0.1
License: MIT
"""
import re | |
from pathlib import Path | |
from typing import List | |
import typer | |
def main(
    paths: List[str] = typer.Argument(..., help="Directories to search for files"),
    extensions: List[str] = typer.Option(
        ["py", "md", "markdown"], "--ext", "-e", help="File extensions to process"
    ),
):
    """
    Clean up bad hyphens and non-breaking spaces in text files.

    Replaces Unicode hyphens with standard ASCII hyphens.
    Replaces non-breaking spaces with regular spaces.
    Only reports files that were actually changed.

    Each PATH may be a directory (searched recursively) or a single file.
    Extensions may be given with or without a leading dot.

    Exits with status 1 if any path does not exist or any file could not be
    read or written; otherwise exits with status 0.
    """
    # Compiled once up front; both patterns are reused for every file.
    hyphen_pattern = re.compile(r"[\u00AD\u2010-\u2015\u2212]")
    space_pattern = re.compile(r"[\u00A0\u202F\u2007]")

    # Path.suffix always carries exactly one leading dot, so normalize user
    # input such as ".py" and "py" to the same ".py" entry.
    valid_exts = {f".{ext.lstrip('.')}" for ext in extensions}

    total_files_changed = 0
    had_errors = False

    for path_str in paths:
        path = Path(path_str)
        if not path.exists():
            typer.echo(f"Warning: Path '{path}' does not exist. Skipping.", err=True)
            had_errors = True
            continue

        # Globbing a regular file yields nothing, so handle an explicit file
        # argument directly instead of silently ignoring it.
        candidates = [path] if path.is_file() else path.glob("**/*")

        for file_path in candidates:
            if not (file_path.is_file() and file_path.suffix in valid_exts):
                continue

            # newline="" disables newline translation so that the file's
            # original line endings (LF or CRLF) survive the round trip.
            try:
                with open(file_path, "r", encoding="utf-8", newline="") as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                typer.echo(f"Error reading {file_path}: {e}", err=True)
                had_errors = True
                continue

            new_content, hyphen_count = hyphen_pattern.subn("-", content)
            new_content, space_count = space_pattern.subn(" ", new_content)
            total_changes = hyphen_count + space_count

            # Leave untouched files alone so their mtime is not disturbed.
            if total_changes == 0:
                continue

            try:
                with open(file_path, "w", encoding="utf-8", newline="") as f:
                    f.write(new_content)
            except (OSError, UnicodeError) as e:
                typer.echo(f"Error writing {file_path}: {e}", err=True)
                had_errors = True
                continue

            typer.echo(
                f"Modified: {file_path} ({total_changes} replacements: {hyphen_count} hyphens, {space_count} spaces)"
            )
            total_files_changed += 1

    # Print summary
    if total_files_changed > 0:
        typer.echo(f"\nTotal files modified: {total_files_changed}")
    else:
        typer.echo("No files were modified.")

    # Report failures through the process exit status, as the usage text
    # promises, so that callers and CI scripts can detect them.
    if had_errors:
        raise typer.Exit(code=1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    typer.run(main)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment