Skip to content

Instantly share code, notes, and snippets.

@yeiichi
Last active April 14, 2025 07:05
Show Gist options
  • Save yeiichi/f10931442f0ac657fc3ef2bcebddfbc8 to your computer and use it in GitHub Desktop.
Save yeiichi/f10931442f0ac657fc3ef2bcebddfbc8 to your computer and use it in GitHub Desktop.
Normalizes the textual content of plain text or CSV files into NFKC format.
#!/usr/bin/env python3
"""Normalizes the textual content of text or CSV files into NFKC format."""
import argparse
import csv
import subprocess
from pathlib import Path
from unicodedata import normalize
# Constants
# Unicode normalization form passed to unicodedata.normalize().
NORMALIZATION_FORM = "NFKC"
# ANSI escape sequences wrapped around the "saved" status message.
COLOR_Y = "\033[93m"  # bright yellow foreground
COLOR_0 = "\033[0m"  # reset terminal attributes
# Suffixes appended to the input file's stem to name the normalized output.
NORMALIZED_TXT_FILE_SUFFIX = "_nfkc.txt"
NORMALIZED_CSV_FILE_SUFFIX = "_nfkc.csv"
OPEN_COMMAND = "open" # Executable run with the output directory as its only argument after saving
class StringNormalizer:
    """Read a text or CSV file, normalize its content, and save a copy.

    The normalized copy is written into the same directory as the input
    file. Its name is the input file's stem followed by
    ``NORMALIZED_TXT_FILE_SUFFIX`` or ``NORMALIZED_CSV_FILE_SUFFIX``.
    """

    def __init__(self, input_file, encoding="utf-8"):
        """Remember which file to normalize.

        Args:
            input_file: Path of the file to normalize (``str`` or ``Path``).
            encoding: Text encoding used both for reading the input and for
                writing the output. Given explicitly so the result does not
                depend on the locale of the machine running the script.
        """
        self.input_file = Path(input_file)
        self.encoding = encoding

    def get_mime_description(self):
        """Return the ``file`` utility's description of the input file.

        Returns:
            The stripped standard output of ``file -b <input_file>``, or an
            empty string when the ``file`` executable cannot be started.
        """
        try:
            # -b (brief) keeps the file name out of the output; otherwise a
            # path that merely contains "CSV" would look like CSV content.
            result = subprocess.run(
                ['file', '-b', str(self.input_file)],
                capture_output=True, text=True)
        except OSError:
            # `file` is not installed or not executable on this system.
            return ""
        return result.stdout.strip()

    def is_probably_csv(self):
        """Guess whether the input file holds CSV data.

        Returns:
            ``True`` if the ``file`` description mentions "CSV". When no
            description is available, ``True`` if the file name ends in
            ``.csv`` (case-insensitive); otherwise ``False``.
        """
        description = self.get_mime_description()
        if description:
            return 'CSV' in description
        # No description to inspect: fall back to the file extension.
        return self.input_file.suffix.lower() == '.csv'

    def normalize_txt_string(self):
        """Return the whole input file as one normalized string."""
        content = self.input_file.read_text(encoding=self.encoding)
        return normalize(NORMALIZATION_FORM, content)

    def normalize_csv_string(self):
        """Return the input CSV as a list of rows of normalized cells.

        Cells are normalized one by one after parsing, so the delimiters and
        quoting that were present in the input are not themselves altered.
        """
        # newline='' lets the csv module do its own newline handling.
        with self.input_file.open(newline='', encoding=self.encoding) as csvfile:
            return [
                [normalize(NORMALIZATION_FORM, cell) for cell in row]
                for row in csv.reader(csvfile)
            ]

    def _save_file(self, content, filename, is_csv=False):
        """Write ``content`` next to the input file and report the result.

        Args:
            content: A string, or a list of rows when ``is_csv`` is true.
            filename: Name of the output file, created in the input file's
                directory.
            is_csv: Write ``content`` with ``csv.writer`` instead of as text.
        """
        filepath = self.input_file.with_name(filename)
        if is_csv:
            with filepath.open('w', newline='', encoding=self.encoding) as csvfile:
                csv.writer(csvfile).writerows(content)
        else:
            filepath.write_text(content, encoding=self.encoding)
        print(f"{COLOR_Y}Normalized file saved to {filepath}{COLOR_0}")
        # Opening the output directory is a convenience only: the file has
        # already been saved, so a missing opener must not abort the run.
        try:
            subprocess.run([OPEN_COMMAND, str(filepath.parent)])
        except OSError:
            print(f"Could not run '{OPEN_COMMAND}' to open {filepath.parent}")

    def save_normalized_csv_file(self):
        """Normalize the input as CSV and save it with the CSV suffix."""
        filename = self.input_file.stem + NORMALIZED_CSV_FILE_SUFFIX
        self._save_file(self.normalize_csv_string(), filename, is_csv=True)

    def save_normalized_txt_file(self):
        """Normalize the input as plain text and save it with the text suffix."""
        filename = self.input_file.stem + NORMALIZED_TXT_FILE_SUFFIX
        self._save_file(self.normalize_txt_string(), filename)
def main(input_file):
    """Normalize ``input_file`` and save the result beside it.

    The file is saved through the CSV path when the normalizer reports it as
    probably CSV, and through the plain-text path otherwise.
    """
    normalizer = StringNormalizer(input_file)
    save = (
        normalizer.save_normalized_csv_file
        if normalizer.is_probably_csv()
        else normalizer.save_normalized_txt_file
    )
    save()
if __name__ == '__main__':
    # Command-line entry point: one positional argument, the file to normalize.
    cli = argparse.ArgumentParser(
        prog="normalize_file_nfkc",
        description="Normalize input file content using NFKC form and save the result.",
    )
    cli.add_argument(
        "input_file",
        help="Path to the input file for normalization.",
    )
    main(cli.parse_args().input_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment