Last active
April 14, 2025 07:05
-
-
Save yeiichi/f10931442f0ac657fc3ef2bcebddfbc8 to your computer and use it in GitHub Desktop.
Normalizes the textual content of plain text or CSV files into NFKC format.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Normalizes the textual content of text or CSV files into NFKC format."""
import argparse
import csv
import subprocess
import sys
from pathlib import Path
from unicodedata import normalize
# Constants
NORMALIZATION_FORM = "NFKC"  # Form name handed to unicodedata.normalize()
COLOR_Y = "\033[93m"  # ANSI escape sequence wrapped around the status message
COLOR_0 = "\033[0m"  # ANSI escape sequence that resets terminal attributes
NORMALIZED_TXT_FILE_SUFFIX = "_nfkc.txt"
NORMALIZED_CSV_FILE_SUFFIX = "_nfkc.csv"
OPEN_COMMAND = "open"  # Abstract command for opening directories


class StringNormalizer:
    """
    Handles reading, normalizing, and saving string content of a file.

    The input is treated either as plain text or as CSV (decided with the
    external ``file`` utility) and the normalized result is written next to
    the input file with an ``_nfkc`` suffix.
    """

    def __init__(self, input_file, encoding="utf-8", open_folder=True):
        """
        Args:
            input_file: Path (``str`` or ``Path``) of the file to normalize.
            encoding: Text encoding used for every read and write. It is
                explicit so results do not depend on the process locale.
            open_folder: When true, the output folder is opened with
                ``OPEN_COMMAND`` after a file has been saved.
        """
        self.input_file = Path(input_file)
        self.encoding = encoding
        self.open_folder = open_folder

    def get_mime_description(self):
        """
        Return the ``file`` utility's description of the input file.

        Returns:
            The stripped standard output of ``file -b``, or an empty string
            when the ``file`` executable cannot be run.
        """
        try:
            result = subprocess.run(
                # -b (brief) keeps the file name out of the output, so a name
                # such as "CSV_notes.txt" cannot be mistaken for a CSV
                # description; "--" stops a leading "-" in the path from
                # being parsed as an option.
                ['file', '-b', '--', str(self.input_file)],
                capture_output=True, text=True)
        except OSError:
            # The `file` executable is unavailable; let the caller fall back.
            return ''
        return result.stdout.strip()

    def is_probably_csv(self):
        """
        Guess whether the input file is CSV.

        Returns:
            True when the ``file`` description mentions ``CSV``. If no
            description could be obtained, the decision falls back to a
            case-insensitive check for a ``.csv`` extension.
        """
        description = self.get_mime_description()
        if description:
            return 'CSV' in description
        return self.input_file.suffix.lower() == '.csv'

    def normalize_txt_string(self):
        """Return the whole input file as a single normalized string."""
        content = self.input_file.read_text(encoding=self.encoding)
        return normalize(NORMALIZATION_FORM, content)

    def normalize_csv_string(self):
        """
        Return the input file parsed as CSV with every cell normalized.

        Returns:
            A list of rows, each row being a list of normalized cell strings.
        """
        # newline='' lets the csv module handle line endings itself.
        with self.input_file.open(newline='', encoding=self.encoding) as csvfile:
            return [
                [normalize(NORMALIZATION_FORM, cell) for cell in row]
                for row in csv.reader(csvfile)
            ]

    def _save_file(self, content, filename, is_csv=False):
        """
        Helper to save content to a file and print status.

        Args:
            content: A string for text output, or an iterable of rows for
                CSV output.
            filename: Name of the output file; it is created in the same
                directory as the input file.
            is_csv: Write ``content`` with ``csv.writer`` instead of as text.
        """
        filepath = self.input_file.with_name(filename)
        if is_csv:
            with filepath.open('w', newline='', encoding=self.encoding) as csvfile:
                csv.writer(csvfile).writerows(content)
        else:
            filepath.write_text(content, encoding=self.encoding)
        print(f"{COLOR_Y}Normalized file saved to {filepath}{COLOR_0}")
        if self.open_folder:
            self._open_folder(filepath.parent)

    def _open_folder(self, folder):
        """Open ``folder`` with ``OPEN_COMMAND``; failure to launch is reported, not raised."""
        try:
            subprocess.run([OPEN_COMMAND, str(folder)])
        except OSError as err:
            # The output file is already saved at this point, so a missing
            # opener command must not abort the run.
            print(f"Could not open {folder} with '{OPEN_COMMAND}': {err}",
                  file=sys.stderr)

    def save_normalized_csv_file(self):
        """Normalize the input as CSV and save it as ``<stem>_nfkc.csv``."""
        filename = self.input_file.stem + NORMALIZED_CSV_FILE_SUFFIX
        self._save_file(self.normalize_csv_string(), filename, is_csv=True)

    def save_normalized_txt_file(self):
        """Normalize the input as plain text and save it as ``<stem>_nfkc.txt``."""
        filename = self.input_file.stem + NORMALIZED_TXT_FILE_SUFFIX
        self._save_file(self.normalize_txt_string(), filename)
def main(input_file):
    """Normalize ``input_file`` and save the result, as CSV or as plain text.

    A ``StringNormalizer`` is built for the path; its CSV check decides
    which of its two save methods is invoked.
    """
    worker = StringNormalizer(input_file)
    save = (
        worker.save_normalized_csv_file
        if worker.is_probably_csv()
        else worker.save_normalized_txt_file
    )
    save()
if __name__ == '__main__':
    # Script entry point: parse the single positional path and hand it to main().
    cli_parser = argparse.ArgumentParser(
        description="Normalize input file content using NFKC form and save the result.",
        prog="normalize_file_nfkc",
    )
    cli_parser.add_argument(
        "input_file",
        help="Path to the input file for normalization.",
    )
    main(cli_parser.parse_args().input_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment