Created
April 2, 2024 11:39
-
-
Save bertsky/8c50d98a8ee8babdfc03ecc1da686c5a to your computer and use it in GitHub Desktop.
Aggregate character histogram for the given text files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
import os | |
import sys | |
import io | |
from functools import reduce | |
import json | |
import unicodedata | |
# Command line interface: zero or more input paths plus options selecting the
# output format, the output order, verbosity and Unicode normalization.
arg_parser = argparse.ArgumentParser(
    description='Aggregate character histogram for the given text files.',
)
arg_parser.add_argument(
    "path",
    nargs='*',
    help="file or directory of text file(s)",
)
arg_parser.add_argument(
    "-f", "--format",
    choices=["json", "csv"],
    default="csv",
    help="output format",
)
arg_parser.add_argument(
    "-o", "--order",
    choices=["char", "freq"],
    default="freq",
    help="output order",
)
arg_parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    help="show Unicode codepoint names",
)
arg_parser.add_argument(
    "-n", "--normalization",
    choices=["", "NFC", "NFKC", "NFD", "NFKD"],
    default="",  # empty string means: leave the text as it is
    help="Unicode normalization form",
)
# Parsed once at module level; the functions below read `args` directly.
args = arg_parser.parse_args()
def update(histogram, char):
    """Count one occurrence of ``char`` in ``histogram``.

    The mapping is modified in place: a character seen for the first time
    starts at 1, otherwise its existing count is incremented.

    Returns the same ``histogram`` object, so the function can be used as
    the accumulator step of ``functools.reduce``.
    """
    histogram[char] = histogram.get(char, 0) + 1
    return histogram
def normalize(text):
    """Return ``text`` in the Unicode normalization form given on the command line.

    The form is read from ``args.normalization``; when it is empty, ``text``
    is returned unchanged.
    """
    form = args.normalization
    return unicodedata.normalize(form, text) if form else text
# Read every input, normalize its text if requested, and add its characters
# to the histogram. Inputs are only read, never written back.
histogram = {}
for path in args.path:
    if os.path.isdir(path):
        # Read the direct entries of the directory (no recursion). Nested
        # directories are skipped, since opening one as a text file would
        # raise IsADirectoryError and abort the whole run.
        entries = (os.path.join(path, name) for name in os.listdir(path))
        filepaths = [entry for entry in entries if not os.path.isdir(entry)]
    else:
        filepaths = [path]
    for filepath in filepaths:
        if filepath == '-':
            # "-" stands for standard input. It is read without a `with`
            # block so that stdin is not closed, which would make a second
            # "-" argument fail on a closed file.
            text = sys.stdin.read()
        else:
            with io.open(filepath, "r", encoding="utf-8") as file:
                text = file.read()
        histogram = reduce(update, normalize(text), histogram)
def sort(charfreq):
    """Sort key for one ``(char, freq)`` histogram item.

    Returns the character when ``args.order`` is ``'char'``, the count when
    it is ``'freq'``, and ``None`` for any other value.
    """
    position = {'char': 0, 'freq': 1}.get(args.order)
    if position is None:
        return None
    return charfreq[position]
# Order the histogram as requested, then report it either as JSON or as a
# tab-separated table.
histogram = dict(sorted(histogram.items(), key=sort))
total = sum(histogram.values())
if args.format == 'json':
    print(json.dumps(histogram, indent=2))
else:
    # The two formats are mutually exclusive: a bare `exit` expression does
    # not terminate the script, so the table is kept in an explicit `else`
    # branch to stop it being printed after the JSON document.
    # The header ends with an explicit newline on top of the one print()
    # adds, so a blank line separates it from the data rows.
    print("char\tfreq#\tfreq%\tchar name\n")
    for char, freq in histogram.items():
        if args.verbose:
            try:
                name = unicodedata.name(char)
            except ValueError:
                # unicodedata.name() raises ValueError for characters that
                # have no name assigned.
                name = "unmapped Unicode char " + repr(char)
        else:
            name = ""
        # `total` cannot be zero here: the loop body only runs when the
        # histogram holds at least one counted character.
        print(f"{char}\t{freq}\t{freq/total*100:2.4f}\t{name}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment