bertsky · April 2, 2024 11:39
diff --git a/charfreq.py b/charfreq.py
 #!/usr/bin/env python3

 import argparse
 import os
 import sys
 import io
 from functools import reduce
 import json
 import unicodedata

 # Command-line interface: positional inputs plus output and normalization options.
 arg_parser = argparse.ArgumentParser(
     description='Aggregate character histogram for the given text files.',
 )
 arg_parser.add_argument(
     "path",
     nargs='*',
     help="file or directory of text file(s)",
 )
 arg_parser.add_argument(
     "-f", "--format",
     choices=["json", "csv"],
     default="csv",
     help="output format",
 )
 arg_parser.add_argument(
     "-o", "--order",
     choices=["char", "freq"],
     default="freq",
     help="output order",
 )
 arg_parser.add_argument(
     "-v", "--verbose",
     action="store_true",
     help="show Unicode codepoint names",
 )
 # The empty string (the default) selects no normalization.
 arg_parser.add_argument(
     "-n", "--normalization",
     choices=["", "NFC", "NFKC", "NFD", "NFKD"],
     default="",
     help="Unicode normalization form",
 )

 # Parsed once at module level; the rest of the script reads options from ``args``.
 args = arg_parser.parse_args()

 def update(histogram, char):
     """Count one occurrence of ``char`` in ``histogram``.

     The mapping is modified in place and also returned, so the function can
     be used as the folding step of ``functools.reduce``.
     """
     histogram[char] = histogram.get(char, 0) + 1
     return histogram

 def normalize(text):
     """Return ``text`` in the Unicode normalization form chosen on the command line.

     When ``args.normalization`` is empty, the text is returned unchanged.
     """
     form = args.normalization
     return unicodedata.normalize(form, text) if form else text

 # Read every input and fold its (optionally normalized) characters into one
 # histogram.  Inputs are only read; nothing is written back to them.
 histogram = {}
 for path in args.path:
     if os.path.isdir(path):
         # Only the directory's immediate regular files are read: os.listdir()
         # also returns subdirectories, and opening one as text would raise.
         # Sorting makes the visiting order (and thus tie order in the output)
         # reproducible, since os.listdir() returns entries in arbitrary order.
         entries = (os.path.join(path, name) for name in sorted(os.listdir(path)))
         sources = [entry for entry in entries if os.path.isfile(entry)]
     else:
         sources = [path]
     for source in sources:
         if source == '-':
             # "-" means standard input; read it without closing the stream.
             text = sys.stdin.read()
         else:
             with io.open(source, "r", encoding="utf-8") as file:
                 text = file.read()
         histogram = reduce(update, normalize(text), histogram)

 def sort(charfreq):
     """Sort key for a ``(char, freq)`` pair, selected by ``args.order``.

     Returns the character for ``'char'`` order and the frequency for
     ``'freq'`` order; for any other value the result is ``None``.
     """
     position = {'char': 0, 'freq': 1}.get(args.order)
     if position is None:
         return None
     return charfreq[position]

 # Reorder the histogram by the key selected with --order.
 histogram = dict(sorted(histogram.items(), key=sort))

 # Total number of counted characters; denominator of the percentage column.
 total = sum(histogram.values())

 if args.format == 'json':
     print(json.dumps(histogram, indent=2))
     # Stop here so the CSV table is not printed after the JSON document.
     # (A bare ``exit`` is only a name lookup and does not end the script.)
     sys.exit()
 # The header literal itself ends with a newline, so an empty line follows it.
 print("char\tfreq#\tfreq%\tchar name\n")
 for char, freq in histogram.items():
     if args.verbose:
         try:
             name = unicodedata.name(char)
         except ValueError:
             # unicodedata.name() raises for characters without a name.
             name = "unmapped Unicode char " + repr(char)
     else:
         name = ""
     print(f"{char}\t{freq}\t{freq/total*100:2.4f}\t{name}")
	#!/usr/bin/env python3

	import argparse
	import os
	import sys
	import io
	from functools import reduce
	import json
	import unicodedata

	# Command-line interface: positional inputs plus output and normalization options.
	arg_parser = argparse.ArgumentParser(
	    description='Aggregate character histogram for the given text files.',
	)
	arg_parser.add_argument(
	    "path",
	    nargs='*',
	    help="file or directory of text file(s)",
	)
	arg_parser.add_argument(
	    "-f", "--format",
	    choices=["json", "csv"],
	    default="csv",
	    help="output format",
	)
	arg_parser.add_argument(
	    "-o", "--order",
	    choices=["char", "freq"],
	    default="freq",
	    help="output order",
	)
	arg_parser.add_argument(
	    "-v", "--verbose",
	    action="store_true",
	    help="show Unicode codepoint names",
	)
	# The empty string (the default) selects no normalization.
	arg_parser.add_argument(
	    "-n", "--normalization",
	    choices=["", "NFC", "NFKC", "NFD", "NFKD"],
	    default="",
	    help="Unicode normalization form",
	)

	# Parsed once at module level; the rest of the script reads options from ``args``.
	args = arg_parser.parse_args()

	def update(histogram, char):
	    """Count one occurrence of ``char`` in ``histogram``.

	    The mapping is modified in place and also returned, so the function can
	    be used as the folding step of ``functools.reduce``.
	    """
	    histogram[char] = histogram.get(char, 0) + 1
	    return histogram

	def normalize(text):
	    """Return ``text`` in the Unicode normalization form chosen on the command line.

	    When ``args.normalization`` is empty, the text is returned unchanged.
	    """
	    if not args.normalization:
	        return text
	    return unicodedata.normalize(args.normalization, text)

	# Read every input and fold its (optionally normalized) characters into one
	# histogram.  Inputs are only read; nothing is written back to them.
	histogram = {}
	for path in args.path:
	    if os.path.isdir(path):
	        # Only the directory's immediate regular files are read: os.listdir()
	        # also returns subdirectories, and opening one as text would raise.
	        # Sorting makes the visiting order (and thus tie order in the output)
	        # reproducible, since os.listdir() returns entries in arbitrary order.
	        entries = (os.path.join(path, name) for name in sorted(os.listdir(path)))
	        sources = [entry for entry in entries if os.path.isfile(entry)]
	    else:
	        sources = [path]
	    for source in sources:
	        if source == '-':
	            # "-" means standard input; read it without closing the stream.
	            text = sys.stdin.read()
	        else:
	            with io.open(source, "r", encoding="utf-8") as file:
	                text = file.read()
	        histogram = reduce(update, normalize(text), histogram)

	def sort(charfreq):
	    """Sort key for a ``(char, freq)`` pair, selected by ``args.order``.

	    Returns the character for ``'char'`` order and the frequency for
	    ``'freq'`` order; for any other value the result is ``None``.
	    """
	    if args.order == 'char':
	        return charfreq[0]
	    if args.order == 'freq':
	        return charfreq[1]

	# Reorder the histogram by the key selected with --order.
	histogram = dict(sorted(histogram.items(), key=sort))

	# Total number of counted characters; denominator of the percentage column.
	total = sum(histogram.values())

	if args.format == 'json':
	    print(json.dumps(histogram, indent=2))
	    # Stop here so the CSV table is not printed after the JSON document.
	    # (A bare ``exit`` is only a name lookup and does not end the script.)
	    sys.exit()
	# The header literal itself ends with a newline, so an empty line follows it.
	print("char\tfreq#\tfreq%\tchar name\n")
	for char, freq in histogram.items():
	    if args.verbose:
	        try:
	            name = unicodedata.name(char)
	        except ValueError:
	            # unicodedata.name() raises for characters without a name.
	            name = "unmapped Unicode char " + repr(char)
	    else:
	        name = ""
	    print(f"{char}\t{freq}\t{freq/total*100:2.4f}\t{name}")
No results found