Created
January 9, 2020 13:17
-
-
Save xdlg/f35a120224c0c3bb3c830cdeba5222c2 to your computer and use it in GitHub Desktop.
Simple script to count character frequencies in text files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import argparse | |
import string | |
from os.path import join | |
from glob import glob | |
from collections import Counter | |
def _collect_file_paths(directory, extensions):
    """Return the sorted, de-duplicated regular files matching *extensions*.

    Args:
        directory: Root directory; it is searched recursively.
        extensions: Comma-separated extensions without the dot, e.g. "c,h".
            Whitespace around each extension is ignored.

    Returns:
        A sorted list of paths to regular files under *directory* whose
        names end in one of the extensions.
    """
    # Only `join` is imported from os.path at file level, so pull in
    # `isfile` locally to keep this block self-contained.
    from os.path import isfile

    # A set guards against double-counting when an extension is repeated.
    matches = set()
    for extension in extensions.split(','):
        pattern = "**/*." + extension.strip()
        matches.update(glob(join(directory, pattern), recursive=True))

    # glob also returns directories whose name matches the pattern (e.g. a
    # folder called "build.c"); opening those would raise. Sorting makes the
    # order of equally frequent characters in the output reproducible.
    return sorted(path for path in matches if isfile(path))


def _count_characters(file_paths):
    """Return a Counter of every character found in the given files.

    Undecodable bytes are replaced instead of raising, so a single stray
    binary file does not abort the whole scan.
    """
    counter = Counter()
    for file_path in file_paths:
        # The context manager closes each file promptly.
        with open(file_path, 'r', errors="replace") as text_file:
            counter.update(text_file.read())
    return counter


def main():
    """Count character frequencies in files and print them, most common first.

    Command-line arguments:
        directory: directory to scan (recursively).
        extensions: comma-separated file extensions, e.g. "c,h".
        -l/--letters, -d/--digits, -p/--punctuation, -w/--whitespace:
            select which character classes are reported. Only characters
            belonging to a selected class are printed.

    Each output line has the form ``<repr of char> \\t <count>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("directory", help="directory to scan")
    parser.add_argument("extensions", help="file extensions, e.g. \"c,h\"")
    parser.add_argument("-l", "--letters", help="count ASCII letters",
                        action="store_true")
    parser.add_argument("-d", "--digits", help="count digits",
                        action="store_true")
    parser.add_argument("-p", "--punctuation", help="count punctuation",
                        action="store_true")
    parser.add_argument("-w", "--whitespace", help="count whitespace",
                        action="store_true")
    args = parser.parse_args()

    # Gather the files, then accumulate the character counts over all of them
    file_paths = _collect_file_paths(args.directory, args.extensions)
    counter = _count_characters(file_paths)

    # Characters the user asked to see; a set gives O(1) membership tests
    chars_to_count = set(
        (string.ascii_letters if args.letters else "")
        + (string.digits if args.digits else "")
        + (string.punctuation if args.punctuation else "")
        + (string.whitespace if args.whitespace else "")
    )

    # Print results, highest count first
    for char, count in counter.most_common():
        if char in chars_to_count:
            print(f"{repr(char)} \t {count}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment