Last active
May 24, 2019 10:58
-
-
Save youfou/8c36923a972727049991a0a9a88857f6 to your computer and use it in GitHub Desktop.
A script for computing word-frequency statistics.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# coding: utf-8 | |
import os | |
import logging | |
# Configure the root logger once at import time: INFO and above, each record
# rendered as "[LEVEL] message".
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] %(message)s')

# Program metadata; printed by the command-line entry point for -v/--version.
__title__ = 'word-freq'
__version__ = '0.1.0'
__author__ = 'Youfou'
def get_arg_parser():
    """Build the command-line argument parser for the word-frequency tool.

    Returns:
        argparse.ArgumentParser: a parser exposing

        * ``input_path`` -- zero or more files or directories to scan,
        * ``-r/--recur`` -- flag, descend into sub folders,
        * ``-m/--min_chars`` -- non-negative int, minimum word length (default 2),
        * ``-o/--output`` -- directory for the results (default ``'.'``),
        * ``-v/--version`` -- flag, request the version banner.
    """
    import argparse

    def min_chars_type(value):
        # Validate at the CLI boundary: the value is later spliced into a
        # regex repetition count, where a negative number does not form a
        # valid quantifier and would silently produce an empty result.
        try:
            number = int(value)
        except ValueError:
            # Keep argparse's usual wording for non-integer input.
            raise argparse.ArgumentTypeError(
                'invalid int value: {!r}'.format(value))
        if number < 0:
            raise argparse.ArgumentTypeError(
                'must not be negative: {}'.format(number))
        return number

    ap = argparse.ArgumentParser(
        description='Count word frequencies in text files.')

    # nargs='*' lets the parser accept an empty command line, so the caller
    # can decide what to do when no path is given.
    ap.add_argument(
        'input_path', type=str, nargs='*',
        help='files or dirs to scan (required)')

    ap.add_argument(
        '-r', '--recur', action='store_true', default=False,
        help='recur files in sub folders (default: parent folder only)'
    )

    ap.add_argument(
        '-m', '--min_chars', type=min_chars_type, default=2,
        help='specify min characters as a word (default: 2)'
    )

    ap.add_argument(
        '-o', '--output', type=str, default='.', metavar='dir',
        help='specify which dir to save results (default: current working dir)'
    )

    ap.add_argument(
        '-v', '--version', action='store_true',
        help='show version and exit')

    return ap
def load_content(path, recur=False):
    """Read the text found at *path* and return it as one string.

    Args:
        path: a file or a directory. A file is read as-is, whatever its
            extension. For a directory, only files whose names end in
            ``.txt`` or ``.text`` are read.
        recur: when *path* is a directory, also scan its sub folders.

    Returns:
        str: the contents of all selected files joined with newlines, or
        ``''`` when nothing was selected.
    """
    text_suffixes = ('.txt', '.text')
    paths = []

    if os.path.isfile(path):
        paths.append(path)
    elif os.path.isdir(path):
        for root, dirs, files in os.walk(path):
            # Sort names so the concatenation order does not depend on the
            # order in which the filesystem happens to list entries.
            paths.extend(
                os.path.join(root, name)
                for name in sorted(files)
                if name.endswith(text_suffixes)
            )
            if not recur:
                # Only the top-level folder was requested.
                break
            # os.walk is top-down here, so sorting in place fixes the order
            # in which sub folders are visited next.
            dirs.sort()
    else:
        # Make a mistyped or missing path visible instead of quietly
        # treating it as empty input.
        logging.warning('Not a file or directory: {}'.format(path))

    texts = []
    for _path in paths:
        logging.info('Loading: {}'.format(_path))
        # Decode explicitly as UTF-8 (the encoding this script declares)
        # rather than the platform default; undecodable bytes are replaced
        # so one bad file does not abort the run.
        with open(_path, encoding='utf-8', errors='replace') as fp:
            texts.append(fp.read())

    return '\n'.join(texts)
def count_word_freq(text, min_chars=2):
    """Count how often each word occurs in *text*, ignoring case.

    A candidate word is a run of at least *min_chars* units, where a unit is
    either one word character (``\\w``) or an uppercase ASCII letter followed
    by a period. Candidates without any letter a-z (for example plain
    numbers) are skipped, and the rest are lower-cased before counting.

    Args:
        text: the text to scan.
        min_chars: minimum number of units per word. Values below 1 are
            treated as 1.

    Returns:
        collections.Counter: lower-cased word -> number of occurrences.
    """
    import re
    from collections import Counter

    # The value is spliced into a ``{n,}`` quantifier. A negative or
    # non-integer number would not form a valid quantifier, and the regex
    # engine would then treat the braces as literal text and match nothing.
    min_chars = max(1, int(min_chars))

    word_re = re.compile(r'(?:\w|[A-Z]\.){' + str(min_chars) + r',}')
    letter_re = re.compile(r'[a-z]', re.I)

    return Counter(
        match.group().lower()
        for match in word_re.finditer(text)
        if letter_re.search(match.group())
    )
def save(data, path):
    """Write frequency data to an Excel workbook.

    Args:
        data: object providing ``most_common()``; each item it returns is
            appended as one row, in the order returned.
        path: destination passed to the workbook's ``save``.
    """
    from openpyxl import Workbook

    workbook = Workbook()
    sheet = workbook.active

    # One row per entry of the ranking, written to the active sheet.
    ranking = data.most_common()
    for entry in ranking:
        sheet.append(entry)

    workbook.save(path)
if __name__ == '__main__':
    # Command-line entry point: for every input path, load its text, count
    # word frequencies and save the result as "<name>.xlsx" in the output dir.
    arg_parser = get_arg_parser()
    args = arg_parser.parse_args()

    if args.version:
        # -v is documented as "show version and exit", so honour it even
        # when input paths are given as well.
        print('{} {} by {}'.format(__title__, __version__, __author__))

    elif args.input_path:
        if not os.path.isdir(args.output):
            # exist_ok guards against the directory appearing between the
            # check above and this call.
            os.makedirs(args.output, exist_ok=True)
            logging.info('Created: {}'.format(args.output))

        for input_path in args.input_path:
            content = load_content(input_path, args.recur)

            logging.info('Counting: {}'.format(input_path))
            freq = count_word_freq(content, args.min_chars)

            # abspath drops a trailing separator and resolves '.', so an
            # input such as "docs/" yields "docs.xlsx" instead of ".xlsx".
            base_name = os.path.splitext(
                os.path.basename(os.path.abspath(input_path)))[0]
            # The filesystem root has no base name; fall back to the title.
            output_path = os.path.join(
                args.output, '{}.xlsx'.format(base_name or __title__))

            logging.info('Saving as: {}'.format(output_path))
            save(freq, output_path)

    else:
        arg_parser.print_help()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment