#!/usr/bin/env python
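"""Count word frequencies in a CSV column, per record and overall.

Reads a CSV file, tokenizes the text in one column, stems the tokens,
drops English stop words, and writes a CSV of word counts: one column
per word, one row per input record (keyed by a chosen ID column).
"""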

import sys
import csv
import re
import argparse
import logging

import nltk

ENCODING = 'mac_roman'
CSV_DIALECT = 'excel'
LOG_FORMAT = '%(message)s'

# NLTK stop-word list and stemmer for English (see the setup note below)
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.snowball.SnowballStemmer('english')
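
# One-time setup note: the stop-word list lives in the NLTK *data*
# distribution, not in the nltk package itself. If the lookup above
# raises LookupError, fetching the data once should fix it:
#
#   import nltk
#   nltk.download('stopwords')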

# Overall word frequencies (all words across all records)
wordfreq_all = nltk.FreqDist()

# Per-record word frequencies, keyed by the record's key-column value
wordfreq_by_id = {}
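
# Note: FreqDist is a collections.Counter subclass, so looking up a word
# that never occurred returns 0; write_word_frequencies() relies on this.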


def count_words_in_file(file_handle, key_column_name, text_column_name):
    """Tally word frequencies from one CSV file into the module-level
    wordfreq_all and wordfreq_by_id structures.

    key_column_name and text_column_name may each be a column header or
    a zero-based column index.
    """
    csreader = csv.reader(file_handle, dialect=CSV_DIALECT)
    columns = next(csreader)

    def column_index(name):
        # Accept a header name first, then fall back to a numeric index
        if name in columns:
            return columns.index(name)
        try:
            return int(name)
        except (TypeError, ValueError):
            raise ValueError("{!r} is neither a column header in {!r} "
                             "nor a column index".format(name, columns))

    text_col = column_index(text_column_name)
    key_col = column_index(key_column_name)

    logging.info("Analyzing column #{}...".format(text_col))

    # Create an NLTK tokenizer that keeps runs of word characters and
    # drops punctuation
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
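    # For example, tokenizer.tokenize("it's a test.") returns
    # ['it', 's', 'a', 'test']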

    for row in csreader:
        row_id = row[key_col]

        # Get the content to evaluate, converted to lower case
        body = row[text_col].lower()

        # Collapse every run of non-ASCII-alphanumeric characters
        # (stray Unicode included) into '*' so the tokenizer sees
        # clean word boundaries
        body = re.sub('[^0-9a-zA-Z]+', '*', body)

        # Convert content to a word list (tokenize)
        tokens = tokenizer.tokenize(body)

        # Remove single-character tokens (stray letters; punctuation is
        # already gone at this point)
        tokens = [w for w in tokens if len(w) > 1]

        # Remove numbers
        tokens = [w for w in tokens if not w.isnumeric()]

        # Remove English stop-words *before* stemming: stemmed forms
        # such as 'becaus' (from 'because') no longer match the list
        tokens = [w for w in tokens if w not in stopwords]

        # Stemming words sometimes makes matters worse, but it folds
        # inflected forms together
        tokens = [stemmer.stem(w) for w in tokens]

        # Generate the word frequency distribution for this record
        wordfreq_by_id[row_id] = nltk.FreqDist(tokens)

        # Add to the global frequency distribution
        wordfreq_all.update(tokens)
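
# Minimal usage sketch (illustrative data):
#
#   import io
#   count_words_in_file(io.StringIO("id,text\n1,Good movies are good\n"),
#                       'id', 'text')
#   # wordfreq_by_id['1'] now holds a FreqDist of the stemmed tokens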


def write_word_frequencies(file_handle, key_column_name, wordlist):
    cswriter = csv.writer(file_handle, dialect=CSV_DIALECT)

    # Write out the header line
    header_row = [key_column_name] + wordlist
    cswriter.writerow(header_row)

    # Write out grand totals as the first data row
    cswriter.writerow(['Total use'] +
                      [wordfreq_all[word] for word in wordlist])

    # Write out each record's word counts
    for (r_id, wordfreq) in wordfreq_by_id.items():
        row = [r_id] + [wordfreq[word] for word in wordlist]
        cswriter.writerow(row)
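
# Illustrative output shape (header names and counts depend on the input):
#
#   id,film,good,stori
#   Total use,412,390,287
#   101,3,0,1
#   102,0,2,0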


def valid_int(int_str):
    """Return True if int_str parses as an integer."""
    try:
        int(int_str)
        return True
    except ValueError:
        return False


def setup_logging(loglevel):
    """Configure logging; loglevel may be a level name or a number."""
    log_level = 0  # Not set

    if loglevel:
        if valid_int(loglevel):
            log_level = int(loglevel)
        else:
            log_level = getattr(logging, loglevel.upper(), None)
            if not isinstance(log_level, int):
                raise ValueError("Invalid log level: {}".format(loglevel))
    else:
        # A bare `-l` with no argument parses as None, so default to INFO
        log_level = logging.INFO

    logging.basicConfig(format=LOG_FORMAT, level=log_level)
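
# Note: `-l DEBUG` resolves via getattr(logging, 'DEBUG') to 10, while a
# numeric argument such as `-l 10` is used directly; both set the same level.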


def main():
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-l", "--log", nargs='?',
                         dest='loglevel',
                         default='WARNING',
                         help="show diagnostic messages (DEBUG, INFO, WARNING, ERROR, CRITICAL)")
    aparser.add_argument("-k", "--key", nargs=1,
                         dest='key_column', default=[0],
                         help="name or index of the primary key column")
    aparser.add_argument("-t", "--text", nargs=1,
                         dest='text_column', default=[1],
                         help="name or index of the text column to analyze")
    aparser.add_argument("-n", "--number", nargs=1,
                         type=int,
                         default=[50],
                         help="number of word frequencies to output")
    aparser.add_argument("-w", "--words", nargs=1,
                         type=argparse.FileType('r', encoding=ENCODING),
                         help="wordlist to compute frequencies for (one word per line)")
    aparser.add_argument("input", nargs='?',
                         type=argparse.FileType('r', encoding=ENCODING),
                         default=sys.stdin,
                         help="CSV input file")
    aparser.add_argument("output", nargs='?',
                         type=argparse.FileType('w', encoding=ENCODING),
                         default=sys.stdout,
                         help="CSV output file")
    args = aparser.parse_args()
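
    # argparse's nargs=1 wraps each value in a one-element list, hence
    # the [0] indexing below (and on args.words further down)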

    setup_logging(args.loglevel)

    output_words = args.number[0]
    key_column = args.key_column[0]
    text_column = args.text_column[0]

    # Read in and count the input file
    logging.info("Reading {}...".format(args.input.name))
    count_words_in_file(args.input, key_column, text_column)

    # Choose the words to report on: either the supplied keyword list or
    # the most frequent words across all rows
    if args.words:
        logging.info("Using keywords for frequencies...")
        wordlist = args.words[0].read().split()
        # Stem the keywords the same way the input text was stemmed
        wordlist = [stemmer.stem(w) for w in wordlist]
    else:
        logging.info("Computing most frequent words...")
        wordlist = [wc[0] for wc in wordfreq_all.most_common(output_words)]

    logging.info("Writing results to {}...".format(args.output.name))
    write_word_frequencies(args.output, key_column, wordlist)


if __name__ == "__main__":
    main()
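
# Example invocation (file and column names are illustrative):
#
#   python wordfreq.py --key id --text text -n 25 input.csv output.csv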


Hello, when I run your code I get this error:

    TypeError: argument of type 'NoneType' is not iterable

because of:

    if text_column_name in columns:
        text_col = columns.index(text_column_name)

I wonder if I need to define key_column_name and text_column_name in
count_words_in_file from the start? I do have them in main(), though:

    output_words = args.number[0]
    key_column = args.category_id[0]
    text_column = args.document[0]

Thanks,
Sarita