#!/usr/bin/env python
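"""Count word frequencies in a CSV column, per record and overall.

Reads a CSV file, tokenizes the text in one column, stems the tokens,
drops English stop words, and writes a CSV of word counts: one column
per word, one row per input record (keyed by a chosen ID column).
"""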

import sys
import csv
import re
import argparse
import logging

import nltk

ENCODING = 'mac_roman'
CSV_DIALECT = 'excel'
LOG_FORMAT = '%(message)s'

# NLTK stop-word list and stemmer for English (see the setup note below)
stopwords = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.snowball.SnowballStemmer('english')
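
# One-time setup note: the stop-word list lives in the NLTK *data*
# distribution, not in the nltk package itself. If the lookup above
# raises LookupError, fetching the data once should fix it:
#
#   import nltk
#   nltk.download('stopwords')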

# Overall word frequencies (all words across all records)
wordfreq_all = nltk.FreqDist()

# Per-record word frequencies, keyed by the record's key-column value
wordfreq_by_id = {}
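
# Note: FreqDist is a collections.Counter subclass, so looking up a word
# that never occurred returns 0; write_word_frequencies() relies on this.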


def count_words_in_file(file_handle, key_column_name, text_column_name):
    """Tally word frequencies from one CSV file into the module-level
    wordfreq_all and wordfreq_by_id structures.

    key_column_name and text_column_name may each be a column header or
    a zero-based column index.
    """
    csreader = csv.reader(file_handle, dialect=CSV_DIALECT)
    columns = next(csreader)

    def column_index(name):
        # Accept a header name first, then fall back to a numeric index
        if name in columns:
            return columns.index(name)
        try:
            return int(name)
        except (TypeError, ValueError):
            raise ValueError("{!r} is neither a column header in {!r} "
                             "nor a column index".format(name, columns))

    text_col = column_index(text_column_name)
    key_col = column_index(key_column_name)

    logging.info("Analyzing column #{}...".format(text_col))

    # Create an NLTK tokenizer that keeps runs of word characters and
    # drops punctuation
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
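    # For example, tokenizer.tokenize("it's a test.") returns
    # ['it', 's', 'a', 'test']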

    for row in csreader:
        row_id = row[key_col]

        # Get the content to evaluate, converted to lower case
        body = row[text_col].lower()

        # Collapse every run of non-ASCII-alphanumeric characters
        # (stray Unicode included) into '*' so the tokenizer sees
        # clean word boundaries
        body = re.sub('[^0-9a-zA-Z]+', '*', body)

        # Convert content to a word list (tokenize)
        tokens = tokenizer.tokenize(body)

        # Remove single-character tokens (stray letters; punctuation is
        # already gone at this point)
        tokens = [w for w in tokens if len(w) > 1]

        # Remove numbers
        tokens = [w for w in tokens if not w.isnumeric()]

        # Remove English stop-words *before* stemming: stemmed forms
        # such as 'becaus' (from 'because') no longer match the list
        tokens = [w for w in tokens if w not in stopwords]

        # Stemming words sometimes makes matters worse, but it folds
        # inflected forms together
        tokens = [stemmer.stem(w) for w in tokens]

        # Generate the word frequency distribution for this record
        wordfreq_by_id[row_id] = nltk.FreqDist(tokens)

        # Add to the global frequency distribution
        wordfreq_all.update(tokens)
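
# Minimal usage sketch (illustrative data):
#
#   import io
#   count_words_in_file(io.StringIO("id,text\n1,Good movies are good\n"),
#                       'id', 'text')
#   # wordfreq_by_id['1'] now holds a FreqDist of the stemmed tokens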


def write_word_frequencies(file_handle, key_column_name, wordlist):
    cswriter = csv.writer(file_handle, dialect=CSV_DIALECT)

    # Write out the header line
    header_row = [key_column_name] + wordlist
    cswriter.writerow(header_row)

    # Write out grand totals as the first data row
    cswriter.writerow(['Total use'] +
                      [wordfreq_all[word] for word in wordlist])

    # Write out each record's word counts
    for (r_id, wordfreq) in wordfreq_by_id.items():
        row = [r_id] + [wordfreq[word] for word in wordlist]
        cswriter.writerow(row)
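
# Illustrative output shape (header names and counts depend on the input):
#
#   id,film,good,stori
#   Total use,412,390,287
#   101,3,0,1
#   102,0,2,0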


def valid_int(int_str):
    """Return True if int_str parses as an integer."""
    try:
        int(int_str)
        return True
    except ValueError:
        return False


def setup_logging(loglevel):
    """Configure logging; loglevel may be a level name or a number."""
    log_level = 0  # Not set

    if loglevel:
        if valid_int(loglevel):
            log_level = int(loglevel)
        else:
            log_level = getattr(logging, loglevel.upper(), None)
            if not isinstance(log_level, int):
                raise ValueError("Invalid log level: {}".format(loglevel))
    else:
        # A bare `-l` with no argument parses as None, so default to INFO
        log_level = logging.INFO

    logging.basicConfig(format=LOG_FORMAT, level=log_level)
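
# Note: `-l DEBUG` resolves via getattr(logging, 'DEBUG') to 10, while a
# numeric argument such as `-l 10` is used directly; both set the same level.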


def main():
    aparser = argparse.ArgumentParser()
    aparser.add_argument("-l", "--log", nargs='?',
                         dest='loglevel',
                         default='WARNING',
                         help="show diagnostic messages (DEBUG, INFO, WARNING, ERROR, CRITICAL)")
    aparser.add_argument("-k", "--key", nargs=1,
                         dest='key_column', default=[0],
                         help="name or index of the primary key column")
    aparser.add_argument("-t", "--text", nargs=1,
                         dest='text_column', default=[1],
                         help="name or index of the text column to analyze")
    aparser.add_argument("-n", "--number", nargs=1,
                         type=int,
                         default=[50],
                         help="number of word frequencies to output")
    aparser.add_argument("-w", "--words", nargs=1,
                         type=argparse.FileType('r', encoding=ENCODING),
                         help="wordlist to compute frequencies for (one word per line)")
    aparser.add_argument("input", nargs='?',
                         type=argparse.FileType('r', encoding=ENCODING),
                         default=sys.stdin,
                         help="CSV input file")
    aparser.add_argument("output", nargs='?',
                         type=argparse.FileType('w', encoding=ENCODING),
                         default=sys.stdout,
                         help="CSV output file")
    args = aparser.parse_args()
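
    # argparse's nargs=1 wraps each value in a one-element list, hence
    # the [0] indexing below (and on args.words further down)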

    setup_logging(args.loglevel)

    output_words = args.number[0]
    key_column = args.key_column[0]
    text_column = args.text_column[0]

    # Read in and count the input file
    logging.info("Reading {}...".format(args.input.name))
    count_words_in_file(args.input, key_column, text_column)

    # Choose the words to report on: either the supplied keyword list or
    # the most frequent words across all rows
    if args.words:
        logging.info("Using keywords for frequencies...")
        wordlist = args.words[0].read().split()
        # Stem the keywords the same way the input text was stemmed
        wordlist = [stemmer.stem(w) for w in wordlist]
    else:
        logging.info("Computing most frequent words...")
        wordlist = [wc[0] for wc in wordfreq_all.most_common(output_words)]

    logging.info("Writing results to {}...".format(args.output.name))
    write_word_frequencies(args.output, key_column, wordlist)


if __name__ == "__main__":
    main()
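
# Example invocation (file and column names are illustrative):
#
#   python wordfreq.py --key id --text text -n 25 input.csv output.csv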


Hello, when I run your code I get this error:

    TypeError: argument of type 'NoneType' is not iterable

because of:

    if text_column_name in columns:
        text_col = columns.index(text_column_name)

I wonder if I need to define key_column_name and text_column_name in
count_words_in_file from the start? I do have them in main(), though:

    output_words = args.number[0]
    key_column = args.category_id[0]
    text_column = args.document[0]

Thanks,
Sarita