Python script that uses NLTK to calculate the average length of English words across tokens and types in the Brown corpus
import nltk
from statistics import mean, stdev, median, mode

nltk.download('brown')

# Tokens: all (word, tag) pairs in the Brown corpus, using the universal tagset.
tokens = nltk.corpus.brown.tagged_words(tagset="universal")

# Types: unique (word, tag) pairs, keeping the order of first occurrence.
types = list(dict.fromkeys(tokens))

# Lengths of tokens / types, ignoring punctuation ('.'), numbers ('NUM'), and
# 'X', which is mostly foreign words (German, French, Latin) but strangely
# also a small number of common English words:
len_tokens = [len(w) for w, t in tokens if t not in ['.', 'NUM', 'X']]
len_types = [len(w) for w, t in types if t not in ['.', 'NUM', 'X']]

# Report the mean (followed by the standard deviation), median, and mode
# of the word lengths:
template = """{0}:
Mean: {1:.2f} {2:.2f}
Median: {3:.2f}
Mode: {4:.2f}"""

print(template.format("Tokens", *[f(len_tokens) for f in [mean, stdev, median, mode]]))
print(template.format("Types", *[f(len_types) for f in [mean, stdev, median, mode]]))
Output: