Python script that uses NLTK to calculate the average length of English words across tokens and types in the Brown corpus
import nltk
from statistics import mean, stdev, median, mode

nltk.download('brown')

# Tokens: all (word, tag) pairs in the Brown corpus, using the universal tagset.
tokens = nltk.corpus.brown.tagged_words(tagset="universal")

# Types: unique (word, tag) pairs, keeping the order of first occurrence.
types = list(dict.fromkeys(tokens))

# Lengths of tokens / types, ignoring punctuation ('.'), numbers ('NUM'), and
# 'X', which is mostly foreign words (German, French, Latin) but strangely
# also a small number of common English words:
len_tokens = [len(w) for w, t in tokens if t not in ['.', 'NUM', 'X']]
len_types = [len(w) for w, t in types if t not in ['.', 'NUM', 'X']]

# Report the mean (followed by the standard deviation), median, and mode
# of the word lengths:
template = """{0}:
Mean: {1:.2f} {2:.2f}
Median: {3:.2f}
Mode: {4:.2f}"""

print(template.format("Tokens", *[f(len_tokens) for f in [mean, stdev, median, mode]]))
print(template.format("Types", *[f(len_types) for f in [mean, stdev, median, mode]]))
Output: