Last active
April 9, 2020 17:32
-
-
Save mamonu/221f426fb962016d55eb39dc523ea1bc to your computer and use it in GitHub Desktop.
brown corpus word count
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import string
from collections import Counter
from typing import Iterable

import nltk
import pandas as pd
# nltk.download('brown')
# if nltk hasn't been used before, this will download the brown corpus
from nltk.corpus import brown
# Translation table that deletes every ASCII punctuation character in a
# single C-level pass (replaces the per-character join/filter).
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def count_words(words: Iterable[str]) -> Counter:
    """Count case-insensitive word frequencies, ignoring punctuation.

    Each token is lowercased and has its ASCII punctuation removed.
    Tokens made up only of punctuation (",", ".", "``", ...) are empty
    after stripping; they are dropped so they are not tallied as a
    bogus "" word.

    Args:
        words: Iterable of tokens; each one is passed through ``str()``.

    Returns:
        A ``Counter`` mapping each cleaned, non-empty token to its count.
    """
    cleaned = (
        str(word).lower().translate(_STRIP_PUNCTUATION) for word in words
    )
    return Counter(token for token in cleaned if token)


def build_frequency_table(wordcount: Counter) -> pd.DataFrame:
    """Turn word counts into a table sorted by descending frequency.

    Args:
        wordcount: Mapping of word to number of occurrences.

    Returns:
        A DataFrame with the columns ``word`` and ``count``, highest
        count first. An empty mapping yields an empty table that still
        has both columns.
    """
    # Explicit column names keep the 'count' column present even when
    # there is nothing to count, so the sort below cannot fail on it.
    df = pd.DataFrame(list(wordcount.items()), columns=['word', 'count'])
    # A stable sort makes the ordering of tied counts deterministic.
    return df.sort_values(by=['count'], ascending=False, kind='mergesort')


def main() -> None:
    """Print the 40 most frequent words of the Brown corpus."""
    df = build_frequency_table(count_words(brown.words()))
    print(df.head(40))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment