Last active
May 20, 2021 09:14
-
-
Save CristhianBoujon/c719ba2287a630a6d3821d37a9608ac8 to your computer and use it in GitHub Desktop.
List the words in a vocabulary according to occurrence in a text corpus, using Scikit-Learn
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_top_n_words(corpus, n=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.

    Args:
        corpus: Iterable of raw text documents (strings). It is handed to the
            vectorizer in a single ``fit_transform`` call, so a one-shot
            iterable such as a generator is counted correctly.
        n: Maximum number of ``(word, count)`` pairs to return. ``None``
            (the default) returns the whole vocabulary.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Counts are plain Python ints. The sort key is the count only, so words
        with equal counts keep the order in which ``vocabulary_`` yields them.

    Example (the relative order of equally frequent words may differ):

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vec = CountVectorizer()
    # Learn the vocabulary and build the document-term matrix in one pass.
    # Calling fit() and then transform() would iterate the corpus twice, which
    # silently yields all-zero counts when the corpus is a generator.
    bag_of_words = vec.fit_transform(corpus)
    # Column-wise sum: one total per vocabulary entry, shaped (1, n_words).
    sum_words = bag_of_words.sum(axis=0)
    # vocabulary_ maps each word to its column index in the matrix.
    words_freq = [
        (word, int(sum_words[0, idx]))
        for word, idx in vec.vocabulary_.items()
    ]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment