Last active
August 21, 2019 07:01
-
-
Save biranchi2018/6c131632dc0f1e0de3f234cf396aa3b9 to your computer and use it in GitHub Desktop.
NLP - Calculating Bigram Tokens
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Compute word bigrams for a short sample text with NLTK.

The text is split into sentences, each sentence into word tokens, the
per-sentence token lists are flattened into one list, and consecutive token
pairs (bigrams) are built from that flat list.
"""
import itertools
from pprint import pprint

import nltk
from nltk.util import ngrams

# Fetch the tokenizer data used by sent_tokenize / word_tokenize.
# NOTE(review): newer NLTK releases appear to package this data as
# 'punkt_tab'; if sent_tokenize raises LookupError, download that resource
# as well -- confirm against the installed NLTK version.
nltk.download('punkt')

text = "today is 'Nayan's birthday. she loves ice cream. she is also fond of cream cake. we will celebrate her birthday with ice cream cake"

# One list of word tokens per sentence.
sentences = nltk.sent_tokenize(text)
words = [nltk.word_tokenize(sent) for sent in sentences]
print(words)

# Flatten the per-sentence lists into a single token list.
flattened_list = list(itertools.chain.from_iterable(words))
print(flattened_list)
print(len(flattened_list))
# prints 28

# Drop empty tokens (defensive), then pair each token with its successor.
# Because the list is flat, bigrams also span sentence boundaries,
# e.g. ('.', 'she').
tokens = [token for token in flattened_list if token]
output = list(ngrams(tokens, 2))
pprint(output)
# Output:
# [('today', 'is'),
#  ('is', "'Nayan"),
#  ("'Nayan", "'s"),
#  ("'s", 'birthday'),
#  ('birthday', '.'),
#  ('.', 'she'),
#  ('she', 'loves'),
#  ('loves', 'ice'),
#  ('ice', 'cream'),
#  ('cream', '.'),
#  ('.', 'she'),
#  ('she', 'is'),
#  ('is', 'also'),
#  ('also', 'fond'),
#  ('fond', 'of'),
#  ('of', 'cream'),
#  ('cream', 'cake'),
#  ('cake', '.'),
#  ('.', 'we'),
#  ('we', 'will'),
#  ('will', 'celebrate'),
#  ('celebrate', 'her'),
#  ('her', 'birthday'),
#  ('birthday', 'with'),
#  ('with', 'ice'),
#  ('ice', 'cream'),
#  ('cream', 'cake')]

# A list of n tokens yields n - 1 bigrams.
print(len(output))
# prints 27
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment