Created
October 15, 2019 08:46
-
-
Save samidarko/0985fe709e9113021fed8b7416c49518 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hand-assemble an NLTK trigram MLE language model from a single tweet:
# tokenize the text, build a vocabulary, count its n-grams, then attach the
# vocabulary and counts to the model.
# Bare expressions below are kept from the original interactive session; they
# only display a value in a REPL and have no effect when run as a script.
from nltk.lm import MLE, NgramCounter, Vocabulary
from nltk.tokenize import TweetTokenizer
from nltk.util import everygrams

twitt = """What's a little DUI on the way to gobble fries with a "friend"? And that playful punch? Total misunderstanding!!"""

# Tokenize once and reuse the result everywhere below.
tknzr = TweetTokenizer()
tokens = tknzr.tokenize(twitt)
tokens

# Inspect the n-grams produced from the token list (no max_len is passed, so
# everygrams runs with its default length limit).
list(everygrams(tokens))

# The vocabulary must exist before it can be updated. Tokens are de-duplicated
# first, so each distinct token is added exactly once.
vocab = Vocabulary()
vocab.update(set(tokens))
vocab
vocab.lookup(['to'])
vocab.lookup(['to', 'a'])

# A one-"sentence" corpus: the single element is the tweet's n-gram iterator,
# which is consumed by the counts.update() call below.
text = [everygrams(tokens)]

# The counter must exist before it can be updated. Each sentence's n-grams are
# passed through the vocabulary lookup before being counted.
counts = NgramCounter()
counts.update(vocab.lookup(sent) for sent in text)
counts
counts.unigrams

# Order-3 (trigram) model; replace its counts and vocabulary with the ones
# built by hand above.
lm = MLE(3)
lm.counts = counts
lm.vocab = vocab
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment