Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
from nltk.tokenize import word_tokenize
# Tokenize one sentence into a list of word/punctuation tokens.
# NOTE(review): `sentence` is not defined in this snippet — assumed to be a
# str supplied by the surrounding coursework code.
words = word_tokenize(sentence) #Split "sentence" into a list of tokens, then store that list of tokens in a variable called "words"
from nltk.text import Text
# Wrap a token list in an nltk Text object (concordance, counts, plots, ...).
# NOTE(review): `tokens` is also undefined here — presumably a corpus token
# list; it is NOT the `words` variable above. Verify against the original gist.
text = Text(tokens) #create a new Text object, providing a list of tokens from a corpus
def example_function(item):
    """Return *item* with the suffix " - it is known" appended.

    :param item: str to decorate (not modified).
    :return: a new str.
    """
    return item + " - it is known"


# Parenthesised print works on both Python 2 and Python 3; the original bare
# `print` statement is a SyntaxError under Python 3.
print(example_function("The last dragon died long ago"))
import re  # regex module

# Split the example string on whitespace alone: str.split() with no argument
# collapses runs of whitespace and drops leading/trailing spaces.
print(" What is the air-speed velocity of an unladen swallow? ".split())

# Separate punctuation and the contraction from the words: first use re.sub
# to insert a space before each punctuation mark / apostrophe, then split on
# whitespace.  \g<N> in the replacement refers back to the Nth group of the
# pattern.  Raw strings (r"...") keep the backslash literal — the original
# non-raw "\g<1>" only worked by accident and raises a SyntaxWarning on
# modern Python.
print(re.sub(r"([.?!'])", r" \g<1>", "You're using coconuts!").split())
from sussex_nltk.corpus_readers import RCV1CorpusReader  # project-local corpus reader

rcv1cr = RCV1CorpusReader()  # Reuters RCV1 corpus reader

# Iterate over the corpus; each document is the raw text of one news story
# as a single string.  (Parenthesised print: valid on Python 2 and 3; the
# original bare `print` statement is Python-2-only.)
for document in rcv1cr.raw_documents():
    print(document)
from sussex_nltk.tokenize import twitter_tokenize, twitter_tokenize_batch  # CMU tokeniser wrappers

# NOTE(review): `sentences` is assumed to be a list of str defined by the
# surrounding coursework code — it is not created in this snippet.

# Tokenise and print one sentence at a time...
for sentence in sentences:
    print(twitter_tokenize(sentence))

# ...or hand the whole list over in one batch call and print each result.
for tokenised_sentence in twitter_tokenize_batch(sentences):
    print(tokenised_sentence)
# A list of tokens, some of which contain uppercase letters.
tokens = ["The", "cake", "is", "a", "LIE"]

# Print a newly created list with every token lower-cased; the original list
# is left untouched.  (Parenthesised print: valid on Python 2 and 3.)
print([token.lower() for token in tokens])
numbers = ['in', 'the', 'year', '120', 'of', 'the', 'fourth', 'age', ',',
           'after', '120', 'years', 'as', 'king', ',', 'aragorn', 'died',
           'at', 'the', 'age', 'of', '210']

# Print a new list in which every purely-numeric token is replaced by the
# placeholder "NUM" (str.isdigit() is True only when all characters are
# digits).  The original list is unchanged.
print(["NUM" if token.isdigit() else token for token in numbers])
import matplotlib.pyplot as pyplot
def zipf_dist(freqdist,num_of_ranks=50,show_values=True):
'''
Given a frequency distribution object, rank all types
in order of frequency of occurrence (where rank 1 is most
frequent word), and plot the ranks against the frequency
of occurrence. If num_of_ranks=20, then 20 types will
be plotted.
If show_values = True, then display the bar values above them.
from sussex_nltk.corpus_readers import TwitterCorpusReader  # project-local corpus reader

tcr = TwitterCorpusReader()  # Twitter corpus reader

# Draw a sample of tokens from the corpus, seeded by a 5-digit candidate
# number (12345 here — substitute your own candidate number).
tokens = tcr.sample_words(12345)

# Print each sampled token on its own line.  (Parenthesised print: valid on
# Python 2 and 3; the original bare `print` statement is Python-2-only.)
for token in tokens:
    print(token)
from nltk.corpus import stopwords

# English stopword list.  NOTE: this rebinds the name `stopwords` from the
# imported module to a plain list, preserving the original snippet's
# module-level behaviour for any later code that reads it.
stopwords = stopwords.words('english')

# NOTE(review): `tokens` is assumed to be defined by surrounding code (e.g.
# an earlier corpus-sampling snippet) — it is not created here.

# Membership tests use a set copy for O(1) lookups instead of scanning the
# stopword list once per token; keep only purely-alphabetic, non-stopword
# tokens.
_stopword_set = set(stopwords)
filtered_tokens = [w for w in tokens if w.isalpha() and w not in _stopword_set]