Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
from nltk.tokenize import word_tokenize
# Tokenize one sentence into a list of word/punctuation tokens.
# NOTE(review): `sentence` is not defined in this snippet — assumed to be a
# str supplied by the surrounding coursework code.
words = word_tokenize(sentence) #Split "sentence" into a list of tokens, then store that list of tokens in a variable called "words"
from nltk.text import Text
# Wrap a token list in an nltk Text object (concordance, counts, plots, ...).
# NOTE(review): `tokens` is also undefined here — presumably a corpus token
# list; it is NOT the `words` variable above. Verify against the original gist.
text = Text(tokens) #create a new Text object, providing a list of tokens from a corpus
def example_function(item):
    """Return *item* with the suffix " - it is known" appended.

    :param item: str to decorate (not modified).
    :return: a new str.
    """
    return item + " - it is known"


# Parenthesised print works on both Python 2 and Python 3; the original bare
# `print` statement is a SyntaxError under Python 3.
print(example_function("The last dragon died long ago"))
import re  # regex module

# Split the example string on whitespace alone: str.split() with no argument
# collapses runs of whitespace and drops leading/trailing spaces.
print(" What is the air-speed velocity of an unladen swallow? ".split())

# Separate punctuation and the contraction from the words: first use re.sub
# to insert a space before each punctuation mark / apostrophe, then split on
# whitespace.  \g<N> in the replacement refers back to the Nth group of the
# pattern.  Raw strings (r"...") keep the backslash literal — the original
# non-raw "\g<1>" only worked by accident and raises a SyntaxWarning on
# modern Python.
print(re.sub(r"([.?!'])", r" \g<1>", "You're using coconuts!").split())
from sussex_nltk.corpus_readers import RCV1CorpusReader  # project-local corpus reader

rcv1cr = RCV1CorpusReader()  # Reuters RCV1 corpus reader

# Iterate over the corpus; each document is the raw text of one news story
# as a single string.  (Parenthesised print: valid on Python 2 and 3; the
# original bare `print` statement is Python-2-only.)
for document in rcv1cr.raw_documents():
    print(document)
from sussex_nltk.tokenize import twitter_tokenize, twitter_tokenize_batch  # CMU tokeniser wrappers

# NOTE(review): `sentences` is assumed to be a list of str defined by the
# surrounding coursework code — it is not created in this snippet.

# Tokenise and print one sentence at a time...
for sentence in sentences:
    print(twitter_tokenize(sentence))

# ...or hand the whole list over in one batch call and print each result.
for tokenised_sentence in twitter_tokenize_batch(sentences):
    print(tokenised_sentence)
# A list of tokens, some of which contain uppercase letters.
tokens = ["The", "cake", "is", "a", "LIE"]

# Print a newly created list with every token lower-cased; the original list
# is left untouched.  (Parenthesised print: valid on Python 2 and 3.)
print([token.lower() for token in tokens])
numbers = ['in', 'the', 'year', '120', 'of', 'the', 'fourth', 'age', ',',
           'after', '120', 'years', 'as', 'king', ',', 'aragorn', 'died',
           'at', 'the', 'age', 'of', '210']

# Print a new list in which every purely-numeric token is replaced by the
# placeholder "NUM" (str.isdigit() is True only when all characters are
# digits).  The original list is unchanged.
print(["NUM" if token.isdigit() else token for token in numbers])
import matplotlib.pyplot as pyplot
def zipf_dist(freqdist,num_of_ranks=50,show_values=True):
'''
Given a frequency distribution object, rank all types
in order of frequency of occurrence (where rank 1 is most
frequent word), and plot the ranks against the frequency
of occurrence. If num_of_ranks=20, then 20 types will
be plotted.
If show_values = True, then display the bar values above them.
from sussex_nltk.corpus_readers import TwitterCorpusReader  # project-local corpus reader

tcr = TwitterCorpusReader()  # Twitter corpus reader

# Draw a sample of tokens from the corpus, seeded by a 5-digit candidate
# number (12345 here — substitute your own candidate number).
tokens = tcr.sample_words(12345)

# Print each sampled token on its own line.  (Parenthesised print: valid on
# Python 2 and 3; the original bare `print` statement is Python-2-only.)
for token in tokens:
    print(token)
from nltk.corpus import stopwords

# English stopword list.  NOTE: this rebinds the name `stopwords` from the
# imported module to a plain list, preserving the original snippet's
# module-level behaviour for any later code that reads it.
stopwords = stopwords.words('english')

# NOTE(review): `tokens` is assumed to be defined by surrounding code (e.g.
# an earlier corpus-sampling snippet) — it is not created here.

# Membership tests use a set copy for O(1) lookups instead of scanning the
# stopword list once per token; keep only purely-alphabetic, non-stopword
# tokens.
_stopword_set = set(stopwords)
filtered_tokens = [w for w in tokens if w.isalpha() and w not in _stopword_set]