Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
from numpy import average
from sussex_nltk.corpus_readers import ReutersCorpusReader
rcr = ReutersCorpusReader()
sample_size = 1000 #The number of sentences in a sample
#Randomly sample 1000 sentences, and build a list of the lengths of each sentence
sentence_lengths = [len(sentence) for sentence in rcr.sample_sents(sample_size)]
#Calculate and print the average sentence length
def test_iterate(number_of_iterations): #Function definition, takes one argument
for i in xrange(number_of_iterations): #The argument defines the number of times for which the loop will iterate.
print "Iteration %s" % i #Print which iteration you're on
test_iterate(10) #Call the function with an example iteration number "10"
#Import a corpus reader for the Reuters corpus
from sussex_nltk.corpus_readers import ReutersCorpusReader
rcr = ReutersCorpusReader()
#Decide on the number of sentences that should be in your sample
sample_size = 1000
#See below, you have 2 different ways to get a random sample.
# 1. As a list of tokens, or
#NOTE(review): the second option is cut off in this snippet — presumably
#"2. As a list of sentences"; confirm against the original gist
import os,sys
# add to python path so that the sussex_nltk package can be loaded
sys.path.append(os.path.join("path","to","LanguageEngineering"))
import sussex_nltk
# set the root of the sussex_nltk package so that the corpora can be loaded correctly
sussex_nltk._set_root(os.path.join("path","to","LanguageEngineering"))
from sussex_nltk.corpus_readers import TestCorpusReader #import the corpus reader
tcr = TestCorpusReader() #create a new corpus reader
tokens = tcr.words() #get the tokens of the corpus
for token in tokens: #iterate over the tokens
print token #print each token
def lexical_diversity(text):
    """Return the ratio of total tokens to distinct tokens in *text*.

    A higher value means words are repeated more often; 1.0 means every
    token is unique.  Returns 0.0 for empty input instead of raising
    ZeroDivisionError.  (Body indentation restored — it was lost in the
    pasted snippet.)
    """
    if not text:
        return 0.0
    #the + 0.0 forces floating-point division under Python 2's integer
    #division, in case "from __future__ import division" was not executed
    return len(text) / (len(set(text)) + 0.0)
def hapax_count(freqdist):
    """Return the number of hapax legomena — token types occurring exactly
    once — in the given NLTK FreqDist.

    (Body indentation restored — it was lost in the pasted snippet.)
    """
    return len(freqdist.hapaxes())
def vocabulary_size(freqdist):
    """Return the number of distinct token types in the given frequency
    distribution (any mapping supporting len(), e.g. an NLTK FreqDist).

    (Body indentation restored — it was lost in the pasted snippet.)
    """
    return len(freqdist)
#Report the lexical diversity of the sample
#NOTE(review): `my_text` is not defined in this snippet — presumably the
#Text object built from a corpus sample elsewhere; verify before running
print "Lexical diversity: %s" % lexical_diversity(my_text)
from nltk.probability import FreqDist
from nltk import Text
#An example list of tokens, replace this list of tokens with one gained from each corpus sample
tokens = ["one","ring","to","rule","them","all"]
#First create a Text object from your sample of tokens
my_text = Text(tokens)
#Next create a FreqDist object from the newly created Text object
#NOTE(review): the FreqDist construction itself (e.g. FreqDist(my_text))
#is missing — the snippet appears truncated; confirm against the original
from numpy import average, std #import functions for obtaining average and standard deviation
#Store a list of example vocabulary sizes of 4 samples
#(the values here are placeholders — replace with statistics from your own samples)
stats = [0.4, 0.45, 0.41, 0.38]
#Print the average of the statistics
print "Average: %s" % average(stats)
#Print the standard deviation of the statistics
print "Standard deviation: %s" % std(stats)
from random import sample
#NOTE(review): `corpus_reader` is not defined in this snippet — presumably
#a corpus reader instance such as `rcr` above; verify before running
sentences = corpus_reader.sents()
#Randomly sample 20 sentences from all the sentences in the corpus
#(sample draws without replacement; raises ValueError if fewer than 20 exist)
random_sample = sample(sentences, 20)
from random import sample
def split_data_random(data, ratio=0.8):
'''
Split data into two lists. With ratio=0.8, the first list
will be 80% of the size of the original data, and the
second will be 20%. The items in each list will be
randomly assigned. Ideally "data" is a list.
'''
n = len(data)