Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
from numpy import average
from sussex_nltk.corpus_readers import ReutersCorpusReader
rcr = ReutersCorpusReader()
sample_size = 1000 #The number of sentences in a sample
#Randomly sample 1000 sentences, and build a list of the lengths of each sentence
sentence_lengths = [len(sentence) for sentence in rcr.sample_sents(sample_size)]
#Calculate and print the average sentence length
def test_iterate(number_of_iterations): #Function definition, takes one argument
for i in xrange(number_of_iterations): #The argument defines the number of times for which the loop will iterate.
print "Iteration %s" % i #Print which iteration you're on
test_iterate(10) #Call the function with an example iteration number "10"
#Import a corpus reader for the Reuters corpus
from sussex_nltk.corpus_readers import ReutersCorpusReader
rcr = ReutersCorpusReader()
#Decide on the number of sentences that should be in your sample
sample_size = 1000
#See below, you have 2 different ways to get a random sample.
# 1. As a list of tokens, or
#NOTE(review): the second option is cut off in this snippet — presumably
#"2. As a list of sentences"; confirm against the original gist
import os,sys
# add to python path so that the sussex_nltk package can be loaded
sys.path.append(os.path.join("path","to","LanguageEngineering"))
import sussex_nltk
# set the root of the sussex_nltk package so that the corpora can be loaded correctly
sussex_nltk._set_root(os.path.join("path","to","LanguageEngineering"))
from sussex_nltk.corpus_readers import TestCorpusReader #import the corpus reader
tcr = TestCorpusReader() #create a new corpus reader
tokens = tcr.words() #get the tokens of the corpus
for token in tokens: #iterate over the tokens
print token #print each token
def lexical_diversity(text):
    """Return the ratio of total tokens to distinct tokens in *text*.

    A higher value means words are repeated more often; 1.0 means every
    token is unique.  Returns 0.0 for empty input instead of raising
    ZeroDivisionError.  (Body indentation restored — it was lost in the
    pasted snippet.)
    """
    if not text:
        return 0.0
    #the + 0.0 forces floating-point division under Python 2's integer
    #division, in case "from __future__ import division" was not executed
    return len(text) / (len(set(text)) + 0.0)
def hapax_count(freqdist):
    """Return the number of hapax legomena — token types occurring exactly
    once — in the given NLTK FreqDist.

    (Body indentation restored — it was lost in the pasted snippet.)
    """
    return len(freqdist.hapaxes())
def vocabulary_size(freqdist):
    """Return the number of distinct token types in the given frequency
    distribution (any mapping supporting len(), e.g. an NLTK FreqDist).

    (Body indentation restored — it was lost in the pasted snippet.)
    """
    return len(freqdist)
#Report the lexical diversity of the sample
#NOTE(review): `my_text` is not defined in this snippet — presumably the
#Text object built from a corpus sample elsewhere; verify before running
print "Lexical diversity: %s" % lexical_diversity(my_text)
from nltk.probability import FreqDist
from nltk import Text
#An example list of tokens, replace this list of tokens with one gained from each corpus sample
tokens = ["one","ring","to","rule","them","all"]
#First create a Text object from your sample of tokens
my_text = Text(tokens)
#Next create a FreqDist object from the newly created Text object
#NOTE(review): the FreqDist construction itself (e.g. FreqDist(my_text))
#is missing — the snippet appears truncated; confirm against the original
from numpy import average, std #import functions for obtaining average and standard deviation
#Store a list of example vocabulary sizes of 4 samples
#(the values here are placeholders — replace with statistics from your own samples)
stats = [0.4, 0.45, 0.41, 0.38]
#Print the average of the statistics
print "Average: %s" % average(stats)
#Print the standard deviation of the statistics
print "Standard deviation: %s" % std(stats)
from random import sample
#NOTE(review): `corpus_reader` is not defined in this snippet — presumably
#a corpus reader instance such as `rcr` above; verify before running
sentences = corpus_reader.sents()
#Randomly sample 20 sentences from all the sentences in the corpus
#(sample draws without replacement; raises ValueError if fewer than 20 exist)
random_sample = sample(sentences, 20)
from random import sample
def split_data_random(data, ratio=0.8):
'''
Split data into two lists. With ratio=0.8, the first list
will be 80% of the size of the original data, and the
second will be 20%. The items in each list will be
randomly assigned. Ideally "data" is a list.
'''
n = len(data)