This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from numpy import average | |
| from sussex_nltk.corpus_readers import ReutersCorpusReader | |
rcr = ReutersCorpusReader()
sample_size = 1000  # The number of sentences in a sample
# Randomly sample sample_size sentences, and build a list of the length of
# each one (len(sentence); presumably its token count -- confirm against the
# corpus reader).
sentence_lengths = [len(sentence) for sentence in rcr.sample_sents(sample_size)]
# Calculate and print the average sentence length
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def test_iterate(number_of_iterations):
    """Print one "Iteration <i>" line for each i from 0 up to the argument.

    number_of_iterations: how many times the loop body runs. Zero or a
    negative number prints nothing.
    """
    # range() and print(...) with a single argument behave the same on
    # Python 2 and Python 3, unlike xrange and the print statement, which
    # only exist on Python 2.
    for i in range(number_of_iterations):
        print("Iteration %s" % i)  # Print which iteration you're on


test_iterate(10)  # Call the function with an example iteration number "10"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #Import a corpus reader | |
| from sussex_nltk.corpus_readers import ReutersCorpusReader | |
rcr = ReutersCorpusReader()
# Decide on the number of sentences that should be in your sample
sample_size = 1000
# See below: you have 2 different ways to get a random sample.
# 1. As a list of tokens, or
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os,sys | |
# Directory that holds the sussex_nltk package and its corpora. The
# components below look like placeholders -- replace them with the real
# location on your machine.
language_engineering_dir = os.path.join("path", "to", "LanguageEngineering")
# add to python path so that the sussex_nltk package can be loaded
sys.path.append(language_engineering_dir)
import sussex_nltk  # must come after the sys.path change above
# set the root of the sussex_nltk package so that the corpora can be loaded correctly
sussex_nltk._set_root(language_engineering_dir)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sussex_nltk.corpus_readers import TestCorpusReader #import the corpus reader | |
tcr = TestCorpusReader()  # create a new corpus reader
tokens = tcr.words()  # get the tokens of the corpus
for token in tokens:  # iterate over the tokens
    # print(...) with a single argument behaves identically on Python 2
    # and Python 3; the bare print statement is Python 2 only.
    print(token)  # print each token
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def lexical_diversity(text):
    """Return the average number of occurrences per distinct token in text.

    Computed as len(text) / len(set(text)): the total number of tokens
    divided by the number of distinct tokens.

    text: a sized collection of hashable tokens (e.g. a list of strings).
    Returns a float. An empty text has no distinct tokens, so the ratio is
    undefined; 0.0 is returned instead of raising ZeroDivisionError.
    """
    distinct_count = len(set(text))
    if not distinct_count:
        return 0.0
    # float() ensures floating point division, in case you haven't executed:
    # from __future__ import division
    return len(text) / float(distinct_count)
def hapax_count(freqdist):
    """Return the number of items produced by freqdist.hapaxes().

    freqdist: any object with a hapaxes() method whose result supports
    len() -- presumably an nltk FreqDist, where hapaxes are the samples
    seen exactly once; confirm against callers.
    """
    hapaxes = freqdist.hapaxes()
    return len(hapaxes)
def vocabulary_size(freqdist):
    """Return the number of entries in freqdist, as reported by len().

    freqdist: any sized collection -- presumably an nltk FreqDist keyed by
    token, in which case the result is the number of distinct tokens;
    confirm against callers.
    """
    return len(freqdist)
# my_text is not defined in this snippet; it must already exist (e.g. an
# nltk Text built from a token list) before this line runs.
# print(...) with a single argument works on both Python 2 and Python 3.
print("Lexical diversity: %s" % lexical_diversity(my_text))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from nltk.probability import FreqDist | |
| from nltk import Text | |
# An example list of tokens; replace this list of tokens with one gained
# from each corpus sample
tokens = ["one", "ring", "to", "rule", "them", "all"]
# First create a Text object from your sample of tokens
my_text = Text(tokens)
# Next create a FreqDist object from the newly created Text object
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from numpy import average, std #import functions for obtaining average and standard deviation | |
# Store a list of example statistics, one value for each of 4 samples
stats = [0.4, 0.45, 0.41, 0.38]
# Print the average of the statistics.
# print(...) with a single argument works on both Python 2 and Python 3.
print("Average: %s" % average(stats))
# Print the standard deviation of the statistics.
# NOTE: numpy's std defaults to ddof=0, i.e. the population standard
# deviation, not the sample (n - 1) estimate.
print("Standard deviation: %s" % std(stats))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from random import sample | |
# corpus_reader is not defined in this snippet; create a corpus reader
# before running these lines.
sentences = corpus_reader.sents()
# Randomly sample 20 sentences (without replacement) from all the sentences
# in the corpus. sample() raises ValueError if the corpus has fewer than 20.
random_sample = sample(sentences, 20)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from random import sample | |
| def split_data_random(data, ratio=0.8): | |
| ''' | |
| Split data into two lists. With ratio=0.8, the first list | |
| will be 80% of the size of the original data, and the | |
| second will be 20%. The items in each list will be | |
| randomly assigned. Ideally "data" is a list. | |
| ''' | |
| n = len(data) |