This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from nltk.classify.api import ClassifierI | |
| import random | |
| class SimpleClassifier(ClassifierI): | |
    def __init__(self, pos, neg):
        """Store the positive and negative word lists used by this classifier.

        pos: collection of words treated as positive indicators.
        neg: collection of words treated as negative indicators.
        """
        self._pos = pos
        self._neg = neg
| def classify(self, words): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from nltk.probability import FreqDist | |
| from sussex_nltk.corpus_readers import AmazonReviewCorpusReader | |
# Helper function. Given a list of reviews, return a list of all the words
# in those reviews.
def get_all_words(amazon_reviews):
    """Return a flat list of every word in the given reviews.

    amazon_reviews: iterable of review objects exposing a words() method.

    Replaces the original reduce()-based version, which relied on the
    Python 2 reduce builtin and built the result by repeated list
    concatenation (quadratic in the total number of words).
    """
    all_words = []
    for review in amazon_reviews:
        all_words.extend(review.words())
    return all_words
# A frequency distribution over all words in positive book reviews
# NOTE(review): assumes pos_training_data (a collection of positive book
# review objects) is defined earlier in the file — confirm against the
# surrounding code.
pos_book_freqdist = FreqDist(get_all_words(pos_training_data))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Small hand-picked sentiment lexicons: words whose presence is taken as a
# positive or negative signal by the word-list classifier.
positive_words = [
    "splendid",
    "resplendent",
    "splendiferous",
]
negative_words = [
    "mediocre",
    "paltry",
    "inconsequential",
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import nltk | |
| from sussex_nltk.corpus_readers import TwitterCorpusReader | |
# Build a smoothed unigram language model from a 25,000-token Twitter sample
# and use it as the probability model for a spell checker.
tcr = TwitterCorpusReader()
tokens = tcr.sample_words_by_sents(25000)  # get a sample of tokens
fd = nltk.probability.FreqDist(tokens)  # build a frequency distribution over tokens
# Lidstone smoothing with gamma=0.001 gives unseen tokens a small non-zero
# probability instead of zero.
probability_distribution = nltk.probability.LidstoneProbDist(fd, 0.001)  # build a probability distribution
# Create a spell checker with new probability distribution
# NOTE(review): assumes the SpellChecker class (defined elsewhere in this
# file) is already in scope at this point.
s = SpellChecker(probability_distribution)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import gzip, os | |
# Create an empty set ready to be filled with dictionary terms
urban_dictionary = set()
# Get a file pointer to the compressed file containing urban dictionary terms
# NOTE(review): the path is site-specific (a mapped T: drive); the file is
# opened in gzip's default (binary) mode and is not closed in the visible
# code — consider a `with` block when revising.
f = gzip.open(os.path.join('t:\\','Departments','Informatics','LanguageEngineering','data','UrbanDictionary','terms.gz'))
| #Fill set with urban dictionary entries | |
| for line in f: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os, collections, nltk | |
| class SpellChecker(object): | |
| def __init__(self, probability_distribution=None): | |
| if probability_distribution: | |
| self.probabilities = probability_distribution | |
| else: | |
| #when working form home, the path below must be changed to reflect the location of the gutenberg data on your home machine | |
| gutenberg_spelling_training = os.path.join('t:\\','Departments','Informatics','LanguageEngineering','data','gutenberg','spelling.txt') |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sussex_nltk.corpus_readers import ReutersCorpusReader | |
rcr = ReutersCorpusReader()  # Create a new reader
# Get 10 random sentences, where each sentence is a raw string.
for sentence in rcr.sample_raw_sents(10):
    # do something with sentence
    # A comment alone is not a valid loop body in Python; `pass` keeps this
    # placeholder snippet syntactically valid until real work is added.
    pass
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sussex_nltk.corpus_readers import ReutersCorpusReader | |
| from sussex_nltk.stats import expected_token_freq | |
rcr = ReutersCorpusReader()
sample_size = 1000  # The number of sentences in a sample
# Randomly sample 1000 sentences, and get a list of the tokens in those sentences
tokens = rcr.sample_words_by_sents(sample_size)
# Calculate and print the expected token frequency for this one sample of
# tokens for the token "elephant"
# NOTE(review): the call to expected_token_freq itself appears to have been
# truncated from this excerpt — confirm against the original file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import csv | |
# Provide a list, where every element in the list corresponds to a row of the spreadsheet
# Every element in the list is another list, whose elements correspond to the columns of that row
data = [[2,3,3],[4,3,5],[2,1,4]]
# Write the data to a CSV file, which a spreadsheet program can open
# NOTE(review): "wb" is the documented mode for csv writers under Python 2,
# which this file targets (see the print statements elsewhere). If ported
# to Python 3, use open("file_name.csv", "w", newline="") instead.
with open("file_name.csv","wb") as outputfile:
    writer = csv.writer(outputfile)
    writer.writerows(data)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sussex_nltk.stats import expected_sentiment_tokens, normalised_lexical_diversity, prob_short_sents | |
# Ensure that you correctly pass either a list of tokens, or a list of sentences (see comments below)
# NOTE(review): assumes `tokens` was produced earlier by a corpus reader's
# sample_words_by_sents — confirm. Python 2 print-statement syntax.
# This function requires a list of tokens acquired from the "sample_words_by_sents" function on a corpus reader
print "Expected number of sentiment tokens per 500 tokens: %s" % expected_sentiment_tokens(tokens)
# This function requires a list of tokens acquired from the "sample_words_by_sents" function
print "Normalised lexical diversity: %s" % normalised_lexical_diversity(tokens)