Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
from nltk.classify.api import ClassifierI
import random
# NOTE(review): indentation was lost in this paste -- the class and method
# bodies below should be indented, and classify() is truncated (its body is
# not visible in this excerpt), so the block is not runnable as-is.
class SimpleClassifier(ClassifierI):
# Word-list-driven classifier implementing the NLTK ClassifierI interface.
#   pos -- sequence of words treated as positive-class indicators
#   neg -- sequence of words treated as negative-class indicators
def __init__(self, pos, neg):
self._pos = pos
self._neg = neg
# Classify a document given as a sequence of words.
# NOTE(review): body missing from this excerpt -- presumably it compares
# counts of self._pos vs self._neg matches in `words`; confirm upstream.
def classify(self, words):
from nltk.probability import FreqDist
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def get_all_words(amazon_reviews):
    """Return a flat list of every word appearing in `amazon_reviews`.

    Each review object must provide a ``words()`` method returning a
    sequence of tokens.

    The original ``reduce(lambda words, review: words + review.words(), ...)``
    rebuilt the accumulator list on every review (quadratic in the total
    number of words) and relied on the Python 2 ``reduce`` builtin, which is
    not available as a builtin on Python 3.  A nested comprehension makes a
    single pass and works on both versions.
    """
    return [word for review in amazon_reviews for word in review.words()]
#A frequency distribution over all words in positive book reviews
# NOTE(review): pos_training_data is defined elsewhere (presumably a list of
# Amazon review objects from AmazonReviewCorpusReader -- confirm); FreqDist
# maps each word to its number of occurrences across those reviews.
pos_book_freqdist = FreqDist(get_all_words(pos_training_data))
# Small hand-picked sentiment lexicons, usable e.g. as word-list features
# for a sentiment classifier.
positive_words = ["splendid","resplendent","splendiferous"]
negative_words = ["mediocre","paltry","inconsequential"]
import nltk
from sussex_nltk.corpus_readers import TwitterCorpusReader
# Build a smoothed unigram model from a Twitter sample and use it to
# construct a spell checker (SpellChecker is defined later in this file).
tcr = TwitterCorpusReader()
tokens = tcr.sample_words_by_sents(25000) #get a sample of tokens
fd = nltk.probability.FreqDist(tokens) #build a frequency distribution over tokens
# Lidstone smoothing with gamma=0.001 assigns a small non-zero probability
# to words never seen in the sample.
probability_distribution = nltk.probability.LidstoneProbDist(fd, 0.001) #build a probability distribution
#Create a spell checker with new probability distribution
s = SpellChecker(probability_distribution)
import gzip, os
#Create an empty set ready to be filled with dictionary terms
urban_dictionary = set()
#Get a file pointer to the compressed file containing urban dictionary terms
# NOTE(review): hard-coded Windows network path; the handle is never closed
# in this excerpt -- a "with gzip.open(...) as f:" block would be safer.
f = gzip.open(os.path.join('t:\\','Departments','Informatics','LanguageEngineering','data','UrbanDictionary','terms.gz'))
#Fill set with urban dictionary entries
# NOTE(review): loop body is missing (truncated paste); presumably each line
# is stripped/decoded and added to urban_dictionary -- confirm upstream.
for line in f:
import os, collections, nltk
# NOTE(review): indentation was lost in this paste and the "else" branch is
# truncated -- only the path to the training data is visible, not the code
# that builds the default model from it.
# Spell checker backed by a probability distribution over word tokens.
class SpellChecker(object):
# probability_distribution -- optional nltk probability distribution; when
# omitted, a default model is (presumably) trained from the Gutenberg
# spelling data referenced below -- confirm in the full source.
def __init__(self, probability_distribution=None):
if probability_distribution:
self.probabilities = probability_distribution
else:
#when working from home, the path below must be changed to reflect the location of the gutenberg data on your home machine
gutenberg_spelling_training = os.path.join('t:\\','Departments','Informatics','LanguageEngineering','data','gutenberg','spelling.txt')
from sussex_nltk.corpus_readers import ReutersCorpusReader
rcr = ReutersCorpusReader() #Create a new reader
# NOTE(review): the loop body is missing (truncated paste) -- as written
# this is a SyntaxError; at minimum a "pass" is required under the for.
for sentence in rcr.sample_raw_sents(10): #get 10 random sentences, where each sentence is a string
# do something with sentence
from sussex_nltk.corpus_readers import ReutersCorpusReader
from sussex_nltk.stats import expected_token_freq
rcr = ReutersCorpusReader()
sample_size = 1000 #The number of sentences in a sample
#Randomly sample 1000 sentences, and get a list of the tokens in those sentences
tokens = rcr.sample_words_by_sents(sample_size)
#Calculate and print the expected token frequency for this one sample of tokens for the token "elephant"
# NOTE(review): the actual call to expected_token_freq is missing here
# (truncated paste) -- the comment above describes code that is not shown.
import csv
# One inner list per spreadsheet row; each element is one column's value.
data = [[2,3,3],[4,3,5],[2,1,4]]
# Dump every row to a CSV file that any spreadsheet program can open.
# (Binary "wb" mode is the Python 2 convention for the csv module.)
with open("file_name.csv","wb") as outputfile:
    csv.writer(outputfile).writerows(data)
from sussex_nltk.stats import expected_sentiment_tokens, normalised_lexical_diversity, prob_short_sents
#Ensure that you correctly pass either a list of tokens, or a list of sentences (see comments below)
# NOTE(review): Python 2 print statements; "tokens" is expected to come from
# an earlier sample_words_by_sents call elsewhere in this file.
#This function requires a list of tokens acquired from the "sample_words_by_sents" function on a corpus reader
print "Expected number of sentiment tokens per 500 tokens: %s" % expected_sentiment_tokens(tokens)
#This function requires a list of tokens acquired from the "sample_words_by_sents" function
print "Normalised lexical diversity: %s" % normalised_lexical_diversity(tokens)