Skip to content

Instantly share code, notes, and snippets.

View language-engineering's full-sized avatar

language-engineering

View GitHub Profile
from nltk.classify.api import ClassifierI
import random
# NOTE(review): indentation was lost in this paste -- the class and method
# bodies below should be indented, and classify() is truncated (its body is
# not visible in this excerpt), so the block is not runnable as-is.
class SimpleClassifier(ClassifierI):
# Word-list-driven classifier implementing the NLTK ClassifierI interface.
#   pos -- sequence of words treated as positive-class indicators
#   neg -- sequence of words treated as negative-class indicators
def __init__(self, pos, neg):
self._pos = pos
self._neg = neg
# Classify a document given as a sequence of words.
# NOTE(review): body missing from this excerpt -- presumably it compares
# counts of self._pos vs self._neg matches in `words`; confirm upstream.
def classify(self, words):
from nltk.probability import FreqDist
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
def get_all_words(amazon_reviews):
    """Return a flat list of every word appearing in `amazon_reviews`.

    Each review object must provide a ``words()`` method returning a
    sequence of tokens.

    The original ``reduce(lambda words, review: words + review.words(), ...)``
    rebuilt the accumulator list on every review (quadratic in the total
    number of words) and relied on the Python 2 ``reduce`` builtin, which is
    not available as a builtin on Python 3.  A nested comprehension makes a
    single pass and works on both versions.
    """
    return [word for review in amazon_reviews for word in review.words()]
#A frequency distribution over all words in positive book reviews
# NOTE(review): pos_training_data is defined elsewhere (presumably a list of
# Amazon review objects from AmazonReviewCorpusReader -- confirm); FreqDist
# maps each word to its number of occurrences across those reviews.
pos_book_freqdist = FreqDist(get_all_words(pos_training_data))
# Small hand-picked sentiment lexicons, usable e.g. as word-list features
# for a sentiment classifier.
positive_words = ["splendid","resplendent","splendiferous"]
negative_words = ["mediocre","paltry","inconsequential"]
import nltk
from sussex_nltk.corpus_readers import TwitterCorpusReader
# Build a smoothed unigram model from a Twitter sample and use it to
# construct a spell checker (SpellChecker is defined later in this file).
tcr = TwitterCorpusReader()
tokens = tcr.sample_words_by_sents(25000) #get a sample of tokens
fd = nltk.probability.FreqDist(tokens) #build a frequency distribution over tokens
# Lidstone smoothing with gamma=0.001 assigns a small non-zero probability
# to words never seen in the sample.
probability_distribution = nltk.probability.LidstoneProbDist(fd, 0.001) #build a probability distribution
#Create a spell checker with new probability distribution
s = SpellChecker(probability_distribution)
import gzip, os
#Create an empty set ready to be filled with dictionary terms
urban_dictionary = set()
#Get a file pointer to the compressed file containing urban dictionary terms
# NOTE(review): hard-coded Windows network path; the handle is never closed
# in this excerpt -- a "with gzip.open(...) as f:" block would be safer.
f = gzip.open(os.path.join('t:\\','Departments','Informatics','LanguageEngineering','data','UrbanDictionary','terms.gz'))
#Fill set with urban dictionary entries
# NOTE(review): loop body is missing (truncated paste); presumably each line
# is stripped/decoded and added to urban_dictionary -- confirm upstream.
for line in f:
import os, collections, nltk
# NOTE(review): indentation was lost in this paste and the "else" branch is
# truncated -- only the path to the training data is visible, not the code
# that builds the default model from it.
# Spell checker backed by a probability distribution over word tokens.
class SpellChecker(object):
# probability_distribution -- optional nltk probability distribution; when
# omitted, a default model is (presumably) trained from the Gutenberg
# spelling data referenced below -- confirm in the full source.
def __init__(self, probability_distribution=None):
if probability_distribution:
self.probabilities = probability_distribution
else:
#when working from home, the path below must be changed to reflect the location of the gutenberg data on your home machine
gutenberg_spelling_training = os.path.join('t:\\','Departments','Informatics','LanguageEngineering','data','gutenberg','spelling.txt')
from sussex_nltk.corpus_readers import ReutersCorpusReader
rcr = ReutersCorpusReader() #Create a new reader
# NOTE(review): the loop body is missing (truncated paste) -- as written
# this is a SyntaxError; at minimum a "pass" is required under the for.
for sentence in rcr.sample_raw_sents(10): #get 10 random sentences, where each sentence is a string
# do something with sentence
from sussex_nltk.corpus_readers import ReutersCorpusReader
from sussex_nltk.stats import expected_token_freq
rcr = ReutersCorpusReader()
sample_size = 1000 #The number of sentences in a sample
#Randomly sample 1000 sentences, and get a list of the tokens in those sentences
tokens = rcr.sample_words_by_sents(sample_size)
#Calculate and print the expected token frequency for this one sample of tokens for the token "elephant"
# NOTE(review): the actual call to expected_token_freq is missing here
# (truncated paste) -- the comment above describes code that is not shown.
import csv
# One inner list per spreadsheet row; each element is one column's value.
data = [[2,3,3],[4,3,5],[2,1,4]]
# Dump every row to a CSV file that any spreadsheet program can open.
# (Binary "wb" mode is the Python 2 convention for the csv module.)
with open("file_name.csv","wb") as outputfile:
    csv.writer(outputfile).writerows(data)
from sussex_nltk.stats import expected_sentiment_tokens, normalised_lexical_diversity, prob_short_sents
#Ensure that you correctly pass either a list of tokens, or a list of sentences (see comments below)
# NOTE(review): Python 2 print statements; "tokens" is expected to come from
# an earlier sample_words_by_sents call elsewhere in this file.
#This function requires a list of tokens acquired from the "sample_words_by_sents" function on a corpus reader
print "Expected number of sentiment tokens per 500 tokens: %s" % expected_sentiment_tokens(tokens)
#This function requires a list of tokens acquired from the "sample_words_by_sents" function
print "Normalised lexical diversity: %s" % normalised_lexical_diversity(tokens)