from math import log
from sussex_nltk.corpus_readers import WSJCorpusReader

def get_entropy_ambiguity(word):
    # Get the PoS ambiguity of *word* according to its occurrences in the WSJ
    pos_counts = {}  # number of times *word* appears with each PoS tag
    for token, tag in WSJCorpusReader().tagged_words():  # for each token and tag in the WSJ
        if token == word:  # if this token is the word we're interested in
            try:
                pos_counts[tag] += 1  # seen this tag for *word* before
            except KeyError:
                pos_counts[tag] = 1   # first occurrence of this tag for *word*
    # Entropy of the tag distribution: the more evenly *word* is spread over
    # different tags, the more ambiguous it is (higher entropy)
    total = sum(pos_counts.values())
    return -sum((count / float(total)) * log(count / float(total), 2)
                for count in pos_counts.values())
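# Example usage (the particular words are only illustrative -- the actual values
# depend on the WSJ counts): a word that occurs with many different tags should
# come out with higher entropy than a word that is almost always tagged the same way.
print get_entropy_ambiguity("set")
print get_entropy_ambiguity("the")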
from nltk.util import bigrams, trigrams
example_tagged_words = [('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'), ('ate', 'VBP'), ('some', 'DT'), ('jam', 'NN')]
bi_grams = bigrams(example_tagged_words)
tri_grams = trigrams(example_tagged_words)
#You can even use "extract_by_pos" and "untag_sequence" on bigrams and trigrams
bigram_regex = [("J+","N+")] #Pattern: all adjectives followed by nouns
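# Quick illustration of what the n-gram helpers return: each bigram is a pair of
# adjacent (token, tag) tuples, each trigram a triple. Wrapping in list() covers
# NLTK versions where bigrams/trigrams return generators rather than lists.
print list(bigrams(example_tagged_words))[0]   # (('The', 'DT'), ('little', 'JJ'))
print list(trigrams(example_tagged_words))[0]  # (('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'))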
from sussex_nltk import untag_sequence, extract_by_pos
all_tags = r".+"
all_nouns = r"N+"
all_verbs = r"V+"
all_adjectives = r"J+"
example_tagged_words = [('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'), ('ate', 'VBP'), ('some', 'DT'), ('jam', 'NN')]
#Decide on some patterns to match
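# The patterns above are just regular expressions over PoS tags. As a rough sketch of
# what matching one pattern against the tags looks like (extract_by_pos presumably
# wraps something similar, but its exact signature isn't shown in this snippet):
import re
nouns_only = [(token, tag) for token, tag in example_tagged_words
              if re.match(all_nouns, tag)]
print nouns_only  # [('badgers', 'NNS'), ('jam', 'NN')]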
from sussex_nltk import lemmatize_tagged, untag_sequence
#Given your review object, you can get tagged words from it
tagged_words = amazon_review.tagged_words()
#Lemmatise the words (this requires tagged words)
lemma_words = [lemmatize_tagged(tagged_word) for tagged_word in tagged_words]
#Remove the PoS tags in order to use the lemmas as features
features = untag_sequence(lemma_words)
from nltk import pos_tag
from sussex_nltk import lemmatize_tagged
from nltk.tag import untag
#Example list of words
words = ['The', 'badgers', 'were', 'eating', 'some', 'berries', 'and', 'jam']
#PoS tag the words
tagged_words = pos_tag(words)
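# The two extra imports above suggest the natural next steps (a sketch, following the
# same pattern as the review-lemmatisation snippet earlier, and assuming lemmatize_tagged
# returns (lemma, tag) pairs as that snippet implies): lemmatise the tagged words,
# then strip the tags off again with nltk.tag.untag.
lemmatised = [lemmatize_tagged(tagged_word) for tagged_word in tagged_words]
lemmas = untag(lemmatised)  # roughly ['The', 'badger', 'be', 'eat', ...], depending on the lemmatiser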
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer() #Create a new stemmer
stemmed = stemmer.stem("complications") #Example usage, stemming a single word
#You will need to stem all of the words in a review,
#this will require iterating over them with a loop or list comprehension
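# A minimal sketch of stemming every word in a review with a list comprehension
# (assuming, as elsewhere in these snippets, that amazon_review is an AmazonReview
# object whose words() method returns the review's tokens):
stemmed_words = [stemmer.stem(word) for word in amazon_review.words()]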
# Launch NLTK's interactive chunk-parser application, useful for experimenting
# with chunking grammars
import nltk
nltk.app.chunkparser()
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
#Get some documents ready for formatting
dvd_pos = AmazonReviewCorpusReader().positive().category("dvd").documents()
#Format the documents for the NB classifier, this time also passing in a feature extractor
dvd_pos_formatted = format_data(dvd_pos, "pos", feature_extractor)
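# format_data itself comes from the lab code, so the following is only a sketch of the
# behaviour the call above assumes (hypothetical helper name format_data_sketch): run
# the feature extractor over each document and pair the resulting feature dict with the
# label, which is the (featureset, label) shape nltk.NaiveBayesClassifier.train expects.
def format_data_sketch(documents, label, feature_extractor):
    return [(dict((feature, True) for feature in feature_extractor(doc)), label)
            for doc in documents]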
#Your function may start out like this, equivalent to what's been used so far:
#it takes a review and just returns all the words in that review
def feature_extractor(amazon_review):
    return amazon_review.words()  # AmazonReview objects have a *words* method which simply returns all the words in the review
# Below follows example functionality that you should include in your feature extractor
#This code shows you how to get lowercase versions of all the words
tokens = ['You', 'know', 'NOTHING,', 'Jon', 'Snow']
print [token.lower() for token in tokens]
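# Putting the two ideas together, a feature extractor that lowercases every word in
# the review (a sketch; you would extend it with stemming, lemmatisation, n-grams, etc.):
def lowercase_feature_extractor(amazon_review):
    return [word.lower() for word in amazon_review.words()]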
import matplotlib
matplotlib.use("Qt4Agg") # on OSX this needs to be matplotlib.use("MacOSX")
import matplotlib.pyplot as plt
import numpy as np
def plot_results(results, title, xlabels, ylabel="Accuracy"):
    '''Plot a bar graph of results'''
    ind = np.arange(len(results))  # one bar position per result
    width = 0.4                    # width of each bar
    plt.bar(ind, results, width, color="#1AADA4")
    plt.xticks(ind + width / 2, xlabels)  # centre a label under each bar
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
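# Example call (the accuracies and labels here are purely illustrative):
plot_results([0.74, 0.81], "Classifier accuracy", ["All words", "Lowercased words"])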