from math import log
from sussex_nltk.corpus_readers import WSJCorpusReader

def get_entropy_ambiguity(word):
    # Get the PoS ambiguity of *word* according to its occurrences in the WSJ
    pos_counts = {}  # number of times *word* appears with each PoS tag
    for token, tag in WSJCorpusReader().tagged_words():  # for each token and tag in the WSJ
        if token == word:  # if this token is the word we're interested in
            try:
                pos_counts[tag] += 1  # seen this tag for *word* before
            except KeyError:
                pos_counts[tag] = 1   # first occurrence of this tag for *word*
    # Entropy of the tag distribution: the more evenly *word* is spread over
    # different tags, the more ambiguous it is (higher entropy)
    total = sum(pos_counts.values())
    return -sum((count / float(total)) * log(count / float(total), 2)
                for count in pos_counts.values())
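# Example usage (the particular words are only illustrative -- the actual values
# depend on the WSJ counts): a word that occurs with many different tags should
# come out with higher entropy than a word that is almost always tagged the same way.
print get_entropy_ambiguity("set")
print get_entropy_ambiguity("the")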
from nltk.util import bigrams, trigrams
example_tagged_words = [('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'), ('ate', 'VBP'), ('some', 'DT'), ('jam', 'NN')]
bi_grams = bigrams(example_tagged_words)
tri_grams = trigrams(example_tagged_words)
#You can even use "extract_by_pos" and "untag_sequence" on bigrams and trigrams
bigram_regex = [("J+","N+")] #Pattern: all adjectives followed by nouns
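# Quick illustration of what the n-gram helpers return: each bigram is a pair of
# adjacent (token, tag) tuples, each trigram a triple. Wrapping in list() covers
# NLTK versions where bigrams/trigrams return generators rather than lists.
print list(bigrams(example_tagged_words))[0]   # (('The', 'DT'), ('little', 'JJ'))
print list(trigrams(example_tagged_words))[0]  # (('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'))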
from sussex_nltk import untag_sequence, extract_by_pos
all_tags = r".+"
all_nouns = r"N+"
all_verbs = r"V+"
all_adjectives = r"J+"
example_tagged_words = [('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'), ('ate', 'VBP'), ('some', 'DT'), ('jam', 'NN')]
#Decide on some patterns to match
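# The patterns above are just regular expressions over PoS tags. As a rough sketch of
# what matching one pattern against the tags looks like (extract_by_pos presumably
# wraps something similar, but its exact signature isn't shown in this snippet):
import re
nouns_only = [(token, tag) for token, tag in example_tagged_words
              if re.match(all_nouns, tag)]
print nouns_only  # [('badgers', 'NNS'), ('jam', 'NN')]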
from sussex_nltk import lemmatize_tagged, untag_sequence
#Given your review object, you can get tagged words from it
tagged_words = amazon_review.tagged_words()
#Lemmatise the words (this requires tagged words)
lemma_words = [lemmatize_tagged(tagged_word) for tagged_word in tagged_words]
#Remove the PoS tags in order to use the lemmas as features
features = untag_sequence(lemma_words)
from nltk import pos_tag
from sussex_nltk import lemmatize_tagged
from nltk.tag import untag
#Example list of words
words = ['The', 'badgers', 'were', 'eating', 'some', 'berries', 'and', 'jam']
#PoS tag the words
tagged_words = pos_tag(words)
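# The two extra imports above suggest the natural next steps (a sketch, following the
# same pattern as the review-lemmatisation snippet earlier, and assuming lemmatize_tagged
# returns (lemma, tag) pairs as that snippet implies): lemmatise the tagged words,
# then strip the tags off again with nltk.tag.untag.
lemmatised = [lemmatize_tagged(tagged_word) for tagged_word in tagged_words]
lemmas = untag(lemmatised)  # roughly ['The', 'badger', 'be', 'eat', ...], depending on the lemmatiser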
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer() #Create a new stemmer
stemmed = stemmer.stem("complications") #Example usage, stemming a single word
#You will need to stem all of the words in a review,
#this will require iterating over them with a loop or list comprehension
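# A minimal sketch of stemming every word in a review with a list comprehension
# (assuming, as elsewhere in these snippets, that amazon_review is an AmazonReview
# object whose words() method returns the review's tokens):
stemmed_words = [stemmer.stem(word) for word in amazon_review.words()]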
# Launch NLTK's interactive chunk-parser application, useful for experimenting
# with chunking grammars
import nltk
nltk.app.chunkparser()
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader
#Get some documents ready for formatting
dvd_pos = AmazonReviewCorpusReader().positive().category("dvd").documents()
#Format the documents for the NB classifier, this time also passing in a feature extractor
dvd_pos_formatted = format_data(dvd_pos, "pos", feature_extractor)
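# format_data itself comes from the lab code, so the following is only a sketch of the
# behaviour the call above assumes (hypothetical helper name format_data_sketch): run
# the feature extractor over each document and pair the resulting feature dict with the
# label, which is the (featureset, label) shape nltk.NaiveBayesClassifier.train expects.
def format_data_sketch(documents, label, feature_extractor):
    return [(dict((feature, True) for feature in feature_extractor(doc)), label)
            for doc in documents]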
#Your function may start out like this, equivalent to what's been used so far:
#it takes a review and just returns all the words in that review
def feature_extractor(amazon_review):
    return amazon_review.words()  # AmazonReview objects have a *words* method which simply returns all the words in the review
# Below follows example functionality that you should include in your feature extractor
#This code shows you how to get lowercase versions of all the words
tokens = ['You', 'know', 'NOTHING,', 'Jon', 'Snow']
print [token.lower() for token in tokens]
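# Putting the two ideas together, a feature extractor that lowercases every word in
# the review (a sketch; you would extend it with stemming, lemmatisation, n-grams, etc.):
def lowercase_feature_extractor(amazon_review):
    return [word.lower() for word in amazon_review.words()]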
import matplotlib
matplotlib.use("Qt4Agg") # on OSX this needs to be matplotlib.use("MacOSX")
import matplotlib.pyplot as plt
import numpy as np
def plot_results(results, title, xlabels, ylabel="Accuracy"):
    '''Plot a bar graph of results'''
    ind = np.arange(len(results))  # one bar position per result
    width = 0.4                    # width of each bar
    plt.bar(ind, results, width, color="#1AADA4")
    plt.xticks(ind + width / 2, xlabels)  # centre a label under each bar
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()
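# Example call (the accuracies and labels here are purely illustrative):
plot_results([0.74, 0.81], "Classifier accuracy", ["All words", "Lowercased words"])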