This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from math import log | |
from sussex_nltk.corpus_readers import WSJCorpusReader | |
def get_entropy_ambiguity(word): | |
# Get the PoS ambiguity of *word* according to its occurrence in WSJ | |
pos_counts = {} # keep track of the number of times *word* | |
# appears with each PoS tag | |
for token, tag in WSJCorpusReader().tagged_words(): #for each token and tag in WSJ | |
if token == word: # if this token is the word we're interested in | |
try: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.util import bigrams, trigrams

# Toy PoS-tagged sentence: a list of (token, tag) pairs.
example_tagged_words = [
    ('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'),
    ('ate', 'VBP'), ('some', 'DT'), ('jam', 'NN'),
]

# Build the 2-grams and 3-grams over the tagged tokens.
bi_grams = bigrams(example_tagged_words)
tri_grams = trigrams(example_tagged_words)

# You can even use "extract_by_pos" and "untag_sequence" on bigrams and trigrams
bigram_regex = [("J+", "N+")]  # Pattern: all adjectives followed by nouns
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sussex_nltk import untag_sequence, extract_by_pos | |
# Regular-expression patterns intended to be matched against PoS tags.
all_tags = r".+"
all_nouns = r"N+"
all_verbs = r"V+"
all_adjectives = r"J+"

# Toy PoS-tagged sentence: a list of (token, tag) pairs.
example_tagged_words = [
    ('The', 'DT'), ('little', 'JJ'), ('badgers', 'NNS'),
    ('ate', 'VBP'), ('some', 'DT'), ('jam', 'NN'),
]
#Decide on some patterns to match |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sussex_nltk import lemmatize_tagged, untag_sequence

# Given your review object, you can get tagged words from it.
# NOTE: `amazon_review` is not defined in this snippet; it must already exist.
tagged_words = amazon_review.tagged_words()

# Lemmatise the words (requires tagged words)
lemma_words = [lemmatize_tagged(tagged_word) for tagged_word in tagged_words]

# Remove the PoS tags in order to use the lemmas as features
features = untag_sequence(lemma_words)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk import pos_tag
from sussex_nltk import lemmatize_tagged
from nltk.tag import untag

# Example list of words
words = ['The', 'badgers', 'were', 'eating', 'some', 'berries', 'and', 'jam']

# PoS tag the words
tagged_words = pos_tag(words)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()  # Create a new stemmer
stemmed = stemmer.stem("complications")  # Example usage, stemming a single word

# You will need to stem all of the words in a review;
# this will require iterating over them with a loop or list comprehension.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import nltk

# Start the chunk parser application that ships with NLTK.
nltk.app.chunkparser()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader

# Get some documents ready for formatting
dvd_pos = AmazonReviewCorpusReader().positive().category("dvd").documents()

# Format the documents ready for NB classifier, but also pass in the feature extractor.
# NOTE: `format_data` and `feature_extractor` are not defined in this snippet;
# they must already be in scope.
dvd_pos_formatted = format_data(dvd_pos, "pos", feature_extractor)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Your function may start out like this, equivalent to what's been used so far.
# It takes a review, and just returns all the words in that review.
def feature_extractor(amazon_review):
    """Return the words of *amazon_review* to be used as features.

    The argument only needs to expose a ``words()`` method; whatever that
    method returns is passed back unchanged.
    """
    # AmazonReview objects have a method *words* which simply returns all
    # the words in the review.
    return amazon_review.words()
# Below follows example functionality that you should include in your feature extractor

# This code shows you how to get lowercase versions of all the words
tokens = ['You', 'know', 'NOTHING,', 'Jon', 'Snow']
# Call form of print: behaves the same on Python 2 and Python 3 for a single argument.
print([token.lower() for token in tokens])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib | |
matplotlib.use("Qt4Agg") # on OSX this needs to be matplotlib.use("MacOSX") | |
import matplotlib.pyplot as plt | |
import numpy as np | |
def plot_results(results, title, xlabels, ylabel="Accuracy"): | |
'''Plot a bar graph of results''' | |
ind = np.arange(len(results)) | |
width = 0.4 | |
plt.bar(ind, results, width, color="#1AADA4") |