example_dict = {}  # Creating an empty dictionary

# Only need to do this if we haven't already seen 'blue'
if 'blue' not in example_dict:
    example_dict["blue"] = set()  # Mapping "blue" to an empty set

example_dict["blue"].add("JJ")  # Adding "JJ" to "blue"'s set
example_dict["blue"].add("NN")  # Adding "NN" to the same set
# If you call the above line twice, only one "NN" will be in the set, because sets don't store duplicate elements.
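For reference, a quick check of the result (not part of the original snippet; the printed set representation may list its elements in either order):

print example_dict                   # {'blue': set(['JJ', 'NN'])}
print "NN" in example_dict["blue"]   # True, a fast membership test on the set
print len(example_dict["blue"])      # 2, one entry per distinct tag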
from random import sample

def split_data(data, ratio=0.7):
    data = list(data)
    n = len(data)                                             # Find out the number of samples present
    train_indices = sample(xrange(n), int(n * ratio))         # Randomly select training indices
    test_indices = list(set(xrange(n)) - set(train_indices))  # The remaining indices become the testing indices
    training_data = [data[i] for i in train_indices]          # Use training indices to select data
    testing_data = [data[i] for i in test_indices]            # Use testing indices to select data
    return training_data, testing_data                        # Return the two splits
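A quick usage check of split_data (the exact items in each split vary because the sampling is random):

train, test = split_data(range(10), 0.7)  # split ten items 70/30
print len(train), len(test)               # 7 3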
dvd_test, dvd_training = get_training_testing("dvd", feature_extractor, 0.7)
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader

def get_training_testing(category, feature_extractor=None, split=0.7):
    '''
    Helper function. Splits data evenly across positive and negative, and then formats it
    ready for Naive Bayes. You can also optionally pass in your custom feature extractor
    (see next section), and a custom split ratio.
    '''
    arcr = AmazonReviewCorpusReader()
    pos_train, pos_test = split_data(arcr.positive().category(category).documents(), split)
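    # The excerpt is cut off here. A possible continuation is sketched below, under two
    # assumptions not shown in the excerpt: the reader exposes a negative() method
    # mirroring positive(), and "ready for Naive Bayes" means a list of
    # (feature-dictionary, label) pairs as NLTK's NaiveBayesClassifier expects.
    neg_train, neg_test = split_data(arcr.negative().category(category).documents(), split)

    if feature_extractor is None:
        # Hypothetical default for this sketch: a simple bag-of-words feature dictionary
        feature_extractor = lambda doc: dict((word, True) for word in doc)

    train = [(feature_extractor(doc), label)
             for docs, label in [(pos_train, "pos"), (neg_train, "neg")]
             for doc in docs]
    test = [(feature_extractor(doc), label)
            for docs, label in [(pos_test, "pos"), (neg_test, "neg")]
            for doc in docs]
    return test, train  # matches the earlier call: dvd_test, dvd_training = get_training_testing(...)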
ID  FORM  POS  HEAD  DEPREL
1   the   DT   2     det
2   cat   NN   3     nsubj
3   sat   VBD  0     root
4   on    IN   3     prep
5   the   DT   6     det
6   mat   NN   4     pobj
7   .     .    3     punct
# This code assumes you have the *parsed_sents* and *verb_variants* variables
# from the previous section. *parsed_sents* is a list of ParsedSentence objects.

# Print the parsed sentences to screen
for sentence in parsed_sents:  # *parsed_sents* acquired from the previous section
    print "-----"  # Just a separator
    print sentence

# Each sentence is made up of a list of BasicToken objects.
# Each token has several attributes: id, form (the actual word), pos,
# head (the id of its head token) and deprel (its dependency relation to that head).
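To look at the individual token attributes, something like the following sketch could be used; it assumes each ParsedSentence can be iterated over to yield its BasicToken objects, which is not shown in the excerpt above:

for sentence in parsed_sents:
    for token in sentence:                       # assumption: iterating yields BasicToken objects
        if token.form.lower() in verb_variants:  # pick out the "buy" tokens we filtered for
            print token.id, token.form, token.pos, token.head, token.deprel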
from sussex_nltk.tag import twitter_tag_batch
from sussex_nltk.corpus_readers import TwitterCorpusReader
from sussex_nltk.parse import dep_parse_sentences_arceager

tcr = TwitterCorpusReader()

# Get some (here 30) un-tokenised sentences from tweets
sents = tcr.sample_raw_sents(30)

# PoS tag the sentences (remember the twitter tagger
# works on raw, un-tokenised sentences)
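The snippet is cut off above; a sketch of how it presumably continues, assuming twitter_tag_batch takes the list of raw sentences and returns tagged sentences in the form the parser expects (both assumptions, not shown in the excerpt):

tagged_sents = twitter_tag_batch(sents)                     # tag the raw tweet sentences
parsed_sents = dep_parse_sentences_arceager(tagged_sents)   # dependency-parse the tagged sentences
for sentence in parsed_sents:
    print sentence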
from sussex_nltk.corpus_readers import TwitterCorpusReader
from sussex_nltk.parse import dep_parse_sentences_arceager
from nltk.tokenize import word_tokenize
from nltk import pos_tag

tcr = TwitterCorpusReader()

# Get some (here 30) un-tokenised sentences from tweets
sents = tcr.sample_raw_sents(30)
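This variant is also cut off; given the imports, a sketch of the likely continuation using the standard NLTK tokeniser and tagger (the exact input format dep_parse_sentences_arceager expects is an assumption):

# Tokenise and PoS tag each raw sentence with the standard NLTK tools
tagged_sents = [pos_tag(word_tokenize(sent)) for sent in sents]
# Dependency-parse the tagged sentences
parsed_sents = dep_parse_sentences_arceager(tagged_sents)
for sentence in parsed_sents:
    print sentence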
from sussex_nltk.parse import dep_parse_sentences_arceager  # Import the function which parses an iterable of sentences
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader  # Import the corpus reader
from nltk import pos_tag  # Import the PoS tagging function

# Create a list of review sentences that contain the verb "to buy",
# by filtering for several of its conjugations
sentences = []
verb_variants = set(["buy", "buys", "bought"])

# You can use any product category (or even all product categories);
# one possible continuation is sketched below.
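A minimal sketch of how the filtering and parsing might proceed from here. The example sentences, the extra word_tokenize import, and the shape of the input passed to dep_parse_sentences_arceager are illustrative assumptions, not part of the original:

from nltk.tokenize import word_tokenize  # extra import used only by this sketch

# Hypothetical stand-in for raw sentences drawn from the chosen Amazon category
raw_sents = ["I bought this DVD last week and loved it",
             "Do not buy this product"]

for raw_sent in raw_sents:
    tokens = word_tokenize(raw_sent)       # tokenise the raw sentence
    if verb_variants & set(tokens):        # keep sentences containing a form of "to buy"
        sentences.append(pos_tag(tokens))  # PoS tag it and collect it for parsing

# Dependency-parse the collected, tagged sentences
parsed_sents = dep_parse_sentences_arceager(sentences)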
from sussex_nltk.corpus_readers import ReutersCorpusReader
from sussex_nltk.tag import twitter_tag_batch
from nltk import pos_tag
from nltk.tokenize import word_tokenize

number_of_sentences = 10  # Number of sentences to sample and display
rcr = ReutersCorpusReader()  # Create a corpus reader
sentences = rcr.sample_raw_sents(number_of_sentences)  # Sample some sentences
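This final snippet is also truncated; the imports suggest it goes on to tag the sampled Reuters sentences with both the twitter tagger and the standard NLTK tagger for comparison. A sketch under that assumption (the twitter_tag_batch signature is assumed, as above):

# Tag the raw sentences with the twitter tagger (assumed to accept raw sentences)
twitter_tagged = twitter_tag_batch(sentences)

# Tokenise and tag the same sentences with the standard NLTK tagger
nltk_tagged = [pos_tag(word_tokenize(sent)) for sent in sentences]

# Display the two taggings side by side for comparison
for twitter_sent, nltk_sent in zip(twitter_tagged, nltk_tagged):
    print "-----"
    print twitter_sent
    print nltk_sent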