example_dict = {}  # Create an empty dictionary

# Only need to do this if we haven't already seen 'blue'
if 'blue' not in example_dict:
    example_dict["blue"] = set()  # Map "blue" to an empty set

example_dict["blue"].add("JJ")  # Add "JJ" to the set for "blue"
example_dict["blue"].add("NN")  # Add "NN" to the set for "blue"
# Calling the line above twice still leaves a single "NN" in the set,
# because sets never contain duplicate elements.
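For this pattern, the standard library's collections.defaultdict can replace the membership check, since missing keys are given a fresh empty set automatically. A minimal alternative sketch (not part of the original snippet):

from collections import defaultdict

example_dict = defaultdict(set)  # Missing keys map to a new empty set automatically
example_dict["blue"].add("JJ")   # No 'not in' check needed
example_dict["blue"].add("NN")
print dict(example_dict)         # e.g. {'blue': set(['JJ', 'NN'])}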
from random import sample

def split_data(data, ratio=0.7):
    data = list(data)
    n = len(data)  # Number of samples present
    train_indices = sample(xrange(n), int(n * ratio))         # Randomly select training indices
    test_indices = list(set(xrange(n)) - set(train_indices))  # The remaining indices form the test set
    training_data = [data[i] for i in train_indices]  # Use training indices to select data
    testing_data = [data[i] for i in test_indices]    # Use testing indices to select data
    return training_data, testing_data
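A quick illustrative check that the ratio is respected (the numbers stand in for real documents):

train, test = split_data(range(10), ratio=0.7)
print len(train), len(test)  # prints: 7 3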
# Example call: split the "dvd" reviews (feature_extractor is defined in a later section)
dvd_test, dvd_training = get_training_testing("dvd", feature_extractor, 0.7)
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader

def get_training_testing(category, feature_extractor=None, split=0.7):
    '''
    Helper function. Splits the data evenly across positive and negative, then
    formats it ready for naive Bayes. You can also optionally pass in your custom
    feature extractor (see next section), and a custom split ratio.
    '''
    arcr = AmazonReviewCorpusReader()
    pos_train, pos_test = split_data(arcr.positive().category(category).documents(), split)
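The preview cuts off at this point. A hedged sketch of how the function presumably continues, assuming arcr.negative() mirrors arcr.positive() and that the return order matches the dvd_test, dvd_training call above (the feature-extractor step is elided here):

    neg_train, neg_test = split_data(arcr.negative().category(category).documents(), split)
    # Assumed: combine the halves, test data first, to match the call site above
    return pos_test + neg_test, pos_train + neg_train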
ID  FORM  POS  HEAD  DEPREL
1   the   DT   2     det
2   cat   NN   3     nsubj
3   sat   VBD  0     root
4   on    IN   3     prep
5   the   DT   6     det
6   mat   NN   4     pobj
7   .     .    3     punct
# This code assumes you have the *parsed_sents* and *verb_variants* variables
# from the previous section. *parsed_sents* is a list of ParsedSentence objects.

# Print the parsed sentences to screen
for sentence in parsed_sents:  # *parsed_sents* acquired from the previous section
    print "-----"  # Just a separator
    print sentence

# Each sentence is made up of a list of BasicToken objects.
# Each token has several attributes: id, form (the actual word), pos,
# head (the id of its head token) and deprel (its dependency relation),
# as in the table above.
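A minimal sketch of reading those attributes back out, assuming a ParsedSentence can be iterated over to yield its BasicToken objects (the attribute names follow the table above):

for sentence in parsed_sents:
    for token in sentence:  # assumption: iterating a ParsedSentence yields BasicTokens
        print token.id, token.form, token.pos, token.head, token.deprel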
from sussex_nltk.tag import twitter_tag_batch
from sussex_nltk.corpus_readers import TwitterCorpusReader
from sussex_nltk.parse import dep_parse_sentences_arceager

tcr = TwitterCorpusReader()

# Get some (here 30) un-tokenised sentences from tweets
sents = tcr.sample_raw_sents(30)

# PoS tag the sentences (remember, the twitter tagger tokenises the raw text for you)
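The preview ends before the tagging and parsing calls; a hedged sketch of the presumed continuation (treat the twitter_tag_batch signature as an assumption):

tagged_sents = twitter_tag_batch(sents)                    # assumed: takes the raw sentences directly
parsed_sents = dep_parse_sentences_arceager(tagged_sents)  # parses an iterable of sentences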
from sussex_nltk.corpus_readers import TwitterCorpusReader
from sussex_nltk.parse import dep_parse_sentences_arceager
from nltk.tokenize import word_tokenize
from nltk import pos_tag

tcr = TwitterCorpusReader()

# Get some (here 30) un-tokenised sentences from tweets
sents = tcr.sample_raw_sents(30)
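This variant imports NLTK's own word_tokenize and pos_tag, so the continuation presumably tokenises and tags the raw sentences before parsing; a hedged sketch:

# Tokenise and PoS tag each raw sentence with NLTK's standard tools
tagged_sents = [pos_tag(word_tokenize(sent)) for sent in sents]

# Dependency parse the tagged sentences (assumed usage)
parsed_sents = dep_parse_sentences_arceager(tagged_sents)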
from sussex_nltk.parse import dep_parse_sentences_arceager  # Import the function which parses an iterable of sentences
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader  # Import the corpus reader
from nltk import pos_tag  # Import the PoS tagging function

# Create a list of sentences that contain the verb "to buy",
# by filtering for several of its conjugations
sentences = []
verb_variants = set(["buy", "buys", "bought"])

# You can use any product category (or even all product categories).
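A hedged sketch of the filtering step this comment leads into, assuming the corpus reader exposes tokenised sentences through a .sents() method (the "dvd" category and the cap of 30 sentences are illustrative choices, not from the original):

arcr = AmazonReviewCorpusReader()
for sent in arcr.category("dvd").sents():  # assumption: .sents() yields tokenised sentences
    if verb_variants & set(sent):          # keep sentences containing a conjugation of "to buy"
        sentences.append(sent)
    if len(sentences) >= 30:               # illustrative cap on the sample size
        break

# Tag and parse the filtered sentences
parsed_sents = dep_parse_sentences_arceager([pos_tag(sent) for sent in sentences])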
from sussex_nltk.corpus_readers import ReutersCorpusReader
from sussex_nltk.tag import twitter_tag_batch
from nltk import pos_tag
from nltk.tokenize import word_tokenize
number_of_sentences = 10  # Number of sentences to sample and display

rcr = ReutersCorpusReader()  # Create a corpus reader
sentences = rcr.sample_raw_sents(number_of_sentences)  # Sample some sentences
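The imports suggest this snippet goes on to tag the same Reuters sample with both taggers for comparison; a hedged sketch of that continuation (twitter_tag_batch taking raw sentences is an assumption):

# Tag with NLTK's standard tagger (tokenise first)
nltk_tagged = [pos_tag(word_tokenize(sent)) for sent in sentences]

# Tag with the twitter tagger (assumed: it tokenises the raw text itself)
twitter_tagged = twitter_tag_batch(sentences)

# Display the two taggings side by side
for nltk_sent, twitter_sent in zip(nltk_tagged, twitter_tagged):
    print nltk_sent
    print twitter_sent
    print "-----"  # separator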