example_dict = {}  # Create an empty dictionary

# Only need to do this if we haven't already seen 'blue'
if 'blue' not in example_dict:
    example_dict["blue"] = set()  # Map "blue" to an empty set

example_dict["blue"].add("JJ")  # Add "JJ" to the set for "blue"
example_dict["blue"].add("NN")  # Add "NN" to the set for "blue"
# Calling the line above twice still leaves a single "NN" in the set,
# because sets never contain duplicate elements.
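For this pattern, the standard library's collections.defaultdict can replace the membership check, since missing keys are given a fresh empty set automatically. A minimal alternative sketch (not part of the original snippet):

from collections import defaultdict

example_dict = defaultdict(set)  # Missing keys map to a new empty set automatically
example_dict["blue"].add("JJ")   # No 'not in' check needed
example_dict["blue"].add("NN")
print dict(example_dict)         # e.g. {'blue': set(['JJ', 'NN'])}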
from random import sample

def split_data(data, ratio=0.7):
    data = list(data)
    n = len(data)  # Number of samples present
    train_indices = sample(xrange(n), int(n * ratio))         # Randomly select training indices
    test_indices = list(set(xrange(n)) - set(train_indices))  # The remaining indices form the test set
    training_data = [data[i] for i in train_indices]  # Use training indices to select data
    testing_data = [data[i] for i in test_indices]    # Use testing indices to select data
    return training_data, testing_data
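A quick illustrative check that the ratio is respected (the numbers stand in for real documents):

train, test = split_data(range(10), ratio=0.7)
print len(train), len(test)  # prints: 7 3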
# Example call: split the "dvd" reviews (feature_extractor is defined in a later section)
dvd_test, dvd_training = get_training_testing("dvd", feature_extractor, 0.7)
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader

def get_training_testing(category, feature_extractor=None, split=0.7):
    '''
    Helper function. Splits the data evenly across positive and negative, then
    formats it ready for naive Bayes. You can also optionally pass in your custom
    feature extractor (see next section), and a custom split ratio.
    '''
    arcr = AmazonReviewCorpusReader()
    pos_train, pos_test = split_data(arcr.positive().category(category).documents(), split)
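The preview cuts off at this point. A hedged sketch of how the function presumably continues, assuming arcr.negative() mirrors arcr.positive() and that the return order matches the dvd_test, dvd_training call above (the feature-extractor step is elided here):

    neg_train, neg_test = split_data(arcr.negative().category(category).documents(), split)
    # Assumed: combine the halves, test data first, to match the call site above
    return pos_test + neg_test, pos_train + neg_train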
ID  FORM  POS  HEAD  DEPREL
1   the   DT   2     det
2   cat   NN   3     nsubj
3   sat   VBD  0     root
4   on    IN   3     prep
5   the   DT   6     det
6   mat   NN   4     pobj
7   .     .    3     punct
# This code assumes you have the *parsed_sents* and *verb_variants* variables
# from the previous section. *parsed_sents* is a list of ParsedSentence objects.

# Print the parsed sentences to screen
for sentence in parsed_sents:  # *parsed_sents* acquired from the previous section
    print "-----"  # Just a separator
    print sentence

# Each sentence is made up of a list of BasicToken objects.
# Each token has several attributes: id, form (the actual word), pos,
# head (the id of its head token) and deprel (its dependency relation),
# as in the table above.
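A minimal sketch of reading those attributes back out, assuming a ParsedSentence can be iterated over to yield its BasicToken objects (the attribute names follow the table above):

for sentence in parsed_sents:
    for token in sentence:  # assumption: iterating a ParsedSentence yields BasicTokens
        print token.id, token.form, token.pos, token.head, token.deprel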
from sussex_nltk.tag import twitter_tag_batch
from sussex_nltk.corpus_readers import TwitterCorpusReader
from sussex_nltk.parse import dep_parse_sentences_arceager

tcr = TwitterCorpusReader()

# Get some (here 30) un-tokenised sentences from tweets
sents = tcr.sample_raw_sents(30)

# PoS tag the sentences (remember, the twitter tagger tokenises the raw text for you)
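The preview ends before the tagging and parsing calls; a hedged sketch of the presumed continuation (treat the twitter_tag_batch signature as an assumption):

tagged_sents = twitter_tag_batch(sents)                    # assumed: takes the raw sentences directly
parsed_sents = dep_parse_sentences_arceager(tagged_sents)  # parses an iterable of sentences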
from sussex_nltk.corpus_readers import TwitterCorpusReader
from sussex_nltk.parse import dep_parse_sentences_arceager
from nltk.tokenize import word_tokenize
from nltk import pos_tag

tcr = TwitterCorpusReader()

# Get some (here 30) un-tokenised sentences from tweets
sents = tcr.sample_raw_sents(30)
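This variant imports NLTK's own word_tokenize and pos_tag, so the continuation presumably tokenises and tags the raw sentences before parsing; a hedged sketch:

# Tokenise and PoS tag each raw sentence with NLTK's standard tools
tagged_sents = [pos_tag(word_tokenize(sent)) for sent in sents]

# Dependency parse the tagged sentences (assumed usage)
parsed_sents = dep_parse_sentences_arceager(tagged_sents)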
from sussex_nltk.parse import dep_parse_sentences_arceager  # Import the function which parses an iterable of sentences
from sussex_nltk.corpus_readers import AmazonReviewCorpusReader  # Import the corpus reader
from nltk import pos_tag  # Import the PoS tagging function

# Create a list of sentences that contain the verb "to buy",
# by filtering for several of its conjugations
sentences = []
verb_variants = set(["buy", "buys", "bought"])

# You can use any product category (or even all product categories).
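A hedged sketch of the filtering step this comment leads into, assuming the corpus reader exposes tokenised sentences through a .sents() method (the "dvd" category and the cap of 30 sentences are illustrative choices, not from the original):

arcr = AmazonReviewCorpusReader()
for sent in arcr.category("dvd").sents():  # assumption: .sents() yields tokenised sentences
    if verb_variants & set(sent):          # keep sentences containing a conjugation of "to buy"
        sentences.append(sent)
    if len(sentences) >= 30:               # illustrative cap on the sample size
        break

# Tag and parse the filtered sentences
parsed_sents = dep_parse_sentences_arceager([pos_tag(sent) for sent in sentences])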
from sussex_nltk.corpus_readers import ReutersCorpusReader
from sussex_nltk.tag import twitter_tag_batch
from nltk import pos_tag
from nltk.tokenize import word_tokenize
number_of_sentences = 10  # Number of sentences to sample and display

rcr = ReutersCorpusReader()  # Create a corpus reader
sentences = rcr.sample_raw_sents(number_of_sentences)  # Sample some sentences
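The imports suggest this snippet goes on to tag the same Reuters sample with both taggers for comparison; a hedged sketch of that continuation (twitter_tag_batch taking raw sentences is an assumption):

# Tag with NLTK's standard tagger (tokenise first)
nltk_tagged = [pos_tag(word_tokenize(sent)) for sent in sentences]

# Tag with the twitter tagger (assumed: it tokenises the raw text itself)
twitter_tagged = twitter_tag_batch(sentences)

# Display the two taggings side by side
for nltk_sent, twitter_sent in zip(nltk_tagged, twitter_tagged):
    print nltk_sent
    print twitter_sent
    print "-----"  # separator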