jeremykarn · September 25, 2013 15:17
diff --git a/nltk.pig b/nltk.pig
 -- Register the Python UDF file for streaming (CPython) execution under the
 -- nltk_udfs namespace. The path must be wrapped in plain ASCII single quotes;
 -- typographic quotes are not valid Pig string delimiters.
 REGISTER '<python_file>' USING streaming_python AS nltk_udfs;

 -- Load the tweets as JSON, keeping only the tweet text and the place name.
 tweets =  LOAD 's3n://twitter-gardenhose-mortar/tweets'
          USING org.apache.pig.piggybank.storage.JsonLoader(
                   'text: chararray, place:tuple(name:chararray)');

 -- Group the tweets by place name and use a CPython UDF to find the top 5 bigrams
 -- for each of these places. sample_size records how many tweets were counted
 -- for each place.
 bigrams_by_place = FOREACH (GROUP tweets BY place.name) GENERATE
                        group AS place:chararray,
                        nltk_udfs.top_5_bigrams(tweets.text),
                        COUNT(tweets) AS sample_size;

 -- Keep the 100 places with the largest sample_size.
 top_100_places = LIMIT (ORDER bigrams_by_place BY sample_size DESC) 100;

 STORE top_100_places INTO '<your_output_path>';
diff --git a/nltk.py b/nltk.py
 from pig_util import outputSchema
 import nltk

 @outputSchema("top_five:bag{t:(bigram:chararray)}")
 def top_5_bigrams(tweets):
     """Return the five highest-scoring bigrams found in a bag of tweets.

     Each tweet's text is split on whitespace, bigrams are collected across
     all tweets, and candidates are ranked by likelihood ratio.

     Args:
         tweets: iterable of tuples whose first element is the tweet text.
             NOTE(review): a null text is assumed to arrive as None --
             confirm against the Pig streaming_python documentation.

     Returns:
         A list of at most five 1-tuples, each holding one bigram formatted
         as "first second", matching the declared output schema.
     """
     # Build the tokenizer once instead of once per tweet.
     tokenizer = nltk.tokenize.WhitespaceTokenizer()

     # Skip tuples without text: tokenizing None would raise and fail the
     # whole group instead of just ignoring the unusable tweet.
     tokenized_tweets = [tokenizer.tokenize(t[0]) for t in tweets if t[0] is not None]

     bgm = nltk.collocations.BigramAssocMeasures()
     finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_tweets)
     top_5 = finder.nbest(bgm.likelihood_ratio, 5)

     # Wrap each bigram string in a 1-tuple so the result has the bag-of-tuples shape.
     return [("%s %s" % (s[0], s[1]),) for s in top_5]
	-- Make the Python UDFs available as nltk_udfs, run through streaming_python.
	-- ASCII single quotes are required around the path; curly quotes will not parse.
	REGISTER '<python_file>' USING streaming_python AS nltk_udfs;

	-- Read the tweet JSON, projecting the text and the name of the place.
	tweets = LOAD 's3n://twitter-gardenhose-mortar/tweets'
	         USING org.apache.pig.piggybank.storage.JsonLoader(
	                  'text: chararray, place:tuple(name:chararray)');

	-- Group the tweets by place name and use a CPython UDF to find the top 5 bigrams
	-- for each of these places, alongside the number of tweets counted per place.
	bigrams_by_place = FOREACH (GROUP tweets BY place.name) GENERATE
	                       group AS place:chararray,
	                       nltk_udfs.top_5_bigrams(tweets.text),
	                       COUNT(tweets) AS sample_size;

	-- Order places by descending sample_size and keep the first 100.
	top_100_places = LIMIT (ORDER bigrams_by_place BY sample_size DESC) 100;

	STORE top_100_places INTO '<your_output_path>';
	from pig_util import outputSchema
	import nltk

	@outputSchema("top_five:bag{t:(bigram:chararray)}")
	def top_5_bigrams(tweets):
	    """Find the five strongest bigram collocations in a group of tweets.

	    The text of every tweet is tokenized on whitespace; bigrams gathered
	    from all tweets are scored with the likelihood-ratio measure and the
	    five best are returned.

	    Args:
	        tweets: iterable of tuples; element 0 of each tuple is the text.
	            NOTE(review): presumably a missing text shows up as None --
	            verify against the caller.

	    Returns:
	        A list of up to five 1-tuples, each containing a bigram rendered
	        as "first second".
	    """
	    # One tokenizer instance is enough for every tweet in the group.
	    tokenizer = nltk.tokenize.WhitespaceTokenizer()

	    # A None text cannot be tokenized; drop it rather than let one bad
	    # record raise for the entire group.
	    tokenized_tweets = [tokenizer.tokenize(t[0]) for t in tweets if t[0] is not None]

	    bgm = nltk.collocations.BigramAssocMeasures()
	    finder = nltk.collocations.BigramCollocationFinder.from_documents(tokenized_tweets)
	    top_5 = finder.nbest(bgm.likelihood_ratio, 5)

	    # Each result is a 1-tuple, as the declared bag-of-tuples schema expects.
	    return [("%s %s" % (s[0], s[1]),) for s in top_5]