joshy · July 9, 2014 13:01
diff --git a/alice-word-count b/alice-word-count
 -- Load the book as plain text; TextLoader yields one record per input
 -- line, exposed here as a single chararray field named `line`
 book_lines = LOAD 'alice-im-wunderland.txt' USING TextLoader() AS (line:chararray);

 -- TOKENIZE splits each line into a bag of tokens; FLATTEN un-nests that
 -- bag so every token lands on a row of its own
 tokens = FOREACH book_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;

 -- Keep only tokens made up entirely of word characters (\w+ must match the
 -- whole token). This removes blank tokens, and also any token that still
 -- has punctuation attached to it.
 clean_tokens = FILTER tokens BY word MATCHES '\\w+';

 -- Drop short words: only tokens longer than three characters survive
 long_tokens = FILTER clean_tokens BY SIZE(word) > 3;


 -- Gather identical words together, one group per distinct word
 by_word = GROUP long_tokens BY word;

 -- For every group emit (number of occurrences, the word itself)
 tallies = FOREACH by_word GENERATE COUNT(long_tokens) AS count, group AS word;

 -- Order the tallies so the most frequent words come first
 ranked = ORDER tallies BY count DESC;

 -- Debugging aid, disabled: print the tokens that passed the \w+ filter
 --DUMP clean_tokens;
 DUMP ranked;
	-- Load the book as plain text; TextLoader yields one record per input
	-- line, exposed here as a single chararray field named `line`
	book_lines = LOAD 'alice-im-wunderland.txt' USING TextLoader() AS (line:chararray);

	-- TOKENIZE splits each line into a bag of tokens; FLATTEN un-nests that
	-- bag so every token lands on a row of its own
	tokens = FOREACH book_lines GENERATE FLATTEN(TOKENIZE(line)) AS word;

	-- Keep only tokens made up entirely of word characters (\w+ must match the
	-- whole token). This removes blank tokens, and also any token that still
	-- has punctuation attached to it.
	clean_tokens = FILTER tokens BY word MATCHES '\\w+';

	-- Drop short words: only tokens longer than three characters survive
	long_tokens = FILTER clean_tokens BY SIZE(word) > 3;


	-- Gather identical words together, one group per distinct word
	by_word = GROUP long_tokens BY word;

	-- For every group emit (number of occurrences, the word itself)
	tallies = FOREACH by_word GENERATE COUNT(long_tokens) AS count, group AS word;

	-- Order the tallies so the most frequent words come first
	ranked = ORDER tallies BY count DESC;

	-- Debugging aid, disabled: print the tokens that passed the \w+ filter
	--DUMP clean_tokens;
	DUMP ranked;