Skip to content

Instantly share code, notes, and snippets.

@michael-erasmus
Created October 30, 2014 14:10
Show Gist options
  • Save michael-erasmus/935c652b281930c07805 to your computer and use it in GitHub Desktop.
Save michael-erasmus/935c652b281930c07805 to your computer and use it in GitHub Desktop.
transform_actions_taken.pig
REGISTER '../udfs/jython/actions_taken.py' USING jython AS actions_taken;
REGISTER '../udfs/python/actions_taken.py' USING streaming_python AS actions_taken1;
-- Load the output of the extract-actions-taken step from under the
-- OUTPUT_PATH parameter. PigStorage() is called with no arguments, so its
-- default field delimiter (tab) applies.
-- All scalar columns are read as chararray. The value column is a bag whose
-- tuple schema is left unspecified.
raw = LOAD '$OUTPUT_PATH/extract-actions-taken'
    USING PigStorage()
    AS (
        user_id:chararray,
        visitor_id:chararray,
        client_id:chararray,
        last_modified:chararray,
        user_joined_at:chararray,
        date:chararray,
        value:bag{t:tuple()},
        extra_data:chararray
    );
-- Run the date-like columns through the streaming_python UDFs and pull the
-- scopes out of the value bag with the jython UDF. The UDF bodies live in
-- the registered actions_taken.py files and are not shown in this script.
--
-- Every UDF result is given an explicit alias. The following statement
-- projects date and user_joined_at by name, and a UDF result without an AS
-- clause is only named when the UDF output schema happens to supply that
-- exact name. Aliasing here removes that hidden dependency.
with_scopes = FOREACH raw GENERATE
    user_id,
    visitor_id,
    client_id,
    actions_taken1.transform_date(last_modified) AS last_modified,
    actions_taken1.transform_date(date) AS date,
    actions_taken1.transform_joined_at(user_joined_at) AS user_joined_at,
    actions_taken.pull_out_scopes(value) AS scopes,
    extra_data;
-- Build the final row layout: the identifying and date columns pass through
-- unchanged, and the first ten positional fields of scopes become the
-- columns scope1 to scope10.
-- NOTE(review): FLATTEN un-nests tuples and bags. The shape of scopes is set
-- by pull_out_scopes, which is not visible here. Confirm each positional
-- field flattens to exactly one column and does not multiply rows.
transformed = FOREACH with_scopes GENERATE
    user_id,
    visitor_id,
    client_id,
    last_modified,
    user_joined_at,
    date,
    FLATTEN(scopes.$0) AS scope1,
    FLATTEN(scopes.$1) AS scope2,
    FLATTEN(scopes.$2) AS scope3,
    FLATTEN(scopes.$3) AS scope4,
    FLATTEN(scopes.$4) AS scope5,
    FLATTEN(scopes.$5) AS scope6,
    FLATTEN(scopes.$6) AS scope7,
    FLATTEN(scopes.$7) AS scope8,
    FLATTEN(scopes.$8) AS scope9,
    FLATTEN(scopes.$9) AS scope10,
    extra_data;
-- Write the result gzip-compressed.
SET output.compression.enabled true;
SET output.compression.codec org.apache.hadoop.io.compress.GzipCodec;

-- Remove any previous output first. rmf does not fail when the path is
-- missing, and STORE will not write into an existing directory.
rmf $OUTPUT_PATH/transform-actions-taken;

STORE transformed
    INTO '$OUTPUT_PATH/transform-actions-taken'
    USING PigStorage();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment