Last active
August 29, 2015 13:58
-
-
Save MicTech/10017509 to your computer and use it in GitHub Desktop.
Hadoop Pig playground
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#resources | |
https://pig.apache.org/ | |
http://hortonworks.com/hadoop-tutorial/how-to-use-basic-pig-commands/ | |
http://engineering.linkedin.com/open-source/introducing-datafu-open-source-collection-useful-apache-pig-udfs | |
http://pig.apache.org/docs/r0.7.0/piglatin_ref2.html | |
#terminal | |
# Upload the local CSV to the root of HDFS so the Pig script can load it.
# `hadoop fs` is used instead of `hadoop dfs`, which newer Hadoop releases
# report as deprecated.
hadoop fs -copyFromLocal /tmp/NYSE_daily_prices_A.csv /NYSE_daily_prices_A.csv
###The following operations may not make sense for stock data; I am using them only to play with Pig.
##pig | |
-- Put the DataFu jar on Pig's classpath so its UDFs can be referenced.
REGISTER /tmp/datafu-1.2.1-SNAPSHOT.jar
-- Bind the short name `percentile` to DataFu's Quantile UDF, constructed
-- with the arguments '0' and '0.4'.
DEFINE percentile datafu.pig.stats.Quantile('0', '0.4');
#load data | |
-- First pass: load the comma-separated file with no declared schema and
-- print what Pig reports for the relation.
STOCK = LOAD 'hdfs://localhost:54310/NYSE_daily_prices_A.csv'
    USING PigStorage(',');
DESCRIBE STOCK;

-- Second pass: reload the same file, this time naming and typing every
-- column, then print the resulting schema. This rebinds the STOCK alias.
STOCK = LOAD 'hdfs://localhost:54310/NYSE_daily_prices_A.csv'
    USING PigStorage(',')
    AS (
        exchange:chararray,
        symbol:chararray,
        date:chararray,
        open:float,
        high:float,
        low:float,
        close:float,
        volume:int,
        adj_close:float
    );
DESCRIBE STOCK;
#filter STOCK data | |
-- Keep only the rows whose `open` field is non-null.
-- NOTE(review): presumably this drops the CSV header row, whose `open`
-- text cannot be read as a float -- confirm against the data file.
STOCK =
    FILTER STOCK
    BY open IS NOT NULL;
#add column diff = open - close | |
-- Copy every STOCK column through unchanged and append `diff`,
-- computed per row as open minus close.
STOCK_WITH_DIFF =
    FOREACH STOCK
    GENERATE
        exchange,
        symbol,
        date,
        open,
        high,
        low,
        close,
        volume,
        adj_close,
        (open - close) AS (diff:float);
#group by symbol | |
-- Collect the STOCK rows into one bag per distinct `symbol`.
-- Each output row is (group, STOCK), where the bag is named after the
-- grouped relation.
STOCK_GROUP_BY_SYMBOL =
    GROUP STOCK
    BY symbol;
#average open by symbol
-- Average of `open` within each symbol's bag.
-- STOCK_GROUP_BY_SYMBOL is built with `GROUP STOCK BY symbol`, so the bag
-- inside each group is named STOCK. The bag must be referenced by that
-- name; STOCK_WITH_DIFF is not a field of the grouped relation.
STOCK_AVG_OPEN = FOREACH STOCK_GROUP_BY_SYMBOL GENERATE group AS symbol, AVG(STOCK.open);
#sum open by symbol | |
-- Total of the `open` values in each symbol's bag.
STOCK_SUM_OPEN =
    FOREACH STOCK_GROUP_BY_SYMBOL
    GENERATE
        group AS symbol,
        SUM(STOCK.open);
#percentile for open | |
-- Quantiles of `open` for each symbol, using the `percentile` UDF alias.
-- The FOREACH must run over the grouped relation: only there do the
-- `group` key and the per-symbol STOCK bag exist.
-- DataFu's Quantile requires its input bag to be sorted, so the `open`
-- values are projected and ordered inside the nested block first.
STOCK_PERCENTILE_OPEN = FOREACH STOCK_GROUP_BY_SYMBOL {
    opens = STOCK.open;
    sorted_opens = ORDER opens BY open;
    GENERATE group AS id, percentile(sorted_opens) AS percentile01;
};
#useful commands | |
-- Print a relation's rows to the console. The alias must be one that
-- the script defines; STOCK_AVG_OPEN is used here as the example.
DUMP STOCK_AVG_OPEN;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment