Last active
August 29, 2015 13:57
-
-
Save yuta-imai/9519760 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Launch an EMR cluster, copy the CloudFront logs matching TARGET from S3 into
# HDFS with S3DistCp, then run a Hive script that builds a table on top of them.
#
# Usage: <this-script> TARGET KEYPAIR
#   TARGET   date (or date-hour) fragment selecting the logs to load,
#            for example '2014-03-10' or '2014-03-10-10'
#   KEYPAIR  EC2 key pair name used for ssh access to the cluster
#
# LOG_URI, CFLOG and HIVESCRIPT below are placeholders and must be filled in
# before the script is run.
set -eu

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[ "$#" -ge 2 ] || die "Usage: ${0##*/} TARGET KEYPAIR"

TARGET=$1   # for example '2014-03-10' or '2014-03-10-10'
KEYPAIR=$2  # ec2 keypair name for ssh

MASTER_INSTANCE_TYPE='m1.large'
SLAVE_INSTANCE_TYPE='m1.large'
NUM_INSTANCES='3'
LOG_URI=''     # s3://bucketname/path/to/emrlogdir/ (may be a different bucket than the CloudFront logs)
S3ENDPOINT='s3-ap-northeast-1.amazonaws.com'  # ap-northeast-1 is the endpoint for the Tokyo region
CFLOG=''       # s3://bucketname/path/to/cflogdir/
HIVESCRIPT=''  # s3://bucketname/path/to/table.hql (may also be a different bucket than the CloudFront logs)

# Fail fast before a (billable) cluster is started: an empty TARGET would turn
# the srcPattern into '.*.*' and copy every log, and an empty setting would
# make the following option be consumed as its argument.
[ -n "$TARGET" ] || die "TARGET must not be empty"
[ -n "$KEYPAIR" ] || die "KEYPAIR must not be empty"
[ -n "$LOG_URI" ] || die "LOG_URI is not set; edit it at the top of this script"
[ -n "$CFLOG" ] || die "CFLOG is not set; edit it at the top of this script"
[ -n "$HIVESCRIPT" ] || die "HIVESCRIPT is not set; edit it at the top of this script"

# Launch the EMR cluster.
# The job flow id is taken from the 4th whitespace-separated field of the
# CLI output (presumably "Created job flow j-XXXX" -- confirm against your
# elastic-mapreduce version).
FLOW=$(
  elastic-mapreduce --create --alive --hive-interactive \
    --key-pair "$KEYPAIR" \
    --name "emr-cluster-${TARGET}" \
    --master-instance-type "$MASTER_INSTANCE_TYPE" \
    --slave-instance-type "$SLAVE_INSTANCE_TYPE" \
    --num-instances "$NUM_INSTANCES" \
    --log-uri "$LOG_URI" \
    | awk '{print $4}'
)

# Without a job flow id the steps below would target nothing; stop here.
[ -n "$FLOW" ] || die "could not obtain a job flow id from elastic-mapreduce --create"

# Add the S3DistCp step: copy logs whose key matches TARGET into HDFS,
# grouping files by the date-hour fragment captured by --groupBy.
elastic-mapreduce --jobflow "$FLOW" --jar \
  /home/hadoop/lib/emr-s3distcp-1.0.jar \
  --arg --s3Endpoint --arg "$S3ENDPOINT" \
  --arg --src --arg "$CFLOG" \
  --arg --dest --arg 'hdfs:///data' \
  --arg --groupBy --arg '.*([0-9]{4}-[0-9]{2}-[0-9]{2}-[0-9]{2}).*' \
  --arg --targetSize --arg '128' \
  --arg --srcPattern --arg ".*${TARGET}.*" \
  --arg --outputCodec --arg 'lzo' \
  --step-name s3distcp

# Load the copied data into a Hive table.
elastic-mapreduce --jobflow "$FLOW" \
  --hive-script "$HIVESCRIPT" \
  --step-name table_construction
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment