Last active
August 29, 2015 14:14
-
-
Save larsxschneider/922b8aaf07bc9b71ff39 to your computer and use it in GitHub Desktop.
Run Impala query on EMR cluster and store results as CSV file on S3
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import boto.emr | |
from boto.s3.key import Key | |
from boto.s3.connection import S3Connection | |
aws_key = 'your-key' | |
aws_secret = 'your-secret' | |
bucket_name = 'your-bucket-name' | |
result_file_name = 'your-result-file-path' | |
s3_connection = S3Connection(aws_key, aws_secret, validate_certs=False) | |
bucket = s3_connection.get_bucket(bucket_name) | |
query = Key(bucket) | |
query.key = '{}.sql'.format(result_file_name) | |
query.set_contents_from_string(''' | |
SELECT * FROM my_table; | |
''') | |
query_runner = Key(bucket) | |
query_runner.key = '{}.runner'.format(result_file_name) | |
query_runner.set_contents_from_string(''' | |
REMOTE_QUERY_FILE=$1 | |
LOCAL_QUERY_FILE=./impala.query | |
LOCAL_OUTPUT_FILE=./output.csv | |
S3_OUTPUT_FILE=s3://{}/{}.csv | |
hadoop fs -copyToLocal $REMOTE_QUERY_FILE $LOCAL_QUERY_FILE | |
impala-shell --query_file $LOCAL_QUERY_FILE \ | |
--output_file $LOCAL_OUTPUT_FILE \ | |
--print_header \ | |
--delimited \ | |
--output_delimiter=',' | |
hadoop fs -rm $S3_OUTPUT_FILE | |
hadoop fs -mkdir -p $(dirname $S3_OUTPUT_FILE) | |
hadoop fs -copyFromLocal $LOCAL_OUTPUT_FILE $S3_OUTPUT_FILE | |
'''.format(bucket.name, result_file_name)) | |
emr_steps = [ | |
boto.emr.step.ScriptRunnerStep( | |
name='Run query {}'.format(result_file_name), | |
step_args=[ | |
's3://{}/{}'.format(bucket.name, query_runner.key), | |
's3://{}/{}'.format(bucket.name, query.key) | |
] | |
) | |
] | |
emr_connection = boto.emr.connection.EmrConnection( | |
aws_key, | |
aws_secret, | |
region=emr_region('us-east-1'), | |
) | |
emr_connection.run_jobflow( | |
name='Cluster name', | |
log_uri='s3://{}/logs'.format(bucket.name), | |
ec2_keyname='my_key', | |
master_instance_type='m3.xlarge', | |
slave_instance_type='m3.2xlarge', | |
num_instances=2, | |
action_on_failure='CANCEL_AND_WAIT', | |
keep_alive=True, | |
ami_version='3.3.1', | |
bootstrap_actions=[ | |
boto.emr.BootstrapAction( | |
'Install Impala', | |
's3://us-east-1.elasticmapreduce/libs/impala/setup-impala', | |
[ | |
'--install-impala', | |
'--base-path', 's3://us-east-1.elasticmapreduce', | |
'--impala-version', 'latest', | |
'IMPALA_BACKEND_PORT=22001', | |
'IMPALA_MEM_LIMIT=90%', | |
] | |
)], | |
steps=emr_steps, | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment