Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save rizplate/86bbff00242ac398b9f23f8668b29f60 to your computer and use it in GitHub Desktop.
Save rizplate/86bbff00242ac398b9f23f8668b29f60 to your computer and use it in GitHub Desktop.
Create an EMR Cluster with a Wordcount Job as a Step, using Boto3
import boto3
# EMR API client bound to the eu-west-1 region.
client = boto3.client('emr', region_name='eu-west-1')

# Command line executed by the step: the stock Hadoop wordcount example,
# reading file:///etc/services and writing its results to /output.
cmd = "hadoop jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar wordcount file:///etc/services /output"


def _on_demand_group(name, role, count):
    """Build one on-demand m1.medium instance-group entry for the request."""
    return {
        'Name': name,
        'Market': 'ON_DEMAND',
        'InstanceRole': role,
        'InstanceType': 'm1.medium',
        'InstanceCount': count,
    }


# One master node plus two core nodes.
_instance_groups = [
    _on_demand_group("Master nodes", 'MASTER', 1),
    _on_demand_group("Slave nodes", 'CORE', 2),
]

# Single step: command-runner.jar receives the command split on whitespace
# as its argument list.
_wordcount_step = {
    'Name': 'Wordcount Job',
    'HadoopJarStep': {
        'Jar': 'command-runner.jar',
        'Args': cmd.split(),
    },
}

_cluster_tags = [
    {'Key': 'Name', 'Value': 'EMR with Boto'},
    {'Key': 'TerminationVal', 'Value': 'OK'},
]

# Submit the cluster request. The <bucket>, <keyname> and subnet-<id>
# values are placeholders that must be filled in before running.
emrcluster = client.run_job_flow(
    Name='EMR Cluster with Boto',
    LogUri='s3://<bucket>/logs/',
    ReleaseLabel='emr-5.3.0',
    Instances={
        'InstanceGroups': _instance_groups,
        'Ec2KeyName': '<keyname>',
        'KeepJobFlowAliveWhenNoSteps': True,
        'TerminationProtected': False,
        'Ec2SubnetId': 'subnet-<id>',
    },
    Steps=[_wordcount_step],
    VisibleToAllUsers=True,
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
    Tags=_cluster_tags,
)

# Report the new cluster's id together with the date header and request id
# taken from the response metadata.
_cluster_id = emrcluster['JobFlowId']
_response_meta = emrcluster['ResponseMetadata']
_summary = 'ClusterID: {} , DateCreated: {} , RequestId: {}'.format(
    _cluster_id,
    _response_meta['HTTPHeaders']['date'],
    _response_meta['RequestId'],
)
print(_summary)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment