-
-
Save rizplate/4e2432c7e10ac5fb438bf5ea88a69c18 to your computer and use it in GitHub Desktop.
Example Python code that submits a Spark job as an EMR step to an AWS EMR cluster from within an AWS Lambda function
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys
import time

import boto3


def lambda_handler(event, context):
    """Submit a Spark job as an EMR step to an already-running EMR cluster.

    Picks the first cluster in the RUNNING or WAITING state, then adds a
    script-runner step that invokes spark-submit on the cluster's master node.

    Parameters
    ----------
    event, context :
        Standard AWS Lambda handler arguments (unused by this handler).

    Returns
    -------
    str
        A message containing the AddJobFlowSteps API response.
    """
    conn = boto3.client("emr")

    # Choose the first cluster which is RUNNING or WAITING.
    # Possibly can also choose by name, or already have the cluster id.
    response = conn.list_clusters()
    cluster_ids = [c["Id"] for c in response["Clusters"]
                   if c["Status"]["State"] in ["RUNNING", "WAITING"]]
    if not cluster_ids:
        sys.stderr.write("No valid clusters\n")
        # BUG FIX: the original called sys.stderr.exit(), which raises
        # AttributeError (file objects have no exit()); use sys.exit().
        sys.exit(1)

    # Take the first relevant cluster.
    cluster_id = cluster_ids[0]

    # Code location on your EMR master node.
    CODE_DIR = "/home/hadoop/code/"

    # spark-submit invocation; replace the placeholder configuration,
    # script name, and parameters with real values.
    step_args = ["/usr/bin/spark-submit", "--spark-conf", "your-configuration",
                 CODE_DIR + "your_file.py", '--your-parameters', 'parameters']

    # Timestamped step name so repeated submissions are distinguishable.
    step = {"Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                # script-runner lets an EMR step execute an arbitrary command.
                'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': step_args
            }}

    action = conn.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
    return "Added step: %s" % (action)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment