Created
September 21, 2015 15:24
-
-
Save meconlin/fcd0129da4d6f7e8b6ba to your computer and use it in GitHub Desktop.
AWS Data Pipeline definition: unzip an archive from S3, filter the extracted CSV with awk, and write the result back to S3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "objects": [
    {
      "directoryPath": "#{myS3OutputLoc}/",
      "name": "S3OutputLocation",
      "id": "S3OutputLocation",
      "type": "S3DataNode"
    },
    {
      "period": "1 day",
      "name": "Every 1 day",
      "id": "DefaultSchedule",
      "type": "Schedule",
      "startAt": "FIRST_ACTIVATION_DATE_TIME"
    },
    {
      "directoryPath": "#{myS3InputLoc}",
      "name": "S3InputLocation",
      "id": "S3InputLocation",
      "type": "S3DataNode"
    },
    {
      "output": {
        "ref": "S3OutputLocation"
      },
      "input": {
        "ref": "S3InputLocation"
      },
      "stage": "true",
      "name": "ShellCommandActivityObj",
      "id": "ShellCommandActivityObj",
      "runsOn": {
        "ref": "EC2ResourceObj"
      },
      "type": "ShellCommandActivity",
      "command": "#{myShellCmd}"
    },
    {
      "failureAndRerunMode": "CASCADE",
      "schedule": {
        "ref": "DefaultSchedule"
      },
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://carlingo.datapipeline.logs/",
      "scheduleType": "cron",
      "name": "Default",
      "id": "Default"
    },
    {
      "instanceType": "t1.micro",
      "name": "EC2ResourceObj",
      "id": "EC2ResourceObj",
      "type": "Ec2Resource",
      "terminateAfter": "20 Minutes"
    }
  ],
  "parameters": [
    {
      "description": "S3 output folder",
      "id": "myS3OutputLoc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "default": "s3://us-east-1.elasticmapreduce.samples/pig-apache-logs/data",
      "description": "S3 input folder",
      "id": "myS3InputLoc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "default": "grep -rc \"GET\" ${INPUT1_STAGING_DIR}/* > ${OUTPUT1_STAGING_DIR}/output.txt",
      "description": "Shell command to run",
      "id": "myShellCmd",
      "type": "String"
    }
  ],
  "values": {
    "myShellCmd": "unzip -p ${INPUT1_STAGING_DIR}/DataOne_US_LDV_Data.zip VIN_REFERENCE.csv | awk -F '\",\"' '$4 >= 2008 { print }' > ${OUTPUT1_STAGING_DIR}/vin_reference.csv",
    "myS3InputLoc": "s3://carlingo.datapipeline.data/staging/",
    "myS3OutputLoc": "s3://carlingo.datapipeline.data/staging/extracted"
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment