Skip to content

Instantly share code, notes, and snippets.

@cagdas1
Last active December 11, 2020 08:48
Show Gist options
  • Save cagdas1/ddbef58c2d3a3fa92be4ddd15721fcd0 to your computer and use it in GitHub Desktop.
Save cagdas1/ddbef58c2d3a3fa92be4ddd15721fcd0 to your computer and use it in GitHub Desktop.
Mongo ETL w/Glue
import * as cdk from "@aws-cdk/core";
import * as glue from "@aws-cdk/aws-glue";
import * as s3 from "@aws-cdk/aws-s3";
import * as s3Deployment from "@aws-cdk/aws-s3-deployment";
import * as iam from "@aws-cdk/aws-iam";
import { replaceValues } from "./lib";
import { config } from "dotenv";
// Populate process.env from a local .env file; this must run before the
// destructuring of process.env at the bottom of this section.
config();

// --- Glue job settings ------------------------------------------------------
const GLUE_VERSION = "1.0";
const PYTHON_VERSION = "3";
// This value must be glueetl for Apache Spark
const COMMAND_NAME = "glueetl";
// Object key of the MongoDB JDBC driver jar inside the ETL bucket.
const JDBC_PATH = "dependencies/cdata.jdbc.mongodb.jar";

// --- Deployment-time configuration -------------------------------------------
// Every value is `string | undefined`: nothing here verifies that a variable
// is actually set.
const {
  RTK,
  MONGO_SERVER,
  MONGO_USER,
  MONGO_PASSWORD,
  MONGO_PORT,
  MONGO_SSL,
  MONGO_DATABASE,
  COLLECTIONS,
  BUCKET_NAME
} = process.env;
/**
 * CDK stack that provisions a scheduled AWS Glue ETL job.
 *
 * Resources created:
 *  - an S3 bucket holding the job's dependencies and script,
 *  - two bucket deployments uploading `../dependencies` and `scripts`,
 *  - an IAM role assumable by Glue,
 *  - the Glue job itself and a cron trigger that runs it.
 *
 * Configuration comes from the module-level environment variables. They are
 * passed through unchecked, so a missing variable reaches `replaceValues`
 * as `undefined`.
 */
class MongoGlueETLStack extends cdk.Stack {
  /**
   * @param scope Parent construct (normally the CDK app).
   * @param id Logical id of this stack.
   * @param props Optional stack properties such as the target environment.
   */
  constructor(scope: cdk.Construct, id: string, props?: cdk.StackProps) {
    super(scope, id, props);

    // Bucket for the JDBC driver and the ETL script; the job output path
    // passed to replaceValues below also points into it.
    const s3Bucket = new s3.Bucket(this, "etl-bucket", {
      bucketName: BUCKET_NAME,
      removalPolicy: cdk.RemovalPolicy.DESTROY
    });

    // Uploads the local dependencies folder (expected to contain the jar
    // referenced by JDBC_PATH) under the "dependencies/" prefix.
    const dependenciesDeployment = new s3Deployment.BucketDeployment(this, "dependencies-deployment", {
      sources: [s3Deployment.Source.asset("../dependencies")],
      destinationBucket: s3Bucket,
      destinationKeyPrefix: "dependencies"
    });

    // Replace hardcoded values in script.
    // NOTE(review): this presumably rewrites scripts/script.py on disk, so it
    // has to stay ahead of the "scripts" asset created below - confirm in ./lib.
    replaceValues(
      "scripts/script.py",
      RTK as string,
      MONGO_SERVER as string,
      MONGO_USER as string,
      MONGO_PASSWORD as string,
      MONGO_PORT as string,
      // The flag is handed over as the text "True"/"False".
      MONGO_SSL === "true" ? "True" : "False",
      MONGO_DATABASE as string,
      `s3://${BUCKET_NAME}/${MONGO_DATABASE}/`,
      COLLECTIONS as string
    );

    // Uploads the scripts folder under the "scripts/" prefix.
    const scriptsDeployment = new s3Deployment.BucketDeployment(this, "scripts-deployment", {
      sources: [s3Deployment.Source.asset("scripts")],
      destinationBucket: s3Bucket,
      destinationKeyPrefix: "scripts"
    });

    // Role the Glue service assumes while running the job.
    const glueRole = new iam.Role(this, "glue-role", {
      roleName: "glue-etl-role",
      assumedBy: new iam.ServicePrincipal("glue.amazonaws.com"),
      managedPolicies: [
        iam.ManagedPolicy.fromAwsManagedPolicyName("AmazonS3FullAccess")
      ],
    });

    const glueJob = new glue.CfnJob(this, "glue-job", {
      name: "glue-job",
      role: glueRole.roleArn,
      command: {
        name: COMMAND_NAME,
        pythonVersion: PYTHON_VERSION,
        scriptLocation: `s3://${s3Bucket.bucketName}/scripts/script.py`
      },
      glueVersion: GLUE_VERSION,
      defaultArguments: {
        "--extra-jars": `s3://${s3Bucket.bucketName}/${JDBC_PATH}`
      }
    });
    // The job reads its script and JDBC jar from S3. Without an explicit
    // dependency CloudFormation is free to create the job (and the trigger
    // that starts it) before either upload has finished.
    glueJob.node.addDependency(scriptsDeployment, dependenciesDeployment);

    // Scheduled trigger: fires at minute 5 of every hour and is activated as
    // soon as it is created.
    const glueTrigger = new glue.CfnTrigger(this, "glue-trigger", {
      name: "etl-trigger",
      schedule: "cron(5 * * * ? *)",
      type: "SCHEDULED",
      actions: [
        {
          jobName: glueJob.name
        }
      ],
      startOnCreation: true
    });
    // jobName above is a plain string, so the ordering must be declared
    // explicitly; this also makes the trigger wait for the uploads via the job.
    glueTrigger.addDependsOn(glueJob);
  }
}
// Entry point: build the CDK app and register the single ETL stack in it.
// The target account and region are read from the environment; either may be
// undefined if the variable is not set.
const stackEnv = {
  account: process.env.AWS_ACCOUNT_ID,
  region: process.env.AWS_REGION
};

const app = new cdk.App();
new MongoGlueETLStack(app, "MongoGlueETLStack", { env: stackEnv });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment