Skip to content

Instantly share code, notes, and snippets.

@alanwill
Last active July 17, 2022 07:04
Show Gist options
  • Save alanwill/9f4d512817a8913962ac1df85cb6442a to your computer and use it in GitHub Desktop.
Save alanwill/9f4d512817a8913962ac1df85cb6442a to your computer and use it in GitHub Desktop.
AWS Glue JSON to Parquet transformation script
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME, s3_path, database_name, table_name]
# Resolve all job arguments with a single pass over sys.argv instead of
# re-parsing the command line once per parameter.
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 's3_path', 'database_name', 'table_name'],
)
# Construct referenceable paths
bucketpath = args['s3_path']            # destination path handed to the S3 sink
databasename = args['database_name']    # catalog database holding the source table
tablename = args['table_name']          # catalog table to read
# Reuse the active SparkContext if this process already has one; calling
# SparkContext() directly fails when a context is already running.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Bind the Glue job to this context; job.commit() later closes the run.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "<database-name>", table_name = "<table-name>", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
# Read the source table (database/table supplied as job parameters) from the
# Glue Data Catalog.
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=databasename,
    table_name=tablename,
    transformation_ctx="datasource0",
)
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @return: dropnullfields3
## @inputs: [frame = datasource0]
# Apply the DropNullFields transform directly to the catalog frame.
dropnullfields3 = DropNullFields.apply(
    frame=datasource0,
    transformation_ctx="dropnullfields3",
)
## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://<s3-bucket-name>"}, format = "parquet", transformation_ctx = "datasink4"]
## @return: datasink4
## @inputs: [frame = dropnullfields3]
# Write the result as Parquet under the s3_path job parameter, partitioned by
# the year/month/day keys.
# NOTE(review): assumes the source table exposes "year", "month" and "day"
# columns -- confirm against the catalog schema.
sink_options = {"path": bucketpath, "partitionKeys": ["year", "month", "day"]}
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=dropnullfields3,
    connection_type="s3",
    connection_options=sink_options,
    format="parquet",
    transformation_ctx="datasink4",
)
# Commit the job run now that the sink has been written.
job.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment