Skip to content

Instantly share code, notes, and snippets.

@JacobJohansen
Created July 1, 2020 20:33
Show Gist options
  • Save JacobJohansen/daf5d75e900efd47e26d1a492dc089df to your computer and use it in GitHub Desktop.
Save JacobJohansen/daf5d75e900efd47e26d1a492dc089df to your computer and use it in GitHub Desktop.
Delta Lake: create a Presto/Athena-queryable external table whose column schema is generated from the Delta table's DataFrame
%spark.pyspark
def fieldPair(field):
    """Render one schema field as a ``<name> <type>`` column definition for DDL.

    Args:
        field: A schema field exposing ``name`` (str) and ``dataType``
            (an object with ``simpleString()``), e.g. a PySpark ``StructField``.

    Returns:
        A string such as ``"`price` decimal(10,2)"`` suitable for the column
        list of a ``CREATE TABLE`` statement.
    """
    # Backtick-quote the name (doubling embedded backticks) so reserved words
    # or names with special characters do not break the generated statement.
    quotedName = "`{}`".format(field.name.replace("`", "``"))
    # simpleString() gives the full DDL form of the type ("decimal(10,2)",
    # "array<string>", "bigint"); typeName() only gives the bare type name
    # ("decimal", "array", "long"), which loses precision/element types.
    return "{} {}".format(quotedName, field.dataType.simpleString())
# Write a DataFrame as a Delta table, generate the symlink manifest for it,
# and capture the table's schema for the DDL built further down.

# Register the Delta Lake core jar as a Python dependency. Presumably this is
# what makes `delta.tables` importable on the next line, so keep this call
# before the import.
sc.addPyFile("/home/hadoop/extrajars/delta-core_2.12-0.6.1.jar")
from delta.tables import *
# Root location of the Delta table; the external table's LOCATION further down
# is the `_symlink_format_manifest/` directory under this path.
delta_dim_path = "s3://delta_dim_path"
# `df_dim` is not defined in this snippet; assumed to be a DataFrame created
# earlier in the notebook. No save mode is set here.
# NOTE(review): Spark's default save mode errors if the path already exists;
# confirm that is intended when this paragraph is re-run.
df_dim.write.format("delta").save(delta_dim_path)
# Re-open the data just written as a DeltaTable handle.
tableDeltaDim = DeltaTable.forPath(path=delta_dim_path, sparkSession=spark)
# Generate the symlink-format manifest for the table.
# NOTE(review): per the Delta docs the manifest reflects the table at
# generation time; confirm it is regenerated after later writes.
tableDeltaDim.generate("symlink_format_manifest")
# Schema of the Delta table; its fields drive the generated column list.
deltaDimSchema = tableDeltaDim.toDF().schema
# Zeppelin debugging aids: uncomment to inspect the schema and its first field.
# z.show(deltaDimSchema.fields)
# z.show(deltaDimSchema.fields[0])
# z.show(deltaDimSchema.fields[0].name)
# z.show(deltaDimSchema.fields[0].dataType)
# z.show(deltaDimSchema.fields[0].dataType.typeName())
# Build the column list for the external table from the Delta table's schema:
# one "<name> <type>" fragment per field.
dimColumns = [fieldPair(field) for field in deltaDimSchema.fields]
# Zeppelin debugging aid: uncomment to inspect the fragments.
# z.show(dimColumns)
stringDimColumns = ' ,'.join(dimColumns)
# Zeppelin debugging aid: uncomment to inspect the joined column list.
# z.show(stringDimColumns)
# Derive the manifest directory from delta_dim_path instead of repeating the
# S3 path as a literal, so the table location cannot drift from the path the
# Delta table was actually written to.
manifestLocation = "{}/_symlink_format_manifest/".format(delta_dim_path.rstrip("/"))
# Register an external table that reads the Parquet files listed in the
# symlink manifest.
# NOTE(review): if df_dim is partitioned, the Delta docs also require a
# PARTITIONED BY clause and an MSCK REPAIR TABLE afterwards; confirm.
spark.sql("""
CREATE EXTERNAL TABLE IF NOT EXISTS datalake.presto_dim({})
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '{}'
""".format(stringDimColumns, manifestLocation))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment