A PySpark script that takes a Parquet file path and generates the external-table DDL for Splice Machine.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Map Spark SQL type names to Splice Machine column types.
data_type_map = {'StringType': 'varchar(32672)',
                 'LongType': 'bigint',
                 'IntegerType': 'int',
                 'BooleanType': 'boolean',
                 'DateType': 'date',
                 'DoubleType': 'double',
                 'TimestampType': 'timestamp'}
# Read the schema from the Parquet file's metadata.
parquet_file_path = 'lineitem.parquet'
schema = spark.read.parquet(parquet_file_path).schema
# Build one column definition per field in the schema.
col_defs_list = []
for field_name in schema.fieldNames():
    col = schema[field_name]
    data_type = col.dataType
    nullable = col.nullable
    name = col.name
    if str(data_type) in data_type_map:
        splicemachine_datatype = data_type_map[str(data_type)]
    else:
        # Fall back to the Spark type name with the 'Type' suffix stripped,
        # e.g. DecimalType(10,2) becomes Decimal(10,2).
        splicemachine_datatype = str(data_type).replace('Type', '')
    if nullable:
        col_def = name + " " + splicemachine_datatype
    else:
        col_def = name + " " + splicemachine_datatype + " not null"
    col_defs_list.append(col_def)
# Join the column definitions and print the final DDL statement.
col_defs_str = ',\n'.join(col_defs_list)
print('create external table test (')
print(col_defs_str)
print(')')
print('stored as parquet')
print('location ' + "'" + parquet_file_path + "'")
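
For illustration, given a hypothetical lineitem.parquet whose schema contains a non-nullable LongType column l_orderkey and a nullable StringType column l_comment (these column names are assumptions for the example, not read from any real file), the script would print DDL along these lines:

create external table test (
l_orderkey bigint not null,
l_comment varchar(32672)
)
stored as parquet
location 'lineitem.parquet'

Note that the table name is hardcoded as test; to generate DDL for a differently named table, edit the first print call or pass the name in from the command line (for example via sys.argv).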