Last active
February 20, 2024 07:26
-
-
Save nmukerje/e65cde41be85470e4b8dfd9a2d6aed50 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.types import * | |
from pyspark.sql.functions import * | |
#Flatten array of structs and structs | |
def flatten(df): | |
# compute Complex Fields (Lists and Structs) in Schema | |
complex_fields = dict([(field.name, field.dataType) | |
for field in df.schema.fields | |
if type(field.dataType) == ArrayType or type(field.dataType) == StructType]) | |
while len(complex_fields)!=0: | |
col_name=list(complex_fields.keys())[0] | |
print ("Processing :"+col_name+" Type : "+str(type(complex_fields[col_name]))) | |
# if StructType then convert all sub element to columns. | |
# i.e. flatten structs | |
if (type(complex_fields[col_name]) == StructType): | |
expanded = [col(col_name+'.'+k).alias(col_name+'_'+k) for k in [ n.name for n in complex_fields[col_name]]] | |
df=df.select("*", *expanded).drop(col_name) | |
# if ArrayType then add the Array Elements as Rows using the explode function | |
# i.e. explode Arrays | |
elif (type(complex_fields[col_name]) == ArrayType): | |
df=df.withColumn(col_name,explode_outer(col_name)) | |
# recompute remaining Complex Fields in Schema | |
complex_fields = dict([(field.name, field.dataType) | |
for field in df.schema.fields | |
if type(field.dataType) == ArrayType or type(field.dataType) == StructType]) | |
return df | |
df=flatten(df) | |
df.printSchema() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This function flatten(), fails when there is nested array inside array, It failed to flatten these, "user_mentions": [
{
"screen_name": "AshJone15461246",
"name": "Ash Jones",
"id": 1589564629369462800,
"id_str": "1589564629369462784",
"indices": [
0,
16
]
},
{
"screen_name": "BariAWilliams",
"name": "Bärí A. Williams, Esq.",
"id": 4639656854,
"id_str": "4639656854",
"indices": [
17,
31
]
},
{
"screen_name": "bjorn_hefnoll",
"name": "Björn",
"id": 1374096417954881500,
"id_str": "1374096417954881548",
"indices": [
32,
46
]
},
{
"screen_name": "SpencerAlthouse",
"name": "Spencer Althouse",
"id": 38307346,
"id_str": "38307346",
"indices": [
47,
63
]
}
].