Last active
May 22, 2023 19:42
-
-
Save fahadsiddiqui/d5cff15698f9dc57e2dd7d7052c6cc43 to your computer and use it in GitHub Desktop.
Flatten a nested JSON Spark DataFrame using Scala, Spark 2.2.+ — a custom solution.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Recursively flattens every nested column of a DataFrame.
 *
 * On each pass, the first column (in schema order) whose type is an
 * `ArrayType` or `StructType` is rewritten: arrays are exploded with
 * `explode_outer` (rows with a null/empty array are kept, yielding a null
 * element) and structs are replaced by their child columns renamed
 * `parent_child`. The function then recurses until the schema contains no
 * complex types.
 *
 * @param df the (possibly nested) input DataFrame
 * @return a DataFrame whose schema contains no ArrayType or StructType columns
 */
@scala.annotation.tailrec
def flattenDataFrame(df: DataFrame): DataFrame = {
  import org.apache.spark.sql.functions.col

  val fields     = df.schema.fields
  val fieldNames = fields.map(_.name)

  // Transform only the FIRST complex column; the tail-recursive call handles
  // the rest. Using collectFirst replaces the original nonlocal `return`
  // inside a for-comprehension lambda (deprecated in 2.13, removed in Scala 3).
  val flattenedOnce: Option[DataFrame] = fields.collectFirst {
    case StructField(name, _: ArrayType, _, _) =>
      // Keep every other column; replace the array with one row per element.
      val exprs = fieldNames.filter(_ != name) :+ s"explode_outer($name) as $name"
      df.selectExpr(exprs: _*)

    case StructField(name, structType: StructType, _, _) =>
      // Replace the struct column with its children, renamed parent_child.
      // NOTE(review): this can collide with a pre-existing column literally
      // named "parent_child" — confirm input schemas avoid that pattern.
      val childNames = structType.fieldNames.map(child => s"$name.$child")
      val newNames   = fieldNames.filter(_ != name) ++ childNames
      val renamed    = newNames.map(n => col(n).as(n.replace(".", "_")))
      df.select(renamed: _*)
  }

  flattenedOnce match {
    case Some(partiallyFlat) => flattenDataFrame(partiallyFlat) // still nested
    case None                => df                              // already flat
  }
}
Is there a way to modify this code so it keeps the parent columns as well, not just the leaf nodes? For example: get columns `a`, `b`, `a.child`, `b.child`.
Hello Fahad, this is interesting
we have a similar one on our website:
https://www.24tutorials.com/spark/flatten-json-spark-dataframe/
Hope this will be helpful.
Thanks.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@fahadsiddiqui How can I un-flatten this DataFrame back into nested JSON?