Last active
October 2, 2018 12:18
-
-
Save bmc/5aeab84166596f8b6afb0e2fefc8eb88 to your computer and use it in GitHub Desktop.
Patch a printSchemaAsCode() method into Spark DataFrame, in Python and Scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.types import * | |
from pyspark.sql import DataFrame | |
def schema_as_code(schema, indentation=4):
    """
    Render a Spark schema as Python source code that reconstructs it.

    :param schema: the ``StructType`` to render, e.g. ``df.schema``.
    :param indentation: number of spaces added per nesting level.
    :return: a string holding a ``StructType([...])`` expression with one
             ``StructField(...)`` entry per line.
    """
    def quote_name(name):
        # Escape backslashes and double quotes so the emitted "..." literal
        # stays valid Python even for unusual column names.
        return name.replace('\\', '\\\\').replace('"', '\\"')

    def type_as_code(data_type):
        '''
        Convert a non-struct data type to a constructor expression.
        '''
        text = str(data_type)
        # Depending on the PySpark version, str(StringType()) is either
        # "StringType" or "StringType()". Add the call parentheses only when
        # they are missing, so the result is never "StringType()()" and
        # already-parenthesized types such as DecimalType(10,2) are kept as is.
        return text if text.endswith(')') else text + '()'

    def str_field(field, indentationLevel):
        '''
        Convert a field to string representation.
        '''
        field_indent_spaces = " " * ((indentationLevel + 1) * indentation)
        if isinstance(field.dataType, StructType):
            # Nested structs are rendered recursively one level deeper; the
            # returned text starts with that level's own indentation.
            field_type = pretty_struct(field.dataType, indentationLevel + 1)
        else:
            field_type = type_as_code(field.dataType)
        return '{0}StructField("{1}", {2}, {3})'.format(
            field_indent_spaces, quote_name(field.name), field_type, field.nullable
        )

    def pretty_struct(st, indentationLevel):
        '''
        Convert a struct to a multi-line StructType([...]) expression.
        '''
        indent_spaces = " " * (indentationLevel * indentation)
        field_strings = [str_field(field, indentationLevel) for field in st.fields]
        return '{0}StructType([\n{1}\n{0}])'.format(
            indent_spaces, ',\n'.join(field_strings)
        )

    return pretty_struct(schema, 0)
def print_schema_as_code(self):
    """
    Print this DataFrame's schema as Python code.

    The text is produced by schema_as_code() using its default indentation.
    """
    code = schema_as_code(self.schema)
    print(code)


# Attach the printer to DataFrame so it can be called as df.printSchemaAsCode().
setattr(DataFrame, "printSchemaAsCode", print_schema_as_code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.spark.sql.types._ | |
import org.apache.spark.sql._ | |
/** Renders a Spark schema as Scala source code and exposes that rendering on
  * `DataFrame` through an implicit class.
  */
object Implicits {

  /** Renders `schema` as a Scala expression that rebuilds it.
    *
    * @param schema      the struct to render
    * @param indentation number of spaces added per nesting level
    * @return a `StructType(List(...))` expression with one `StructField` per line
    */
  def schemaAsCode(schema: StructType, indentation: Int = 2): String = {
    def render(struct: StructType, level: Int): String = {
      val outerPad = " " * (level * indentation)
      val innerPad = " " * ((level + 1) * indentation)

      val lines = struct.fields.map { f =>
        val typeCode = f.dataType match {
          // Nested structs recurse one level deeper; the returned text starts
          // with that level's own indentation.
          case nested: StructType => render(nested, level + 1)
          case other              => s"$other"
        }
        innerPad + "StructField(\"" + f.name + "\", " + typeCode + ", " + f.nullable + ")"
      }

      outerPad + "StructType(List(\n" + lines.mkString(",\n") + "\n" + outerPad + "))"
    }

    render(schema, 0)
  }

  /** Adds `printSchemaAsCode()` to any `DataFrame` once `Implicits._` is imported. */
  implicit class EnrichedDataFrame(val df: DataFrame) {

    /** Prints the wrapped DataFrame's schema as Scala code to standard output. */
    def printSchemaAsCode(): Unit = println(schemaAsCode(df.schema))
  }
}
import Implicits._ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment