A proof-of-concept for the flexible schema approach described in the draft "Parquet on FHIR" specification
1.Condition.json — a Condition resource whose subject is a literal reference, carrying clinicalStatus and onsetDateTime:
{
  "resourceType": "Condition",
  "id": "1",
  "clinicalStatus": {
    "coding": [
      {
        "system": "http://terminology.hl7.org/CodeSystem/condition-clinical",
        "code": "active"
      }
    ]
  },
  "subject": {
    "reference": "Patient/13866099-28d7-1249-3e44-ecb490d40fef"
  },
  "onsetDateTime": "1998-11-08T01:26:08+10:00"
}
2.Condition.json — a Condition resource whose subject is identified by an identifier rather than a literal reference, carrying verificationStatus and recordedDate instead of clinicalStatus and onsetDateTime:
{
  "resourceType": "Condition",
  "id": "2",
  "verificationStatus": {
    "coding": [
      {
        "system": "http://terminology.hl7.org/CodeSystem/condition-ver-status",
        "code": "confirmed"
      }
    ]
  },
  "subject": {
    "identifier": {
      "system": "http://example.org",
      "value": "123456"
    },
    "display": "Mr. Test Patient"
  },
  "recordedDate": "1998-11-08T01:26:08+10:00"
}
The merge script: it loads both Delta tables, prints their differing schemas, then merges the second into the first with Delta schema auto-merge enabled so the target schema widens to the union of the two:
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

# Delta Lake session with schema auto-merge enabled, so that MERGE can add
# columns that exist in the source table but not in the target.
spark = (
    SparkSession.builder.config(
        "spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0"
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config("spark.databricks.delta.schema.autoMerge.enabled", "true")
    .getOrCreate()
)

# Load the two Delta tables and print their (differing) schemas.
df1 = DeltaTable.forPath(spark, "data/delta/1.Condition.parquet")
df1.toDF().printSchema()
df2 = DeltaTable.forPath(spark, "data/delta/2.Condition.parquet")
df2.toDF().printSchema()

# Merge the second table into the first, matching on resource id. The two
# resources have different ids, so no rows are updated; the point is that
# auto-merge still evolves the target schema to the union of the two.
df1.alias("old").merge(
    df2.toDF().alias("new"), "new.id = old.id"
).whenMatchedUpdateAll().execute()

# Re-read the target table to show the merged schema.
df3 = DeltaTable.forPath(spark, "data/delta/1.Condition.parquet")
df3.toDF().printSchema()
Output of the merge script — the schemas of the two input tables, followed by the merged schema of the target table:
root
 |-- clinicalStatus: struct (nullable = true)
 |    |-- coding: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- system: string (nullable = true)
 |-- id: string (nullable = true)
 |-- onsetDateTime: string (nullable = true)
 |-- resourceType: string (nullable = true)
 |-- subject: struct (nullable = true)
 |    |-- reference: string (nullable = true)

root
 |-- id: string (nullable = true)
 |-- recordedDate: string (nullable = true)
 |-- resourceType: string (nullable = true)
 |-- subject: struct (nullable = true)
 |    |-- display: string (nullable = true)
 |    |-- identifier: struct (nullable = true)
 |    |    |-- system: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- verificationStatus: struct (nullable = true)
 |    |-- coding: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- system: string (nullable = true)

24/07/09 11:45:33 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

root
 |-- clinicalStatus: struct (nullable = true)
 |    |-- coding: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- system: string (nullable = true)
 |-- id: string (nullable = true)
 |-- onsetDateTime: string (nullable = true)
 |-- resourceType: string (nullable = true)
 |-- subject: struct (nullable = true)
 |    |-- reference: string (nullable = true)
 |    |-- display: string (nullable = true)
 |    |-- identifier: struct (nullable = true)
 |    |    |-- system: string (nullable = true)
 |    |    |-- value: string (nullable = true)
 |-- recordedDate: string (nullable = true)
 |-- verificationStatus: struct (nullable = true)
 |    |-- coding: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- code: string (nullable = true)
 |    |    |    |-- system: string (nullable = true)
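For illustration (not part of the original gist), a minimal sketch of how the merged table could then be queried, assuming the same session and paths as above — both styles of subject become addressable through one schema, and columns absent from a given resource simply come back as NULL:

# Read the merged Delta table and select fields from both "shapes" of subject.
merged = spark.read.format("delta").load("data/delta/1.Condition.parquet")
merged.select(
    "id",
    "subject.reference",         # populated for literal references
    "subject.identifier.value",  # populated for identifier-based subjects
    "subject.display",
).show(truncate=False)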
A script demonstrating how a consumer can select columns that may or may not be present in a given Parquet file, substituting NULL for any that are missing:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()

df1 = spark.read.parquet("data/parquet/1.Condition.parquet")
df1.printSchema()


def safe_column(df, col_name):
    # Return the column if the DataFrame has it, otherwise a NULL literal
    # under the same name, so that selects never fail on missing columns.
    if col_name in df.schema.names:
        return df[col_name]
    else:
        return lit(None).alias(col_name)


result = df1.select(
    safe_column(df1, "id"), safe_column(df1, "foo"), safe_column(df1, "onsetDateTime")
)
result.show(truncate=False)
Output — the missing foo column comes back as NULL:
+---+----+-------------------------+
|id |foo |onsetDateTime            |
+---+----+-------------------------+
|1  |NULL|1998-11-08T01:26:08+10:00|
+---+----+-------------------------+
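The helper above only checks top-level column names. As a hedged extension (safe_nested_column is illustrative, not part of the gist), the same idea can be applied to dotted paths by walking the schema one segment at a time:

from pyspark.sql import Column, DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.types import StructType


def safe_nested_column(df: DataFrame, path: str) -> Column:
    # Walk the schema one path segment at a time; if any segment is
    # missing (or its parent is not a struct), fall back to NULL.
    schema = df.schema
    for segment in path.split("."):
        if not isinstance(schema, StructType) or segment not in schema.names:
            return lit(None).alias(path.replace(".", "_"))
        schema = schema[segment].dataType
    return df[path].alias(path.replace(".", "_"))


# e.g. safe_nested_column(df1, "subject.reference") resolves normally, while
# safe_nested_column(df1, "subject.identifier.value") yields NULL when the
# file was written without an identifier-based subject.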
The ingest script that produced the Delta tables used by the merge script above — it reads each FHIR JSON resource with Spark's schema inference and writes it out as a Delta table:
from pyspark.sql import SparkSession

# Delta Lake session (no schema auto-merge needed here; each resource is
# written to its own table).
spark = (
    SparkSession.builder.config(
        "spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0"
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

# Read each JSON resource, letting Spark infer a schema containing only the
# elements actually present, then write it out as a Delta table.
df1 = spark.read.json("data/json/1.Condition.json", multiLine=True)
df1.printSchema()
df1.write.format("delta").mode("overwrite").save("data/delta/1.Condition.parquet")

df2 = spark.read.json("data/json/2.Condition.json", multiLine=True)
df2.printSchema()
df2.write.format("delta").mode("overwrite").save("data/delta/2.Condition.parquet")
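As an aside, Delta's standard mergeSchema write option offers an alternative to the MERGE-based approach above. A sketch only (the data/delta/Condition path is hypothetical and not used elsewhere in this gist): appending resources with differing schemas into a single table widens its schema to the union in much the same way.

# Hypothetical single-table layout: append both resources into one Delta
# table, letting the mergeSchema option evolve its schema on each write.
for source in ["data/json/1.Condition.json", "data/json/2.Condition.json"]:
    df = spark.read.json(source, multiLine=True)
    (
        df.write.format("delta")
        .option("mergeSchema", "true")
        .mode("append")
        .save("data/delta/Condition")
    )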