ipeirotis · November 8, 2021 14:23
diff --git a/spark-cube-example.py b/spark-cube-example.py
 # We use the "CUBE" operator, to calculate the frequency of any 
 # attribute-value combination.
 #
 # Notice that we replace the NULL values with "N/A" before the CUBE operator. 
 # This is to avoid confusion with the way that CUBE uses NULL values to indicate 
 # "any value" for attribute combinations that do not use the available attributes 
 # of the cube. 

 df = (
  dataset
  .fillna("N/A")
  .cube(col("email"), col("phone"), col("mobile"), col("contact_name"), col("license_num"))
  .count()
  .orderBy("count", ascending=False)
 )

 # Calculates the number of attributes in the itemset 
 # (effectively number of attributes in the GROUP BY that are not NULL
 df = df.withColumn("itemset_size", num_attributes - sum(df[c].isNull().cast('int') for c in df.columns))

 # Calculates the number of attributes in the itemset that are equal to "N/A"
 df = df.withColumn("missing_values", sum(coalesce(df[c]=='N/A', lit(False)).cast('int') for c in df.columns))

 # The "itemsets" view contains the frequency of every "itemset" in the dataset. 
 df.createOrReplaceTempView("itemsets")
	# We use the "CUBE" operator, to calculate the frequency of any
	# attribute-value combination.
	#
	# Notice that we replace the NULL values with "N/A" before the CUBE operator.
	# This is to avoid confusion with the way that CUBE uses NULL values to indicate
	# "any value" for attribute combinations that do not use the available attributes
	# of the cube.

	df = (
	dataset
	.fillna("N/A")
	.cube(col("email"), col("phone"), col("mobile"), col("contact_name"), col("license_num"))
	.count()
	.orderBy("count", ascending=False)
	)

	# Calculates the number of attributes in the itemset
	# (effectively number of attributes in the GROUP BY that are not NULL
	df = df.withColumn("itemset_size", num_attributes - sum(df[c].isNull().cast('int') for c in df.columns))

	# Calculates the number of attributes in the itemset that are equal to "N/A"
	df = df.withColumn("missing_values", sum(coalesce(df[c]=='N/A', lit(False)).cast('int') for c in df.columns))

	# The "itemsets" view contains the frequency of every "itemset" in the dataset.
	df.createOrReplaceTempView("itemsets")