# Imports and session setup needed for this snippet to run on its own
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import sum as sum_  # avoid shadowing Python's built-in sum

spark = SparkSession.builder.getOrCreate()

# Read the source table in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
Equivalent SQL:

-- Create a temporary table with a few renamings
CREATE TABLE temp_1 AS
SELECT seller_id AS the_seller,
       num_pieces_sold AS pieces,
       product_id
FROM sales_table;

-- Aggregate on the new table
CREATE TABLE temp_2 AS
SELECT product_id,
       SUM(pieces) AS total_pieces
FROM temp_1
GROUP BY product_id;

-- Add a further column
SELECT a.*,
       1 AS fake_column
FROM temp_2 a;
'''
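
# The SQL above could also be run directly by registering the DataFrame as a
# temporary view and collapsing the three steps into one statement. A minimal
# sketch; the view name "sales" is an assumption for illustration:
sales_table.createOrReplaceTempView("sales")
sql_result = spark.sql("""
    SELECT product_id,
           SUM(num_pieces_sold) AS total_pieces,
           1 AS fake_column
    FROM sales
    GROUP BY product_id
""")

# The same pipeline expressed with the DataFrame API: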
sales_table_execution_plan = (
    sales_table
    .withColumnRenamed("seller_id", "the_seller")
    .withColumnRenamed("num_pieces_sold", "pieces")
    .groupBy(col("product_id"))
    .agg(sum_("pieces").alias("total_pieces"))
    .withColumn("fake_column", lit(1))
)
# Print the schema of the result; this resolves the schema from the plan
# without triggering execution
sales_table_execution_plan.printSchema()
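
# The variable name hints at the point of the snippet: the chain above only
# builds a lazy execution plan, it does not run anything yet. explain() prints
# the plan Spark would execute (a sketch of standard DataFrame introspection;
# the exact output varies with the Spark version):
sales_table_execution_plan.explain()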