# Imports and session setup needed for this snippet to run on its own
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import sum as sum_  # avoid shadowing Python's built-in sum

spark = SparkSession.builder.getOrCreate()

# Read the source table in Parquet format
sales_table = spark.read.parquet("./data/sales_parquet")
'''
Equivalent SQL:

-- Create a temporary table with a few renamings
CREATE TABLE temp_1 AS
SELECT seller_id AS the_seller,
       num_pieces_sold AS pieces,
       product_id
FROM sales_table;

-- Aggregate on the new table
CREATE TABLE temp_2 AS
SELECT product_id,
       SUM(pieces) AS total_pieces
FROM temp_1
GROUP BY product_id;

-- Add a further column
SELECT a.*,
       1 AS fake_column
FROM temp_2 a;
'''
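
# The SQL above could also be run directly by registering the DataFrame as a
# temporary view and collapsing the three steps into one statement. A minimal
# sketch; the view name "sales" is an assumption for illustration:
sales_table.createOrReplaceTempView("sales")
sql_result = spark.sql("""
    SELECT product_id,
           SUM(num_pieces_sold) AS total_pieces,
           1 AS fake_column
    FROM sales
    GROUP BY product_id
""")

# The same pipeline expressed with the DataFrame API: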
sales_table_execution_plan = (
    sales_table
    .withColumnRenamed("seller_id", "the_seller")
    .withColumnRenamed("num_pieces_sold", "pieces")
    .groupBy(col("product_id"))
    .agg(sum_("pieces").alias("total_pieces"))
    .withColumn("fake_column", lit(1))
)
# Print the schema of the result; this resolves the schema from the plan
# without triggering execution
sales_table_execution_plan.printSchema()
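
# The variable name hints at the point of the snippet: the chain above only
# builds a lazy execution plan, it does not run anything yet. explain() prints
# the plan Spark would execute (a sketch of standard DataFrame introspection;
# the exact output varies with the Spark version):
sales_table_execution_plan.explain()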