Created
August 23, 2021 13:52
-
-
Save zhuchangzhan/5ba9c3caa5fb6180056cd93b2393cea2 to your computer and use it in GitHub Desktop.
TPCX Blog Spark Code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pyspark | |
import pyspark.sql.functions as f | |
from pyspark.sql import SparkSession | |
from pyspark.storagelevel import StorageLevel | |
def tpcx_bb_q26(): | |
spark = SparkSession.builder.appName('ipython').getOrCreate() | |
store_sales = spark.read.parquet('s3://...') | |
store_sales.createOrReplaceTempView('store_sales') | |
item = spark.read.parquet('s3://...') | |
item.createOrReplaceTempView('item') | |
| |
result = spark.sql( | |
""" | |
SELECT | |
ss.ss_customer_sk AS cid, | |
CAST( count(CASE WHEN i.i_class_id=1 THEN 1 ELSE NULL END) AS DOUBLE ) AS id1, | |
CAST( count(CASE WHEN i.i_class_id=2 THEN 1 ELSE NULL END) AS DOUBLE ) AS id2, | |
... | |
CAST( count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS DOUBLE ) AS id15 | |
FROM store_sales ss | |
INNER JOIN item i | |
ON | |
( | |
ss.ss_item_sk = i.i_item_sk | |
AND i.i_category IN ('Books') | |
AND ss.ss_customer_sk IS NOT NULL | |
) | |
GROUP BY ss.ss_customer_sk | |
HAVING count(ss.ss_item_sk) > 5 | |
ORDER BY cid | |
""" | |
) | |
customer_i_class = result.toPandas().values.astype(np.float64).sum() | |
return customer_i_class |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment