jamiekt · July 18, 2018 07:52
diff --git a/derive_products_tally_in_basket_distribution.py b/derive_products_tally_in_basket_distribution.py
 from pyspark.sql.functions import col, countDistinct, lit

 # Derive the distribution of basket sizes: for each products-per-basket
 # tally, how many baskets have that tally and what fraction of all baskets
 # that represents.
 #
 # df is a Spark DataFrame:  DataFrame[basket: string, product: string, customer: string, store: string]

 # Number of distinct baskets; used as the denominator for the fractions
 # below. collect() brings the single aggregate row back to the driver.
 baskets_tally = df.groupBy().agg(countDistinct(col('basket'))).collect()[0][0]

 # One row per basket together with its row count.
 # NOTE(review): count() counts rows, so this equals the number of distinct
 # products only if each (basket, product) pair appears once -- confirm.
 df = df.groupBy(col('basket')).count().withColumnRenamed('count', 'tally_of_products_per_basket')

 # One row per basket size together with the number of baskets of that size.
 df = df.groupBy("tally_of_products_per_basket") \
     .count() \
     .withColumnRenamed('count', 'tally_of_baskets_containing_products_tally') \
     .orderBy(col("tally_of_products_per_basket").asc())

 # Share of all baskets that have each size.
 df = df.withColumn(
     'fraction_of_baskets_containing_products_tally',
     col('tally_of_baskets_containing_products_tally') / lit(baskets_tally)
 )
 """
 To illustrate the calculated data:

 [(row[0], row[2]) for row in df \
    .orderBy(col('tally_of_products_per_basket')) \
    .limit(10) \
    .collect()]

 returns this histogram (x, y):

 [(1, 0.1324264771159618),
 (2, 0.1370917625841512),
 (3, 0.11921614825173989),
 (4, 0.09889617171282909),
 (5, 0.08101257741810314),
 (6, 0.06629952190050782),
 (7, 0.054068733410406467),
 (8, 0.044480808689380966),
 (9, 0.03708986734310147),
 (10, 0.030869985466493634)]

 where y is the probability of a basket containing x products
 """
	from pyspark.sql.functions import col, countDistinct, lit

	# Derive the distribution of basket sizes: for each products-per-basket
	# tally, how many baskets have that tally and what fraction of all baskets
	# that represents.
	#
	# df is a Spark DataFrame: DataFrame[basket: string, product: string, customer: string, store: string]

	# Number of distinct baskets; used as the denominator for the fractions
	# below. collect() brings the single aggregate row back to the driver.
	baskets_tally = df.groupBy().agg(countDistinct(col('basket'))).collect()[0][0]

	# One row per basket together with its row count.
	# NOTE(review): count() counts rows, so this equals the number of distinct
	# products only if each (basket, product) pair appears once -- confirm.
	df = df.groupBy(col('basket')).count().withColumnRenamed('count', 'tally_of_products_per_basket')

	# One row per basket size together with the number of baskets of that size.
	df = df.groupBy("tally_of_products_per_basket") \
		.count() \
		.withColumnRenamed('count', 'tally_of_baskets_containing_products_tally') \
		.orderBy(col("tally_of_products_per_basket").asc())

	# Share of all baskets that have each size.
	df = df.withColumn(
		'fraction_of_baskets_containing_products_tally',
		col('tally_of_baskets_containing_products_tally') / lit(baskets_tally)
	)
	"""
	To illustrate the calculated data:

	[(row[0], row[2]) for row in df \
	.orderBy(col('tally_of_products_per_basket')) \
	.limit(10) \
	.collect()]

	returns this histogram (x, y):

	[(1, 0.1324264771159618),
	(2, 0.1370917625841512),
	(3, 0.11921614825173989),
	(4, 0.09889617171282909),
	(5, 0.08101257741810314),
	(6, 0.06629952190050782),
	(7, 0.054068733410406467),
	(8, 0.044480808689380966),
	(9, 0.03708986734310147),
	(10, 0.030869985466493634)]

	where y is the probability of a basket containing x products
	"""