jamiekt · July 17, 2019 11:26
diff --git a/README.md b/README.md
diff --git a/derivecounts.py b/derivecounts.py
 from pyspark.sql.functions import col, countDistinct

 # Sample shopping data: one row per (basket, product) pair, with the
 # customer and store repeated on every row.
 column_names = ['basket', 'product', 'customer', 'store']
 rows = [
     ('001', 'bananas', 'John Doe', 'Stratford'),
     ('001', 'apples', 'John Doe', 'Stratford'),
     ('002', 'apples', 'Jane Doe', 'Aberdeen'),
     ('002', 'baked beans', 'Jane Doe', 'Aberdeen'),
     ('002', 'cornflakes', 'Jane Doe', 'Aberdeen'),
     ('003', 'chocolate', 'John Doe', 'Stratford'),
 ]
 data = sqlContext.createDataFrame(rows, column_names)

 # One distinct-count aggregate per column, in column order. The empty
 # groupBy() means the aggregates run over the whole frame, giving a
 # single result row (3 baskets, 5 products, 2 customers, 2 stores for
 # the sample rows above).
 distinct_counts = [countDistinct(col(name)) for name in column_names]
 data.groupBy().agg(*distinct_counts).toPandas()
	from pyspark.sql.functions import col, countDistinct

	# Build a small demo frame of basket line items; every row carries the
	# basket id, the product bought, the customer and the store.
	data = sqlContext.createDataFrame(
		[
			('001', 'bananas', 'John Doe', 'Stratford'),
			('001', 'apples', 'John Doe', 'Stratford'),
			('002', 'apples', 'Jane Doe', 'Aberdeen'),
			('002', 'baked beans', 'Jane Doe', 'Aberdeen'),
			('002', 'cornflakes', 'Jane Doe', 'Aberdeen'),
			('003', 'chocolate', 'John Doe', 'Stratford'),
		],
		schema=['basket', 'product', 'customer', 'store'],
	)

	# Count the distinct values of every column across the whole frame.
	# DataFrame.agg is the ungrouped shorthand for groupBy().agg(), and
	# data.columns yields the schema names in declaration order, so the
	# result has the same columns as listing each aggregate by hand.
	data.agg(*(countDistinct(col(c)) for c in data.columns)).toPandas()