mattf · March 24, 2021 23:04
diff --git a/data.py b/data.py
 from pyspark.sql.types import *
 from pyspark.sql.functions import mean

 # Build the input fixture: a header-less, two-column CSV. The literal opens
 # with a newline, and the `c` record keeps one space after its comma -- the
 # text is written out exactly as spelled here.
 csv_text = """
 a,
 b,1
 c, 
 d,1
 """
 with open("data.csv", "w") as out:
     out.write(csv_text)

 # Explicit schema for the file: "a" is a string, "b" is a nullable double.
 fields = [
     StructField("a", StringType()),
     StructField("b", DoubleType(), True),
 ]
 schema = StructType(fields)

 reader = spark.read.format("csv").schema(schema)
 df = reader.load("data.csv")

 # Some records have no value for "b"; drop the rows that contain nulls.
 df = df.na.drop()

 # Average of column "b" over the remaining rows. The plan and the result
 # recorded below are the output captured when this script was run.
 mean_b = df.select(mean(df["b"]))

 mean_b.explain()
 #== Physical Plan ==
 #*(2) HashAggregate(keys=[], functions=[avg(b#1)])
 #+- Exchange SinglePartition, true, [id=#142]
 #   +- *(1) HashAggregate(keys=[], functions=[partial_avg(b#1)])
 #      +- GpuColumnarToRow false
 #         +- GpuProject [b#1]
 #            +- GpuCoalesceBatches TargetSize(2147483647)
 #               +- GpuFilter GpuAtLeastNNulls(n, a#0,b#1)
 #                  +- GpuFileGpuScan csv [a#0,b#1] Batched: true, DataFilters: [AtLeastNNulls(n, a#0,b#1)], Format: CSV, Location: InMemoryFileIndex[file:/tmp/spark-3808f7c3-eb64-4ecc-b699-1ae8f0266d04/userFiles-29db7dd7-d72d-47..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<a:string,b:double>

 mean_b.show()
 #+------------------+
 #|            avg(b)|
 #+------------------+
 #|0.6666666666666666|
 #+------------------+
	from pyspark.sql.types import *
	from pyspark.sql.functions import mean

	# Write the two-column CSV fixture (a string and an optional double).
	# The `c` record has a single space after its comma; keep that space when
	# editing this literal, since it is part of the data being read back.
	with open("data.csv", "w") as fp:
	    fp.write("""
	a,
	b,1
	c, 
	d,1
	""")

	# data is a CSV file w/ two fields, a string and a double
	schema = StructType([StructField("a", StringType()), StructField("b", DoubleType(), True)])

	df = spark.read.format("csv").schema(schema).load("data.csv")

	# the data has some nulls, filter them out
	df = df.na.drop()

	# Physical plan captured when the script was run (tree nesting restored).
	df.select(mean(df['b'])).explain()
	#== Physical Plan ==
	#*(2) HashAggregate(keys=[], functions=[avg(b#1)])
	#+- Exchange SinglePartition, true, [id=#142]
	#   +- *(1) HashAggregate(keys=[], functions=[partial_avg(b#1)])
	#      +- GpuColumnarToRow false
	#         +- GpuProject [b#1]
	#            +- GpuCoalesceBatches TargetSize(2147483647)
	#               +- GpuFilter GpuAtLeastNNulls(n, a#0,b#1)
	#                  +- GpuFileGpuScan csv [a#0,b#1] Batched: true, DataFilters: [AtLeastNNulls(n, a#0,b#1)], Format: CSV, Location: InMemoryFileIndex[file:/tmp/spark-3808f7c3-eb64-4ecc-b699-1ae8f0266d04/userFiles-29db7dd7-d72d-47..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<a:string,b:double>

	# Result captured from the same run.
	df.select(mean(df['b'])).show()
	#+------------------+
	#|            avg(b)|
	#+------------------+
	#|0.6666666666666666|
	#+------------------+