mkaranasou · October 17, 2019 17:22
diff --git a/example_null_column_returned_from_udf.py b/example_null_column_returned_from_udf.py
 from pyspark import SparkConf
 from pyspark.sql import SparkSession, functions as F, types as T


 # Build (or reuse) a Spark session called 'test' from a default SparkConf.
 conf = SparkConf()
 spark_session = (
     SparkSession.builder
     .config(conf=conf)
     .appName('test')
     .getOrCreate()
 )

 # Sample data: two rows with the columns 'a' and 'b'.
 data = [
     {'a': 1, 'b': 0},
     {'a': 10, 'b': 3},
 ]
 df = spark_session.createDataFrame(data)
 df.show()

 # Output of df.show():
 # +---+---+
 # |  a|  b|
 # +---+---+
 # |  1|  0|
 # | 10|  3|
 # +---+---+


 # define a simple function that returns a / b
 def calculate_a_b_ratio(a, b):
     """Return the ratio a / b, guarding against nulls and bad denominators.

     Args:
         a: Numerator, or None when the input value is null.
         b: Denominator, or None when the input value is null.

     Returns:
         None if either argument is None, ``a / b`` when ``b`` is greater
         than zero, and ``0.`` otherwise.
     """
     # A null column value is handed to a Python udf as None. Comparing None
     # with 0 raises TypeError on Python 3, so propagate the null instead.
     if a is None or b is None:
         return None
     if b > 0:
         return a / b
     # Zero (or negative) denominator: fall back to 0. rather than dividing.
     return 0.

 # wrap the function in a udf - pay attention to the declared return type
 udf_ratio_calculation = F.udf(
     calculate_a_b_ratio,
     returnType=T.BooleanType(),
 )


 # Add a column computed by the udf. T.BooleanType() is the wrong return type
 # for a function that returns floats, yet pyspark does not complain: the new
 # column simply comes back filled with nulls.
 df = df.withColumn(
     'a_b_ratio',
     udf_ratio_calculation('a', 'b'),
 )
 df.show()

 # +---+---+---------+
 # |  a|  b|a_b_ratio|
 # +---+---+---------+
 # |  1|  0|     null|
 # | 10|  3|     null|
 # +---+---+---------+

 # declaring the return type as T.DecimalType() gives the same null results
 udf_ratio_calculation = F.udf(
     calculate_a_b_ratio,
     returnType=T.DecimalType(),
 )
 df = df.withColumn(
     'a_b_ratio_dec',
     udf_ratio_calculation('a', 'b'),
 )
 df.show()

 # +---+---+---------+-------------+
 # |  a|  b|a_b_ratio|a_b_ratio_dec|
 # +---+---+---------+-------------+
 # |  1|  0|     null|         null|
 # | 10|  3|     null|         null|
 # +---+---+---------+-------------+

 # with the matching type, T.FloatType(), the udf returns the expected values
 udf_ratio_calculation = F.udf(
     calculate_a_b_ratio,
     returnType=T.FloatType(),
 )
 df = df.withColumn(
     'a_b_ratio_float',
     udf_ratio_calculation('a', 'b'),
 )
 df.show()

 # +---+---+---------+-------------+---------------+
 # |  a|  b|a_b_ratio|a_b_ratio_dec|a_b_ratio_float|
 # +---+---+---------+-------------+---------------+
 # |  1|  0|     null|         null|            0.0|
 # | 10|  3|     null|         null|      3.3333333|
 # +---+---+---------+-------------+---------------+
	from pyspark import SparkConf
	from pyspark.sql import SparkSession, functions as F, types as T


	# Build (or reuse) a Spark session called 'test' from a default SparkConf.
	conf = SparkConf()
	spark_session = (
	    SparkSession.builder
	    .config(conf=conf)
	    .appName('test')
	    .getOrCreate()
	)

	# Sample data: two rows with the columns 'a' and 'b'.
	data = [
	    {'a': 1, 'b': 0},
	    {'a': 10, 'b': 3},
	]
	df = spark_session.createDataFrame(data)
	df.show()

	# Output of df.show():
	# +---+---+
	# |  a|  b|
	# +---+---+
	# |  1|  0|
	# | 10|  3|
	# +---+---+


	# define a simple function that returns a / b
	def calculate_a_b_ratio(a, b):
	    """Return the ratio a / b, guarding against nulls and bad denominators.

	    Args:
	        a: Numerator, or None when the input value is null.
	        b: Denominator, or None when the input value is null.

	    Returns:
	        None if either argument is None, ``a / b`` when ``b`` is greater
	        than zero, and ``0.`` otherwise.
	    """
	    # A null column value is handed to a Python udf as None. Comparing None
	    # with 0 raises TypeError on Python 3, so propagate the null instead.
	    if a is None or b is None:
	        return None
	    if b > 0:
	        return a / b
	    # Zero (or negative) denominator: fall back to 0. rather than dividing.
	    return 0.

	# wrap the function in a udf - pay attention to the declared return type
	udf_ratio_calculation = F.udf(
	    calculate_a_b_ratio,
	    returnType=T.BooleanType(),
	)


	# Add a column computed by the udf. T.BooleanType() is the wrong return type
	# for a function that returns floats, yet pyspark does not complain: the new
	# column simply comes back filled with nulls.
	df = df.withColumn(
	    'a_b_ratio',
	    udf_ratio_calculation('a', 'b'),
	)
	df.show()

	# +---+---+---------+
	# |  a|  b|a_b_ratio|
	# +---+---+---------+
	# |  1|  0|     null|
	# | 10|  3|     null|
	# +---+---+---------+

	# declaring the return type as T.DecimalType() gives the same null results
	udf_ratio_calculation = F.udf(
	    calculate_a_b_ratio,
	    returnType=T.DecimalType(),
	)
	df = df.withColumn(
	    'a_b_ratio_dec',
	    udf_ratio_calculation('a', 'b'),
	)
	df.show()

	# +---+---+---------+-------------+
	# |  a|  b|a_b_ratio|a_b_ratio_dec|
	# +---+---+---------+-------------+
	# |  1|  0|     null|         null|
	# | 10|  3|     null|         null|
	# +---+---+---------+-------------+

	# with the matching type, T.FloatType(), the udf returns the expected values
	udf_ratio_calculation = F.udf(
	    calculate_a_b_ratio,
	    returnType=T.FloatType(),
	)
	df = df.withColumn(
	    'a_b_ratio_float',
	    udf_ratio_calculation('a', 'b'),
	)
	df.show()

	# +---+---+---------+-------------+---------------+
	# |  a|  b|a_b_ratio|a_b_ratio_dec|a_b_ratio_float|
	# +---+---+---------+-------------+---------------+
	# |  1|  0|     null|         null|            0.0|
	# | 10|  3|     null|         null|      3.3333333|
	# +---+---+---------+-------------+---------------+
No results found