from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, expr, from_unixtime
import time

# Initialize Spark session
spark = SparkSession.builder.appName("RandomDataFrame").getOrCreate()

# Number of rows
n_rows = 1_000_000

# Create DataFrame with random data
df = spark.range(n_rows) \
    .withColumn("random_string", expr("substring(md5(cast(rand() as string)), 1, 10)")) \
    .withColumn("random_int", (rand() * 1000).cast("int")) \
    .withColumn("random_bool", (rand() > 0.5)) \
    .withColumn("random_datetime", from_unixtime(expr("cast(rand() * (unix_timestamp('2025-12-31') - unix_timestamp('2000-01-01')) + unix_timestamp('2000-01-01') as bigint)"))) \
    .drop("id")

# Benchmark different methods to check if DataFrame is empty
def check_empty_count(df):
    return df.count() == 0

def check_empty_head(df):
    return df.head(1) == []

def check_empty_take(df):
    return len(df.take(1)) == 0

def check_empty_rdd(df):
    return df.rdd.isEmpty()

methods = {
    'count': check_empty_count,
    'head': check_empty_head,
    'take': check_empty_take,
    'rdd_isEmpty': check_empty_rdd
}
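
# Not measured in the original runs: PySpark 3.3+ also ships DataFrame.isEmpty(),
# which fetches at most one row internally. A sketch of how it could be added to
# the comparison (assumes Spark >= 3.3; the results below do not include it):
#
# def check_empty_isempty(df):
#     return df.isEmpty()
#
# methods['isEmpty'] = check_empty_isempty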

results = {}
for name, method in methods.items():
    start_time = time.time()
    is_empty = method(df)
    elapsed_time = time.time() - start_time
    results[name] = elapsed_time
    print(f"Method '{name}' took {elapsed_time:.4f} seconds, empty: {is_empty}")

fastest_method = min(results, key=results.get)
print(f"Fastest method: '{fastest_method}' with execution time: {results[fastest_method]:.4f} seconds")
######### 0 rows #######
# 25/03/28 15:53:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
# Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
# Setting default log level to "WARN".
# To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
# Method 'count' took 1.6409 seconds, empty: True
# Method 'head' took 0.1378 seconds, empty: True
# Method 'take' took 0.0390 seconds, empty: True
# Method 'rdd_isEmpty' took 0.0468 seconds, empty: True
# Fastest method: 'take' with execution time: 0.0390 seconds
######### 1M rows ########
# Method 'count' took 2.0601 seconds, empty: False
# Method 'head' took 0.2184 seconds, empty: False
# Method 'take' took 0.0611 seconds, empty: False
# Method 'rdd_isEmpty' took 0.6267 seconds, empty: False
# Fastest method: 'take' with execution time: 0.0611 seconds
######### 10M rows #######
# Method 'count' took 2.1280 seconds, empty: False
# Method 'head' took 0.2165 seconds, empty: False
# Method 'take' took 0.0657 seconds, empty: False
# Method 'rdd_isEmpty' took 0.6414 seconds, empty: False
# Fastest method: 'take' with execution time: 0.0657 seconds
######### 100M rows #######
# Method 'count' took 2.1992 seconds, empty: False
# Method 'head' took 0.2249 seconds, empty: False
# Method 'take' took 0.0672 seconds, empty: False
# Method 'rdd_isEmpty' took 0.6123 seconds, empty: False
# Fastest method: 'take' with execution time: 0.0672 seconds