from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, expr, from_unixtime
import time

# Initialize Spark session
spark = SparkSession.builder.appName("RandomDataFrame").getOrCreate()

# Number of rows
n_rows = 1_000_000

# Create DataFrame with random data
df = spark.range(n_rows) \
    .withColumn("random_string", expr("substring(md5(cast(rand() as string)), 1, 10)")) \
    .withColumn("random_int", (rand() * 1000).cast("int")) \
    .withColumn("random_bool", (rand() > 0.5)) \
    .withColumn("random_datetime", from_unixtime(expr("cast(rand() * (unix_timestamp('2025-12-31') - unix_timestamp('2000-01-01')) + unix_timestamp('2000-01-01') as bigint)"))) \
    .drop("id")

# Benchmark different methods to check if DataFrame is empty
def check_empty_count(df):
    return df.count() == 0

def check_empty_head(df):
    return df.head(1) == []

def check_empty_take(df):
    return len(df.take(1)) == 0

def check_empty_rdd(df):
    return df.rdd.isEmpty()

methods = {
    'count': check_empty_count,
    'head': check_empty_head,
    'take': check_empty_take,
    'rdd_isEmpty': check_empty_rdd
}
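
# Not measured in the original runs: PySpark 3.3+ also ships DataFrame.isEmpty(),
# which fetches at most one row internally. A sketch of how it could be added to
# the comparison (assumes Spark >= 3.3; the results below do not include it):
#
# def check_empty_isempty(df):
#     return df.isEmpty()
#
# methods['isEmpty'] = check_empty_isempty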

results = {}
for name, method in methods.items():
    start_time = time.time()
    is_empty = method(df)
    elapsed_time = time.time() - start_time
    results[name] = elapsed_time
    print(f"Method '{name}' took {elapsed_time:.4f} seconds, empty: {is_empty}")

fastest_method = min(results, key=results.get)
print(f"Fastest method: '{fastest_method}' with execution time: {results[fastest_method]:.4f} seconds")
######### 0 rows #######
# 25/03/28 15:53:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
# Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
# Setting default log level to "WARN".
# To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
# Method 'count' took 1.6409 seconds, empty: True
# Method 'head' took 0.1378 seconds, empty: True
# Method 'take' took 0.0390 seconds, empty: True
# Method 'rdd_isEmpty' took 0.0468 seconds, empty: True
# Fastest method: 'take' with execution time: 0.0390 seconds
######### 1M rows ########
# Method 'count' took 2.0601 seconds, empty: False
# Method 'head' took 0.2184 seconds, empty: False
# Method 'take' took 0.0611 seconds, empty: False
# Method 'rdd_isEmpty' took 0.6267 seconds, empty: False
# Fastest method: 'take' with execution time: 0.0611 seconds
######### 10M rows #######
# Method 'count' took 2.1280 seconds, empty: False
# Method 'head' took 0.2165 seconds, empty: False
# Method 'take' took 0.0657 seconds, empty: False
# Method 'rdd_isEmpty' took 0.6414 seconds, empty: False
# Fastest method: 'take' with execution time: 0.0657 seconds
######### 100M rows #######
# Method 'count' took 2.1992 seconds, empty: False
# Method 'head' took 0.2249 seconds, empty: False
# Method 'take' took 0.0672 seconds, empty: False
# Method 'rdd_isEmpty' took 0.6123 seconds, empty: False
# Fastest method: 'take' with execution time: 0.0672 seconds