Last active
March 28, 2025 15:53
-
-
Save Priyansh121096/471a52d6626763d944351917bccce3b5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand, expr, from_unixtime
import time

# Spark session for the benchmark.
spark = SparkSession.builder.appName("RandomDataFrame").getOrCreate()

# Size of the synthetic DataFrame.
n_rows = 1_000_000

# Build a DataFrame of mixed-type random columns, one column at a time:
# a 10-char hex string, an int in [0, 1000), a boolean, and a datetime
# uniformly drawn between 2000-01-01 and 2025-12-31.
base = spark.range(n_rows)
base = base.withColumn("random_string", expr("substring(md5(cast(rand() as string)), 1, 10)"))
base = base.withColumn("random_int", (rand() * 1000).cast("int"))
base = base.withColumn("random_bool", (rand() > 0.5))
base = base.withColumn("random_datetime", from_unixtime(expr("cast(rand() * (unix_timestamp('2025-12-31') - unix_timestamp('2000-01-01')) + unix_timestamp('2000-01-01') as bigint)")))
# The helper `id` column from range() is not part of the benchmark data.
df = base.drop("id")
# Benchmark different methods to check if DataFrame is empty | |
def check_empty_count(df):
    """Return True if *df* has no rows, via a full count().

    count() scans every partition, so this is the most expensive way
    to answer the emptiness question — included for comparison.
    """
    return not df.count()
def check_empty_head(df):
    """Return True if *df* has no rows, by asking head(1) for a single row."""
    first_rows = df.head(1)
    return len(first_rows) == 0
def check_empty_take(df):
    """Return True if *df* has no rows, by asking take(1) for a single row."""
    return not df.take(1)
def check_empty_rdd(df):
    """Return True if *df* has no rows, by delegating to RDD.isEmpty().

    Accessing .rdd forces a DataFrame-to-RDD conversion, which adds
    overhead compared to the DataFrame-native checks.
    """
    underlying_rdd = df.rdd
    return underlying_rdd.isEmpty()
# Candidate emptiness checks, keyed by a short label used in the report.
methods = {
    'count': check_empty_count,
    'head': check_empty_head,
    'take': check_empty_take,
    'rdd_isEmpty': check_empty_rdd
}

# Benchmark each method once against `df` and record its elapsed time.
results = {}
for name, method in methods.items():
    # time.perf_counter() is a monotonic, high-resolution clock intended
    # for interval measurement; time.time() is wall-clock time and can
    # jump (e.g. on NTP adjustment), which would corrupt the timings.
    start_time = time.perf_counter()
    is_empty = method(df)
    elapsed_time = time.perf_counter() - start_time
    results[name] = elapsed_time
    print(f"Method '{name}' took {elapsed_time:.4f} seconds, empty: {is_empty}")

# Report the method with the smallest elapsed time.
fastest_method = min(results, key=results.get)
print(f"Fastest method: '{fastest_method}' with execution time: {results[fastest_method]:.4f} seconds")
######### 0 rows ####### | |
# 25/03/28 15:53:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable | |
# Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties | |
# Setting default log level to "WARN". | |
# To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). | |
# Method 'count' took 1.6409 seconds, empty: True | |
# Method 'head' took 0.1378 seconds, empty: True | |
# Method 'take' took 0.0390 seconds, empty: True | |
# Method 'rdd_isEmpty' took 0.0468 seconds, empty: True | |
# Fastest method: 'take' with execution time: 0.0390 seconds | |
######### 1M rows ######## | |
# 25/03/28 15:39:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable | |
# Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties | |
# Setting default log level to "WARN". | |
# To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). | |
# Method 'count' took 2.0601 seconds, empty: False | |
# Method 'head' took 0.2184 seconds, empty: False | |
# Method 'take' took 0.0611 seconds, empty: False | |
# Method 'rdd_isEmpty' took 0.6267 seconds, empty: False | |
# Fastest method: 'take' with execution time: 0.0611 seconds | |
######### 10M rows ####### | |
# 25/03/28 15:46:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable | |
# Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties | |
# Setting default log level to "WARN". | |
# To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). | |
# Method 'count' took 2.1280 seconds, empty: False | |
# Method 'head' took 0.2165 seconds, empty: False | |
# Method 'take' took 0.0657 seconds, empty: False | |
# Method 'rdd_isEmpty' took 0.6414 seconds, empty: False | |
# Fastest method: 'take' with execution time: 0.0657 seconds | |
######### 100M rows ####### | |
# 25/03/28 15:48:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable | |
# Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties | |
# Setting default log level to "WARN". | |
# To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). | |
# Method 'count' took 2.1992 seconds, empty: False | |
# Method 'head' took 0.2249 seconds, empty: False | |
# Method 'take' took 0.0672 seconds, empty: False | |
# Method 'rdd_isEmpty' took 0.6123 seconds, empty: False | |
# Fastest method: 'take' with execution time: 0.0672 seconds |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment