Last active
July 10, 2023 03:45
-
-
Save morganmcg1/15a9de711b9c5e8e1bd142b4be80252d to your computer and use it in GitHub Desktop.
PySpark - Normalize (Standardize) train and test dataframes: [ (x - mean) / std_dev ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on the solution here: https://stackoverflow.com/questions/44580644/subtract-mean-from-pyspark-dataframe
# Function to normalise (standardise) PySpark dataframes
def standardize_train_test_data(train_df, test_df, columns):
    """
    Add standardised copies of the selected columns to both dataframes.

    formula = [(X - mean) / std_dev]

    The mean and standard deviation of every column are aggregated from
    train_df only; the same values are then used to scale both train_df
    and test_df.

    Inputs  : train_df - training dataframe the statistics are computed from
              test_df  - test dataframe, scaled with the training statistics
              columns  - list of column name strings to be normalised
    Returns : tuple (train_df, test_df, averages, std_devs)
              train_df, test_df - the dataframes with one extra
                                  '<column>_norm' column per entry in columns
              averages - first row of the collected mean aggregation,
                         indexed by '<column>'
              std_devs - first row of the collected stddev aggregation,
                         indexed by '<column>_stddev'
    """
    # Build one aggregate expression per column for each statistic.
    mean_exprs = [mean(train_df[name]).alias(name) for name in columns]
    stddev_exprs = [
        stddev(train_df[name]).alias(name + '_stddev') for name in columns
    ]

    # Run both aggregations on the training data and keep the first row
    # of each collected result.
    averages = train_df.agg(*mean_exprs).collect()[0]
    std_devs = train_df.agg(*stddev_exprs).collect()[0]

    for name in columns:
        centre = averages[name]
        spread = std_devs[name + '_stddev']
        norm_name = name + '_norm'

        # Training data, scaled by its own statistics.
        train_df = train_df.withColumn(
            norm_name, (train_df[name] - centre) / spread
        )
        # Test data deliberately reuses the training mean and std_dev.
        test_df = test_df.withColumn(
            norm_name, (test_df[name] - centre) / spread
        )

    return train_df, test_df, averages, std_devs
# Original StackExchange function:
# def normalize(df, columns):
#     aggExpr = []
#     for column in columns:
#         aggExpr.append(mean(df[column]).alias(column))
#     averages = df.agg(*aggExpr).collect()[0]
#     selectExpr = []
#     for column in columns:
#         selectExpr.append(df[column] - averages[column])
#     return df.select(selectExpr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
+1