angeligareta · June 1, 2021 22:12 · 588chm · Dec 13, 2023
diff --git a/get_dataset_partitions_pd.py b/get_dataset_partitions_pd.py
 def get_dataset_partitions_pd(df, train_split=0.8, val_split=0.1, test_split=0.1):
     """Shuffle ``df`` and split it into train, validation and test partitions.

     The rows are shuffled with a fixed seed and then cut, in order, into
     three consecutive slices: train, validation, test. Any rows left over
     after truncating the cut points end up in the test partition.

     Args:
         df: pandas DataFrame to partition.
         train_split: Fraction of rows assigned to the train partition.
         val_split: Fraction of rows assigned to the validation partition.
         test_split: Fraction of rows assigned to the test partition.

     Returns:
         A ``(train_ds, val_ds, test_ds)`` tuple of DataFrames.

     Raises:
         ValueError: If a fraction is negative or the fractions do not sum
             to 1 (within a small floating-point tolerance).
     """
     import math

     splits = (train_split, val_split, test_split)
     if any(fraction < 0 for fraction in splits):
         raise ValueError("train_split, val_split and test_split must be non-negative")
     # Compare with a tolerance: sums of decimal fractions are not always
     # exactly 1.0 in binary floating point.
     if not math.isclose(sum(splits), 1.0, abs_tol=1e-9):
         raise ValueError("train_split, val_split and test_split must sum to 1")

     # Specify seed to always have the same split distribution between runs
     df_sample = df.sample(frac=1, random_state=12)

     n_rows = len(df_sample)
     # Round before truncating so float error (e.g. 0.29 * 100 ==
     # 28.999999999999996) does not shift a cut point down by one row.
     train_end = int(round(train_split * n_rows, 6))
     # The validation slice ends at the cumulative train + val fraction;
     # using the train fraction again here would leave it empty.
     val_end = int(round((train_split + val_split) * n_rows, 6))

     train_ds = df_sample.iloc[:train_end]
     val_ds = df_sample.iloc[train_end:val_end]
     test_ds = df_sample.iloc[val_end:]

     return train_ds, val_ds, test_ds
	def get_dataset_partitions_pd(df, train_split=0.8, val_split=0.1, test_split=0.1):
		"""Shuffle ``df`` and split it into train, validation and test partitions.

		The rows are shuffled with a fixed seed and then cut, in order, into
		three consecutive slices: train, validation, test. Any rows left over
		after truncating the cut points end up in the test partition.

		Args:
			df: pandas DataFrame to partition.
			train_split: Fraction of rows assigned to the train partition.
			val_split: Fraction of rows assigned to the validation partition.
			test_split: Fraction of rows assigned to the test partition.

		Returns:
			A ``(train_ds, val_ds, test_ds)`` tuple of DataFrames.

		Raises:
			ValueError: If a fraction is negative or the fractions do not sum
				to 1 (within a small floating-point tolerance).
		"""
		import math

		splits = (train_split, val_split, test_split)
		if any(fraction < 0 for fraction in splits):
			raise ValueError("train_split, val_split and test_split must be non-negative")
		# Compare with a tolerance: sums of decimal fractions are not always
		# exactly 1.0 in binary floating point.
		if not math.isclose(sum(splits), 1.0, abs_tol=1e-9):
			raise ValueError("train_split, val_split and test_split must sum to 1")

		# Specify seed to always have the same split distribution between runs
		df_sample = df.sample(frac=1, random_state=12)

		n_rows = len(df_sample)
		# Round before truncating so float error (e.g. 0.29 * 100 ==
		# 28.999999999999996) does not shift a cut point down by one row.
		train_end = int(round(train_split * n_rows, 6))
		# The validation slice ends at the cumulative train + val fraction;
		# using the train fraction again here would leave it empty.
		val_end = int(round((train_split + val_split) * n_rows, 6))

		train_ds = df_sample.iloc[:train_end]
		val_ds = df_sample.iloc[train_end:val_end]
		test_ds = df_sample.iloc[val_end:]

		return train_ds, val_ds, test_ds