Created
March 20, 2022 19:46
-
-
Save paretech/63e36af4c249d49d1573a6c057d9e2a3 to your computer and use it in GitHub Desktop.
Shuffle multiple blocks of tabular data by unique value.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %% [markdown]
# # Data Generation and Sequencing
# %%
import random

import numpy as np
import pandas as pd
# %%
# Build a small demonstration table: the full cross product of three key
# columns (3 x 3 x 4 = 36 rows) plus a constant payload column.
primary = pd.DataFrame({'primary': ['A', 'B', 'C']})
secondary = pd.DataFrame({'secondary': [1, 2, 3]})
other = pd.DataFrame({'other': [1, 2, 3, 4]})
# `primary` is already a DataFrame, so it can be merged directly.
df = primary.merge(secondary, how='cross').merge(other, how='cross')
df['more_data'] = 0
# %%
df
# %%
def shuffle_by_unique_values(df, key):
    """Return ``df`` with the distinct values of ``key`` in random order.

    Rows that share a value of ``key`` stay together as one block; only the
    order of the blocks is randomized.

    Args:
        df: DataFrame containing a column named ``key``.
        key: Name of the column whose distinct values define the blocks.

    Returns:
        A new DataFrame. ``key`` becomes the first column and the original
        index is replaced by a fresh default index.

    Raises:
        KeyError: If ``key`` is not a column of ``df``.

    The permutation comes from the module-level ``random`` generator, so
    ``random.seed`` makes the result reproducible.
    """
    # Shuffle a plain list rather than the array returned by ``unique()``:
    # ``random.shuffle`` swaps items in place, and a list supports that
    # regardless of which array type pandas hands back.
    order = list(df[key].unique())
    random.shuffle(order)
    # Selecting the shuffled labels from a (non-unique) index pulls out every
    # row of each label in turn, which is what keeps each block contiguous.
    return df.set_index(key).loc[order].reset_index()
def shuffle_blocks(df, columns):
    """Shuffle nested blocks of rows, one nesting level per entry of ``columns``.

    ``columns`` is ordered from outermost to innermost key. The distinct
    values of the last column are shuffled within each combination of the
    columns before it, then the next-to-last column is shuffled within the
    columns before that, and so on; finally the blocks of the first column
    are shuffled across the whole frame.

    Args:
        df: DataFrame containing every column named in ``columns``.
        columns: Column names, outermost first. The argument is not modified.

    Returns:
        A new DataFrame with a fresh default index. Each shuffle pass moves
        its key column to the front, so the column order can differ from
        ``df``. With an empty ``columns`` there is nothing to shuffle and the
        rows are returned in their existing order.

    Note:
        Inner passes split the frame with ``groupby`` using its default
        settings, so rows with missing values in the outer key columns are
        not carried over.
    """
    # Work on a copy: the loop consumes the list, and the caller's list must
    # be left intact.
    remaining = list(columns)
    if not remaining:
        return df.reset_index(drop=True)

    while len(remaining) > 1:
        shuffle_key = remaining.pop()
        # Iterate the groups explicitly instead of ``groupby.apply``: each
        # group then always includes its grouping columns, which the shuffle
        # helper and the later passes still need.
        pieces = [
            shuffle_by_unique_values(group, shuffle_key)
            for _, group in df.groupby(remaining)
        ]
        # ``pd.concat`` rejects an empty list, so fall back to an empty slice.
        df = pd.concat(pieces, ignore_index=True) if pieces else df.iloc[:0]

    # Outermost level: shuffle by the first requested column, whatever its name.
    return shuffle_by_unique_values(df.reset_index(drop=True), remaining[0])
# %%
# Shuffle the demo table hierarchically: 'primary' blocks first, 'secondary'
# blocks within each of those, and 'other' values within each of those.
shuffle_blocks(df, ['primary', 'secondary', 'other'])
# %%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment