Skip to content

Instantly share code, notes, and snippets.

@michaelchughes
Created September 18, 2019 22:51
Show Gist options
  • Save michaelchughes/e550d509624eae1305de7cdeb3dd8382 to your computer and use it in GitHub Desktop.
Save michaelchughes/e550d509624eae1305de7cdeb3dd8382 to your computer and use it in GitHub Desktop.
Collapsing DataFrame by subject
import pandas as pd
import numpy as np
import time
prng = np.random.RandomState(0)
n_subj = 20000
n_rows = 1000000
# Randomly assign to one of 20000 subjects
subj_N = np.sort(prng.randint(low=0, high=n_subj, size=n_rows))
# Create 3 random columns of data
x_ND = prng.randn(n_rows, 3)
# Combine into one big dataframe
df = pd.DataFrame( np.hstack([
subj_N[:,np.newaxis],
x_ND]),
columns=['subj', 'x1', 'x2', 'x3'])
# Print out first few lines
df.head()
df.tail()
# Compute "fenceposts" that delineate start/stop of each subject
fp = np.hstack([0, 1+np.flatnonzero(np.diff(subj_N)), subj_N.size])
assert fp.size == (n_subj+1)
start_time = time.time()
mean_of_x1 = np.zeros(n_subj)
for p in range(n_subj):
start = fp[p]
stop = fp[p+1]
mean_of_x1[p] = np.mean(df['x1'].values[start:stop], axis=0)
elapsed_time_sec = time.time() - start_time
print("Computed the per-subj mean of %d subjects and %d rows in %.2f sec" % (n_subj, n_rows, elapsed_time_sec))
## Double check that a few random subjects were computed correctly
for subj in [0, 333, 2345, 16789, subj_N.max()]:
row_mask = df['subj'] == subj
per_subj_mean = np.mean(df['x1'].values[row_mask])
print(per_subj_mean)
print(mean_of_x1[subj])
assert np.allclose(per_subj_mean, mean_of_x1[subj])
print("Check for subj %d PASSED" % subj)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment