michaelchughes · September 18, 2019 22:51
diff --git a/speedtest__per_subj_mean.py b/speedtest__per_subj_mean.py
 import pandas as pd
 import numpy as np
 import time

 prng = np.random.RandomState(0)

 n_subj = 20000
 n_rows = 1000000

 # Randomly assign to one of 20000 subjects

 subj_N = np.sort(prng.randint(low=0, high=n_subj, size=n_rows))

 # Create 3 random columns of data
 x_ND = prng.randn(n_rows, 3)

 # Combine into one big dataframe
 df = pd.DataFrame( np.hstack([
 	subj_N[:,np.newaxis],
 	x_ND]),
 	columns=['subj', 'x1', 'x2', 'x3'])

 # Print out first few lines
 df.head()
 df.tail()

 # Compute "fenceposts" that delineate start/stop of each subject
 fp = np.hstack([0, 1+np.flatnonzero(np.diff(subj_N)), subj_N.size])
 assert fp.size == (n_subj+1)

 start_time = time.time()
 mean_of_x1 = np.zeros(n_subj)
 for p in range(n_subj):
 	start = fp[p]
 	stop = fp[p+1]
 	mean_of_x1[p] = np.mean(df['x1'].values[start:stop], axis=0)
 elapsed_time_sec = time.time() - start_time
 print("Computed the per-subj mean of %d subjects and %d rows in %.2f sec" % (n_subj, n_rows, elapsed_time_sec))

 ## Double check that a few random subjects were computed correctly
 for subj in [0, 333, 2345, 16789, subj_N.max()]:
 	row_mask = df['subj'] == subj
 	per_subj_mean = np.mean(df['x1'].values[row_mask])
 	print(per_subj_mean)
 	print(mean_of_x1[subj])
 	assert np.allclose(per_subj_mean, mean_of_x1[subj])
 	print("Check for subj %d PASSED" % subj)
	import pandas as pd
	import numpy as np
	import time

	prng = np.random.RandomState(0)

	n_subj = 20000
	n_rows = 1000000

	# Randomly assign to one of 20000 subjects

	subj_N = np.sort(prng.randint(low=0, high=n_subj, size=n_rows))

	# Create 3 random columns of data
	x_ND = prng.randn(n_rows, 3)

	# Combine into one big dataframe
	df = pd.DataFrame( np.hstack([
	subj_N[:,np.newaxis],
	x_ND]),
	columns=['subj', 'x1', 'x2', 'x3'])

	# Print out first few lines
	df.head()
	df.tail()

	# Compute "fenceposts" that delineate start/stop of each subject
	fp = np.hstack([0, 1+np.flatnonzero(np.diff(subj_N)), subj_N.size])
	assert fp.size == (n_subj+1)

	start_time = time.time()
	mean_of_x1 = np.zeros(n_subj)
	for p in range(n_subj):
	start = fp[p]
	stop = fp[p+1]
	mean_of_x1[p] = np.mean(df['x1'].values[start:stop], axis=0)
	elapsed_time_sec = time.time() - start_time
	print("Computed the per-subj mean of %d subjects and %d rows in %.2f sec" % (n_subj, n_rows, elapsed_time_sec))

	## Double check that a few random subjects were computed correctly
	for subj in [0, 333, 2345, 16789, subj_N.max()]:
	row_mask = df['subj'] == subj
	per_subj_mean = np.mean(df['x1'].values[row_mask])
	print(per_subj_mean)
	print(mean_of_x1[subj])
	assert np.allclose(per_subj_mean, mean_of_x1[subj])
	print("Check for subj %d PASSED" % subj)