jeremy-rutman · June 24, 2020 10:21
diff --git a/pandas_tricks.py b/pandas_tricks.py
 # selecting all cols except one
 df = pd.DataFrame({'a':[1,2,3,4],'b':[1,2,3,4]})
 # boolean mask over the column labels: True for every column that is not 'b'
 df2 = df.loc[:,df.columns!='b']
 print(df)
 print(df2)

 # split df into train, val, test: shuffle all rows, then train = first 90%,
 # val = 90%-95% and test = 95%-100% of the shuffled rows.
 # Positional slicing is used instead of np.split(df, ...): np.split on a
 # DataFrame relies on DataFrame.swapaxes, which is deprecated since pandas 2.1.
 shuffled = df.sample(frac=1)
 n_total = len(shuffled)
 train = shuffled.iloc[:int(.9*n_total)]
 validate = shuffled.iloc[int(.9*n_total):int(.95*n_total)]
 test = shuffled.iloc[int(.95*n_total):]

 # FILTERING
 # filtering groups and accessing group info: rows of df_pd grouped by 'time_bucket'
 g = df_pd.groupby('time_bucket')
 # row count of every group, in groupby iteration order
 sizes = np.array([len(members) for _, members in g])
 # keep a group whole when it has at most n_rows rows,
 # otherwise replace it with a random 10% sample of its rows
 df_out = g.apply(
     lambda members: members if len(members) <= n_rows else members.sample(frac=0.1)
 )

 # OR
 # keep only the rows of groups that have fewer than n_rows rows
 df_out = g.filter(lambda members: len(members) < n_rows)



 # describe one liner: summary statistics of the number of
 # whitespace-separated tokens in each value of df.mycol.
 # .str.len() counts the items of each split list and leaves missing values
 # as NaN (describe() skips them), where apply(len) would raise a TypeError.
 df.mycol.str.split().str.len().describe()
	# selecting all cols except one
	df = pd.DataFrame({'a':[1,2,3,4],'b':[1,2,3,4]})
	# boolean mask over the column labels: True for every column that is not 'b'
	df2 = df.loc[:,df.columns!='b']
	print(df)
	print(df2)

	# split df into train, val, test: shuffle all rows, then train = first 90%,
	# val = 90%-95% and test = 95%-100% of the shuffled rows.
	# (The multiplication signs in .9*len(df) / .95*len(df) had been lost, which
	# made the original line a syntax error.)
	# Positional slicing is used instead of np.split(df, ...): np.split on a
	# DataFrame relies on DataFrame.swapaxes, which is deprecated since pandas 2.1.
	shuffled = df.sample(frac=1)
	n_total = len(shuffled)
	train = shuffled.iloc[:int(.9*n_total)]
	validate = shuffled.iloc[int(.9*n_total):int(.95*n_total)]
	test = shuffled.iloc[int(.95*n_total):]

	# FILTERING
	# filtering groups and accessing group info: rows of df_pd grouped by 'time_bucket'
	g = df_pd.groupby('time_bucket')
	# row count of every group, in groupby iteration order
	sizes = np.array([len(members) for _, members in g])
	# keep a group whole when it has at most n_rows rows,
	# otherwise replace it with a random 10% sample of its rows
	df_out = g.apply(
		lambda members: members if len(members) <= n_rows else members.sample(frac=0.1)
	)

	# OR
	# keep only the rows of groups that have fewer than n_rows rows
	df_out = g.filter(lambda members: len(members) < n_rows)



	# describe one liner: summary statistics of the number of
	# whitespace-separated tokens in each value of df.mycol.
	# .str.len() counts the items of each split list and leaves missing values
	# as NaN (describe() skips them), where apply(len) would raise a TypeError.
	df.mycol.str.split().str.len().describe()
No results found