fabsta · July 21, 2016 07:57
diff --git a/Python datascience explore (view_subset_selecting).py b/Python datascience explore (view_subset_selecting).py


 ### single
 Y = data['TV'] # column
 Y = data.TV

 df[:2] # first two rows
 df.ix['Maricopa'] # view a row
 df.ix[:, 'coverage'] # view a column
 df.ix['Yuma', 'coverage'] # view the value based on a row and column

 ## select rows
 # by index
 df.iloc[:2] # rows by row number
 df.iloc[1:2] # Select the second and third row
 df.iloc[2:] # Select every row after the third row
 df.iloc[3:6,0:3] #  
 # by label
 df.loc[:'Arizona'] # all rows by index label
 df.ix[['Arizona', 'Texas']]   # .ix is the combination of both .loc and .iloc. Integers are first considered labels,if not found, falls back on pos indexing

 # conditional
 df.query('A > C')
 df.query('A > 0') 
 df.query('A > 0 & A < 1')
 df.query('A > B | A > C')
 df[df['coverage'] > 50] # all rows where coverage is more than 50
 df[(df['deaths'] > 500) | (df['deaths'] < 50)]
 df[(df['score'] > 1) & (df['score'] < 5)]
 df[~(df['regiment'] == 'Dragoons')] # Select all the regiments not named "Dragoons"

 df[df['age'].notnull() & df['sex'].notnull()]  # ignore the missing data points

 # is in
 df[df.name.isin(value_list)]   # value_list = ['Tina', 'Molly', 'Jason']
 df[~df.name.isin(value_list)]

 # partial matching
 df2[df2.E.str.contains("tw|ou")]
 # regex
 df['raw'].str.contains('....-..-..', regex=True)  # regex
 # where cells are arrays
 df[df['country'].map(lambda country: 'Syria' in country)]
 # random
 df.take(np.random.permutation(len(df))[:2])

 # select cells
 df.ix['Arizona', 2]  # Select the third cell in the row named Arizona
 df.ix[2, 'deaths']  # Select the third cell down in the column named deaths


 # select cell ranges
 df['x3'][:1]  # take column 'x3', 

 # select columns
 df.iloc[:,:2] # Select the first 2 columns
 feature_cols = ['TV','Radio','Newspaper']
 x = data[feature_cols]
 data[['TV','Radio','Newspaper']]
 # column labels
 df.loc[:,['A','B']]  # syntax is: df.loc[rows_index, cols_index]
 # conditional
 df.filter(like='data')


 df['preTestScore'].where(df['postTestScore'] > 50) # Find where a value exists in a column



 ### sorting

 df.sort_values(by='reports', ascending=0)  # Sort the dataframe's rows by reports, in descending order
 df.sort_values(by=['coverage', 'reports'])  # Sort the dataframe's rows by coverage and then by reports, in ascending order



 # Exploring

 ### general


 data.head()
 data.tail()
 data.tail().transpose()

 # dimensions
 data.shape()



 #List unique values in the df['name'] column
 df.name.unique()




 ####
 # Stats
 ####

 # Descriptive statistics by group
 data['preTestScore'].groupby(df['company']).describe()
 data['preTestScore'].describe()
 # Count the number of non-NA values
 data['preTestScore'].count()
 data['preTestScore'].min()
 # value counts
 data.group.value_counts()
 # Correlation Matrix Of Values
 df.corr()

 pd.rolling_mean(df, 2)  # Calculate the moving average. That is, take
                        # the first two values, average them, 
                        # then drop the first and add the third, etc.

 #####
 # Mean preTestScores grouped by regiment and company
 #####
 #regiment    company
 #Dragoons    1st         3.5
 #            2nd        27.5
 #Nighthawks  1st        14.0
 #            2nd        16.5
 #Scouts      1st         2.5
 #            2nd         2.5
 #dtype: float64
 data['preTestScore'].groupby([df['regiment'], data['company']]).mean()

 ####
 # Mean preTestScores grouped by regiment and company without heirarchical indexing
 ####
 #company	1st	2nd
 #regiment		
 #Dragoons	3.5	27.5
 #Nighthawks	14.0	16.5
 #Scouts	2.5	2.5
 data['preTestScore'].groupby([data['regiment'], data['company']]).mean().unstack()

 #####
 # Group the entire dataframe by regiment and company
 #####
 #preTestScore	postTestScore
 #regiment	company		
 #Dragoons	1st	3.5	47.5
 #2nd	27.5	75.5
 #Nighthawks	1st	14.0	59.5
 #2nd	16.5	59.5
 #Scouts	1st	2.5	66.0
 #2nd	2.5	66.0

 df.groupby(['regiment', 'company']).mean()


 # Count the number of times each number of deaths occurs in each regiment
 # Input
 #	guardCorps	corps1	corps2	corps3	corps4	corps5	corps6	corps7	corps8	corps9	corps10	corps11	corps14	corps15
 # 1875	0	0	0	0	0	0	0	1	1	0	0	0	1	0
 # 1876	2	0	0	0	1	0	0	0	0	0	0	0	1	1
 # 1877	2	0	0	0	0	0	1	1	0	0	1	0	2	0


 result = horsekick.apply(pd.value_counts).fillna(0); result

 # Create a crosstab table by company and regiment


 # regiment	company	experience	name	preTestScore	postTestScore
 # 0	Nighthawks	infantry	veteran	Miller	4	25
 # 1	Nighthawks	infantry	rookie	Jacobson	24	94
 # 2	Nighthawks	cavalry	veteran	Ali	31	57
 # -->
 # company	cavalry	infantry	All
 # regiment			
 # Dragoons	2	2	4
 # Nighthawks	2	2	4

 # Counting the number of observations by regiment and category

 pd.crosstab(df.regiment, df.company, margins=True)


 df['preTestScore'].idxmax() # row with max value


	### single
	Y = data['TV'] # column
	Y = data.TV

	df[:2] # first two rows
	df.ix['Maricopa'] # view a row
	df.ix[:, 'coverage'] # view a column
	df.ix['Yuma', 'coverage'] # view the value based on a row and column

	## select rows
	# by index
	df.iloc[:2] # rows by row number
	df.iloc[1:2] # Select the second and third row
	df.iloc[2:] # Select every row after the third row
	df.iloc[3:6,0:3] #
	# by label
	df.loc[:'Arizona'] # all rows by index label
	df.ix[['Arizona', 'Texas']] # .ix is the combination of both .loc and .iloc. Integers are first considered labels,if not found, falls back on pos indexing

	# conditional
	df.query('A > C')
	df.query('A > 0')
	df.query('A > 0 & A < 1')
	df.query('A > B \| A > C')
	df[df['coverage'] > 50] # all rows where coverage is more than 50
	df[(df['deaths'] > 500) \| (df['deaths'] < 50)]
	df[(df['score'] > 1) & (df['score'] < 5)]
	df[~(df['regiment'] == 'Dragoons')] # Select all the regiments not named "Dragoons"

	df[df['age'].notnull() & df['sex'].notnull()] # ignore the missing data points

	# is in
	df[df.name.isin(value_list)] # value_list = ['Tina', 'Molly', 'Jason']
	df[~df.name.isin(value_list)]

	# partial matching
	df2[df2.E.str.contains("tw\|ou")]
	# regex
	df['raw'].str.contains('....-..-..', regex=True) # regex
	# where cells are arrays
	df[df['country'].map(lambda country: 'Syria' in country)]
	# random
	df.take(np.random.permutation(len(df))[:2])

	# select cells
	df.ix['Arizona', 2] # Select the third cell in the row named Arizona
	df.ix[2, 'deaths'] # Select the third cell down in the column named deaths


	# select cell ranges
	df['x3'][:1] # take column 'x3',

	# select columns
	df.iloc[:,:2] # Select the first 2 columns
	feature_cols = ['TV','Radio','Newspaper']
	x = data[feature_cols]
	data[['TV','Radio','Newspaper']]
	# column labels
	df.loc[:,['A','B']] # syntax is: df.loc[rows_index, cols_index]
	# conditional
	df.filter(like='data')


	df['preTestScore'].where(df['postTestScore'] > 50) # Find where a value exists in a column



	### sorting

	df.sort_values(by='reports', ascending=0) # Sort the dataframe's rows by reports, in descending order
	df.sort_values(by=['coverage', 'reports']) # Sort the dataframe's rows by coverage and then by reports, in ascending order



	# Exploring

	### general


	data.head()
	data.tail()
	data.tail().transpose()

	# dimensions
	data.shape()



	#List unique values in the df['name'] column
	df.name.unique()




	####
	# Stats
	####

	# Descriptive statistics by group
	data['preTestScore'].groupby(df['company']).describe()
	data['preTestScore'].describe()
	# Count the number of non-NA values
	data['preTestScore'].count()
	data['preTestScore'].min()
	# value counts
	data.group.value_counts()
	# Correlation Matrix Of Values
	df.corr()

	pd.rolling_mean(df, 2) # Calculate the moving average. That is, take
	# the first two values, average them,
	# then drop the first and add the third, etc.

	#####
	# Mean preTestScores grouped by regiment and company
	#####
	#regiment company
	#Dragoons 1st 3.5
	# 2nd 27.5
	#Nighthawks 1st 14.0
	# 2nd 16.5
	#Scouts 1st 2.5
	# 2nd 2.5
	#dtype: float64
	data['preTestScore'].groupby([df['regiment'], data['company']]).mean()

	####
	# Mean preTestScores grouped by regiment and company without heirarchical indexing
	####
	#company 1st 2nd
	#regiment
	#Dragoons 3.5 27.5
	#Nighthawks 14.0 16.5
	#Scouts 2.5 2.5
	data['preTestScore'].groupby([data['regiment'], data['company']]).mean().unstack()

	#####
	# Group the entire dataframe by regiment and company
	#####
	#preTestScore postTestScore
	#regiment company
	#Dragoons 1st 3.5 47.5
	#2nd 27.5 75.5
	#Nighthawks 1st 14.0 59.5
	#2nd 16.5 59.5
	#Scouts 1st 2.5 66.0
	#2nd 2.5 66.0

	df.groupby(['regiment', 'company']).mean()


	# Count the number of times each number of deaths occurs in each regiment
	# Input
	# guardCorps corps1 corps2 corps3 corps4 corps5 corps6 corps7 corps8 corps9 corps10 corps11 corps14 corps15
	# 1875 0 0 0 0 0 0 0 1 1 0 0 0 1 0
	# 1876 2 0 0 0 1 0 0 0 0 0 0 0 1 1
	# 1877 2 0 0 0 0 0 1 1 0 0 1 0 2 0


	result = horsekick.apply(pd.value_counts).fillna(0); result

	# Create a crosstab table by company and regiment


	# regiment company experience name preTestScore postTestScore
	# 0 Nighthawks infantry veteran Miller 4 25
	# 1 Nighthawks infantry rookie Jacobson 24 94
	# 2 Nighthawks cavalry veteran Ali 31 57
	# -->
	# company cavalry infantry All
	# regiment
	# Dragoons 2 2 4
	# Nighthawks 2 2 4

	# Counting the number of observations by regiment and category

	pd.crosstab(df.regiment, df.company, margins=True)


	df['preTestScore'].idxmax() # row with max value