# *-----------------------------------------------------------------
# | PROGRAM NAME: python basics.py
# | DATE: 1/25/21 (original: 12/28/18)
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: examples of data munging and analysis in python
# *----------------------------------------------------------------

# see also: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_r.html

import pandas as pd  # key library for manipulating data
import numpy as np   # key library for numerical calculations

pd.set_option('display.float_format', '{:.2f}'.format)  # suppress sci notation and limit to 2 decimals
### create a toy pandas data frame

# define a dictionary
data = {'GARST':[150,140,145,137,141,145,149,153,157,161],
        'PIO':[160,150,146,138,142,146,150,154,158,162],
        'MYC':[137,148,151,139,143,120,115,136,130,129],
        'DEK':[150,149,145,140,144,148,152,156,160,164],
        'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
        'PLOT':[1,2,3,4,5,6,7,8,9,10],
        'BT':['Y','Y','N','N','N','N','Y','N','Y','Y'],
        'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
        'ID':[1,2,3,4,5,6,7,8,9,10]
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
### inspecting your data

df.columns
list(df.columns)          # untruncated list
sorted(list(df.columns))  # sorted list
df.head()
df.tail()
df.info()
df.shape  # an attribute, not a method
df.index
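
# a few more inspection helpers worth knowing - a minimal sketch, assuming only
# the toy df above
df.dtypes                                   # data type of each column
df.select_dtypes(include='number').columns  # just the numeric columns
df.memory_usage(deep=True)                  # per-column memory footprint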
### misc things you can do to manipulate values and fields

df['ID'] = df['ID'].astype(str)      # convert ID to string
df['PLOT'] = df['PLOT'].astype(str)  # convert PLOT to string
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce', downcast='integer')
### example working with data types

df['xvar'] = df['GARST']             # create a new column 'xvar' as a copy of 'GARST'
df['xvar'] = df['xvar'].astype(str)  # convert to string

# use pandas to convert back to numeric
df['xvar'] = pd.to_numeric(df['xvar'], errors='coerce')

# pad leading zeros
df['GARST0'] = df['GARST'].apply(lambda x: '{:0>6}'.format(x))
df.head()  # check
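
# an alternative sketch for zero padding: pandas string methods avoid the lambda
# (GARST0b is a throwaway name for illustration)
df['GARST0b'] = df['GARST'].astype(str).str.zfill(6)
df.head()  # check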
# create unique ID for each line in data frame
df['ID2'] = list(range(1,11))
df.head()  # check

# Print out type of GARST
print(type(df['GARST']))

# Print out length of GARST
print(len(df['GARST']))

# drop variables (assign the result; .drop returns a copy unless inplace=True)
df = df.drop(['xvar', 'ID2'], axis=1)
#-----------------------------------
# high level summary stats
#-----------------------------------

data = {'GARST':[150,140,145,137,141,145,149,153,157,161],
        'PIO':[160,150,146,138,142,146,150,154,158,162],
        'MYC':[137,148,151,139,143,120,115,136,130,129],
        'DEK':[150,149,145,140,144,148,152,156,160,164],
        'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
        'PLOT':[1,2,3,4,5,6,7,8,9,10],
        'BT':['Y','Y','N','N','N','N','Y','N','Y','Y'],
        'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
        'ID':[1,2,3,4,5,6,7,8,9,10]
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)

df.describe()    # summary stats for all numerical columns
df.GARST.mean()  # equivalently: df['GARST'].mean()
df.GARST.std()
df.GARST.var()

# Print the value counts for BT and RR
print(df['BT'].value_counts(dropna=False))
print(df['RR'].value_counts(dropna=False))
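
# a two-way tabulation of the traits is often the next question; a quick sketch
# with pd.crosstab (margins=True adds row and column totals)
print(pd.crosstab(df['BT'], df['RR'], margins=True))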
# Specify array of percentiles: percentiles
percentiles = [2.5, 25, 50, 75, 97.5]

# Compute percentiles: ptiles
ptiles = np.percentile(df.GARST, percentiles)

# Print the result
print(ptiles)
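
# the same percentiles straight from pandas; a sketch - note that .quantile
# takes fractions (0-1), not 0-100 like np.percentile
print(df['GARST'].quantile([q / 100 for q in percentiles]))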
#-------------------------------------
# data visualization
#-------------------------------------

# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### boxplot

# Create the boxplots
df.boxplot(column='GARST')
df.boxplot(column='GARST', by='RR', rot=90)

# Display the plot
plt.show()

### box-and-whisker plot with seaborn

# Create box plot with seaborn's default settings
_ = sns.boxplot(x='RR', y='GARST', data=df)

# Label the axes
_ = plt.xlabel('RR')
_ = plt.ylabel('yield')

# Show the plot
plt.show()
### histogram

# Plot histogram of GARST yields
_ = plt.hist(df.GARST)

# Label axes
_ = plt.xlabel('yield')
_ = plt.ylabel('count')

# Show histogram
plt.show()

### histogram with pandas

# Plot the PDF (density=True replaces the old normed=True argument)
df.GARST.plot(kind='hist', density=True, bins=30, range=(100,200))
plt.show()

# Plot the CDF
df.GARST.plot(kind='hist', density=True, cumulative=True, bins=30, range=(100,200))
plt.show()
### scatter plot

_ = plt.plot(df.GARST, df.PIO, marker='.', linestyle='none')

# Label the axes
_ = plt.xlabel('garst')
_ = plt.ylabel('pioneer')

# Show the result
plt.show()
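
# a scatter plot usually begs for the correlation; a quick sketch with pandas
print(df['GARST'].corr(df['PIO']))  # Pearson correlation by default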
#------------------------------------
# sorting, subsetting and basic data frame manipulation
#------------------------------------

# sort by GARST yield
df = df.sort_values('GARST', ascending=False)
print(df)

df = df.sort_values('GARST')
print(df)

# sort ascending by trait (Y/N) and by descending GARST yield
df = df.sort_values(['BT', 'GARST'], ascending=[True, False])
print(df)

df = df.sort_values(['BT', 'GARST'], ascending=[False, False])
print(df)

df = df.sort_values(['BT', 'GARST'], ascending=[False, True])
print(df)

# sort by index to restore original order
df = df.sort_index()
print(df)
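
# nlargest/nsmallest are handy shortcuts when you only sort to grab the extremes;
# a minimal sketch
print(df.nlargest(3, 'GARST'))   # top 3 plots by GARST yield
print(df.nsmallest(3, 'GARST'))  # bottom 3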
#---------------------------------------
# misc
#---------------------------------------

# Print out GARST column as pandas Series
print(df['GARST'])

# Print out GARST column as pandas DataFrame (use double brackets)
print(df[['GARST']])

# Print out DataFrame with GARST and PLOT columns
print(df[['GARST','PLOT']])

# subset data via variable selection
my_hybrids = df[['GARST','PIO']]
my_hybrids.head()  # check
my_hybrids.info()  # check
### example using .loc - not always necessary, but useful in loops
### that walk over rows and modify specific fields

# Create the list of column labels (cols) and get rows
cols = ['GARST','PIO','PLOT']
rows = df.index  # use the index from the original data frame (df)

# Create the new DataFrame
my_hybrids = df.loc[rows, cols]
print(my_hybrids)

# get only the first three rows
cols = ['GARST','PIO','PLOT']
rows = [0,1,2]
my_hybrids = df.loc[rows, cols]
print(my_hybrids)
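
# .iloc is the position-based counterpart to label-based .loc; a minimal sketch
my_hybrids = df.iloc[0:3, 0:3]  # first three rows, first three columns by position
print(my_hybrids)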
#-------------------------------------------
# creating flags and categorical variables
#-------------------------------------------

mask = df['GARST'] > 150

# create a new data set that is filtered by the definition above
hi_garst = df[mask]
print(hi_garst)

# create a binary flag in original data
df['flag'] = np.where(df['GARST'] > 150, 1, 0)
print(df)

### create categories based on value ranges
conditions = [
    (df['GARST'] < 140),
    (df['GARST'] >= 140) & (df['GARST'] < 150),
    (df['GARST'] >= 150)]
choices = ['low', 'med', 'high']
df['lvl'] = np.select(conditions, choices, default='na')
print(df)
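
# pd.cut is a common alternative for value-range categories; a sketch that should
# reproduce the lvl column above (the bin edges are an assumption matching the
# conditions; right=False makes bins closed on the left)
df['lvl2'] = pd.cut(df['GARST'], bins=[-np.inf, 140, 150, np.inf],
                    labels=['low', 'med', 'high'], right=False)
print(df[['GARST', 'lvl', 'lvl2']])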
#--------------------------------------
# group by operations
#--------------------------------------

df.groupby('BT')['GARST'].mean()  # average GARST yield by BT trait (Y/N)
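
# groupby can return several statistics at once; a quick sketch with .agg
print(df.groupby(['BT', 'RR'])['GARST'].agg(['mean', 'std', 'count']))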
#-------------------------------------------
# joining tables
#-------------------------------------------

#
# create toy database tables
#

# create customer table
data = {'id':[1,2,3,4,5,6,7,8,9], 'acres':[1800,1970,980,960,970,1500,700,2500,2980]}
customer = pd.DataFrame(data, columns=['id','acres'])

# create product table
data = {'id':[1,2,3,4,5,6,7,8,9], 'type':['G8590','G8590','G8484','P8787','P8787','G8590','G8590','P8787','G8590']}
product = pd.DataFrame(data, columns=['id','type'])

# create tech table
data = {'type':['G8590','G8484','P8787'], 'trait':['BT','RR','RW']}
tech = pd.DataFrame(data, columns=['type','trait'])

# create customer yield table
data = {'id':[1,2,3,4,5,6,7,8,9], 'yield':[160,165,180,200,175,149,168,300,170]}
customer_yield = pd.DataFrame(data, columns=['id','yield'])

#
# example merges
#

temp1_add_product = pd.merge(customer, product[['type','id']], on='id', how='left')
temp2_add_tech = pd.merge(temp1_add_product, tech[['type','trait']], on='type', how='left')
temp3_add_yield = pd.merge(temp2_add_tech, customer_yield[['id','yield']], on='id', how='left')

# we could do all of these joins in one step
temp1_add_all = pd.merge(pd.merge(pd.merge(customer, product[['type','id']], on='id', how='left'),
                                  tech[['type','trait']], on='type', how='left'),
                         customer_yield[['id','yield']], on='id', how='left')
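
# when a join might drop or miss rows, indicator=True flags where each row came
# from; a minimal sketch for auditing the customer/product merge
check = pd.merge(customer, product[['type','id']], on='id', how='left', indicator=True)
print(check['_merge'].value_counts())  # 'both' vs 'left_only' reveals unmatched ids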
#----------------------------------
# basic regression
#----------------------------------

# generate some data (all lists must be the same length: 14 observations)
data = {'id':[1,2,3,4,5,6,7,8,9,10,11,12,13,14],
        'treat':[1,1,1,1,1,1,0,1,1,0,0,0,0,0],
        'lifestyle':['H','H','H','H','H','H','H','N','N','N','N','N','N','N'],
        'wtchg':[-12,-10,-9,-11,-12,-10,-8,-8,-2,5,8,10,-5,-2],
        'calories':[2000,2000,3000,3000,2000,2000,2000,3000,3000,5000,5000,5000,3000,3000],
        'push':[1,0,0,1,1,0,0,1,1,0,0,1,0,0],
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['treat','lifestyle','wtchg','calories','push'])
print(df)

import statsmodels.api as sm           # import statsmodels
import statsmodels.formula.api as smf  # this allows us to use an explicit formula

# raw comparison of mean weight change between treatment groups
results = smf.ols('wtchg ~ treat', data=df).fit()
results.summary()
results.summary2()  # not a problem here, but one reported limitation of statsmodels
                    # is that you can't override scientific notation in summary();
                    # summary2() will usually help with that
results.params  # this gives you just the estimated coefficients

# regression controlling for historical calorie intake
results = smf.ols('wtchg ~ treat + calories', data=df).fit()
results.summary()
results.params
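
# if heteroskedasticity is a worry, robust (HC1) standard errors are a one-liner
# in statsmodels; a sketch, not a claim about this toy data
results_robust = smf.ols('wtchg ~ treat + calories', data=df).fit(cov_type='HC1')
results_robust.summary()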
#
# logistic regression
#

# simulate more data
df = pd.DataFrame(columns=['ID','treat','admit','miles'])
df['ID'] = list(range(1,501))
df['treat'] = np.random.choice([0, 1], p=[0.5, 0.5], size=500)
df['admit'] = np.random.choice([0, 1], p=[0.7, 0.3], size=500)
df['miles'] = np.random.uniform(10, 150, df.shape[0])
df.head()      # check
df.describe()

# logistic regression (Binomial defaults to the logit link)
model = smf.glm('admit ~ treat + miles', data=df, family=sm.families.Binomial())
result = model.fit()
print(result.summary())
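
# logit coefficients are log odds; average marginal effects are often easier to
# read. a sketch using statsmodels' discrete-choice Logit entry point (an
# equivalent model to the GLM above)
logit_res = smf.logit('admit ~ treat + miles', data=df).fit()
print(logit_res.get_margeff().summary())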
#----------------------------------
# working with duplicated data
#----------------------------------

data = {'WT':[150,148,145,200,198,196,191,175,175,161],
        'HT':[72,72,72,68,68,68,68,69,69,71],
        'EX':['Y','Y','Y','Y','Y','Y','Y','N','N','N'],
        'DIET':['N','N','N','Y','Y','Y','Y','N','N','N'],
        'PLAN':['A','A','B','A','A','B','B','A','A','B'],
        'ID':[1,1,1,2,2,2,2,3,3,4]
       }

# convert to a data frame
tmp = pd.DataFrame(data, columns=['ID','WT','EX','DIET','PLAN'])
print(tmp)

# get unique members
tmp2 = tmp.drop_duplicates(['ID'])
print(tmp2)

# get unique ID and PLAN combinations
tmp3 = tmp.drop_duplicates(['ID','PLAN'])
print(tmp3)

# identify duplicates (example)
tmp4 = tmp.groupby(['ID']).size().reset_index(name='count')  # count rows per ID
tmp4 = tmp4.sort_values(['count'], ascending=[False])        # sort
print(tmp4)  # check
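
# .duplicated() gives a boolean flag instead; keep=False marks every row in a
# duplicated group, which is handy for eyeballing the offenders
print(tmp[tmp.duplicated(['ID'], keep=False)])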
#---------------------------------------------
# transposing or reshaping data
#---------------------------------------------

# note: this assumes df is the hybrid yield data frame from the summary stats
# section above (ID, GARST, PIO, MYC, DEK, WYF, PLOT, BT, RR); re-run that block first

# use melt to convert columns into rows
df2 = pd.melt(df, id_vars=['ID','BT','RR'], var_name='hybrid', value_name='yield')
print(df2)  # check
df2.info()  # check
df2['yield'] = pd.to_numeric(df2['yield'], errors='coerce')  # fix loss of format

# pivot this data back into tidy form
df3 = df2.pivot_table(index=['ID','BT','RR'], columns='hybrid', values='yield')
df3.head()  # check

# convert indices back to columns
df3 = df3.reset_index(level=['ID','BT','RR'])
# df3 = df3.reset_index() would have worked too
df3.info()
### example with toy panel data

# create toy panel (long) data
data = {'id':[1,1,1,2,2,2,3,3,3],
        'measure':["depth","temp","width","depth","temp","width","depth","temp","width"],
        'values':[2,50,18,1.5,53,18,2.5,60,18],
       }

# convert to a data frame
tmp = pd.DataFrame(data, columns=['id','measure','values'])
print(tmp)

# pivot this data into tidy (wide) form
df1 = tmp.pivot_table(index=['id'], columns='measure', values='values')

# convert indices back to columns
df1 = df1.reset_index()
print(df1)  # check

# use melt to convert columns back into rows (panel or long form)
df2 = pd.melt(df1, id_vars=['id'], var_name='measure', value_name='values')
print(df2)
#----------------------------------------
# working with dates
#----------------------------------------

### create data frame with a date/time field

data = {'date': ['2014-05-01 18:00:00', '2014-05-01 18:30:00', '2014-05-02 17:00:00', '2014-05-02 16:00:00', '2014-05-02 15:30:00', '2014-05-02 14:00:00', '2014-05-03 13:00:00', '2014-05-03 18:00:00', '2014-04-30 15:00:00', '2014-04-30 18:00:00'],
        'aphids': [15, 25, 26, 12, 17, 14, 26, 32, 48, 41]}
df = pd.DataFrame(data, columns=['date', 'aphids'])
print(df)
df.info()

# Convert df['date'] from string to datetime
# Prepare a format string that matches the data, including seconds: time_format
time_format = '%Y-%m-%d %H:%M:%S'

# Convert the date strings into datetime objects
df['date'] = pd.to_datetime(df['date'], format=time_format)
df.info()

# function for parsing calendar date
import datetime as dt

def get_day(x):
    return dt.datetime(x.year, x.month, x.day)

# Create day column
df['day'] = df['date'].apply(get_day)
df.head()
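
# the .dt accessor can extract date parts without a custom function; a quick
# sketch (day2 and month are throwaway names for illustration)
df['day2'] = df['date'].dt.normalize()  # midnight-stamped date, comparable to get_day
df['month'] = df['date'].dt.month
df.head()  # check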
# Set df['date'] as the index and delete the column
df.index = df['date']
del df['date']  # since it's an index now, delete the column
df          # check
df.info()   # check
### date operations

# slicing by date requires a sorted (monotonic) DatetimeIndex
df = df.sort_index()

# Extract data from 2pm to 4pm on '2014-05-02': ts1
ts1 = df.loc['2014-05-02 14:00:00':'2014-05-02 16:00:00']
print(ts1)

# Extract '2014-05-02': ts2
ts2 = df.loc['2014-05-02']
print(ts2)

# Extract data from '2014-05-03' to '2014-05-05': ts3
ts3 = df.loc['2014-05-03':'2014-05-05']
print(ts3)
# Downsample to get totals within 2 hours
df1 = df['aphids'].resample('2h').sum()
print(df)   # compare
print(df1)  # check

# Downsample to get daily total aphid counts
df1 = df['aphids'].resample('D').sum()
print(df)   # compare
print(df1)

# get daily high counts
daily_highs = df['aphids'].resample('D').max()
print(daily_highs)

# get counts for April
april = df['aphids']['2014-Apr']
print(april)
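
# monthly totals via the index's period; a sketch that sidesteps resample
# frequency aliases entirely
monthly = df['aphids'].groupby(df.index.to_period('M')).sum()
print(monthly)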
#--------------------------------------
# missing data
#--------------------------------------

data = {'GARST':[150,'NaN',145,137,'NaN',145,149,'NaN',157,161],
        'PIO':[160,150,146,138,142,146,150,154,158,162],
        'MYC':[137,148,151,139,143,'NaN',115,136,130,129],
        'DEK':[150,149,145,140,144,148,152,156,160,164],
        'WYF':[110,125,135,122,'NaN',132,130,'NaN',147,119],
        'PLOT':[1,2,3,4,5,6,7,8,9,10],
        'BT':['Y','Y','N','N','N','N','Y','N','Y','Y'],
        'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
        'ID':[1,2,3,4,5,6,7,8,9,10]
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
df.info()

# cleanup: coerce the 'NaN' strings to true missing values
df['GARST'] = pd.to_numeric(df['GARST'], errors='coerce')
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce')
df['MYC'] = pd.to_numeric(df['MYC'], errors='coerce')

df.describe()  # check out our toy data
df.isnull().sum(axis=0)  # count of missing values by variable in the data frame
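
# common next steps for missing values; a sketch, not a recommendation for any
# real analysis (GARST_f is a throwaway name for illustration)
df_complete = df.dropna()                               # drop rows with any NaN
df['GARST_f'] = df['GARST'].fillna(df['GARST'].mean())  # mean-impute a single column
df[['GARST', 'GARST_f']]  # check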
#------------------------------------
# loops and functions
#------------------------------------

### loop over a list
garst = [150, 140, 145, 137, 141]
for yields in garst:
    print(yields)

### basic while loop
x = 1
while x < 4:
    print(x)
    x = x + 1

# define a function
def square(num):
    new_value = num ** 2
    return new_value

square(5)  # call function
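
# a user-defined function applies to a data frame column via .apply; a quick
# sketch (re-uses the missing-data df above, so NaNs stay NaN)
df['GARST_sq'] = df['GARST'].apply(square)
df[['GARST', 'GARST_sq']].head()  # check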