# *-----------------------------------------------------------------
# | PROGRAM NAME: python basics.py
# | DATE: 1/25/21 (original: 12/28/18)
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: examples of data munging and analysis in python
# *----------------------------------------------------------------
# see also: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_r.html
import pandas as pd # key library for manipulating data
import numpy as np # key library for numerical calculations
pd.set_option('display.float_format', '{:.2f}'.format) # suppress sci notation and limit to 2 decimals
### create a toy pandas data frame
# define a dictionary
data = {'GARST' :[150,140,145,137,141,145,149,153,157,161],
'PIO':[160,150,146,138,142,146,150,154,158,162],
'MYC':[137,148,151,139,143,120,115,136,130,129],
'DEK':[150,149,145,140,144,148,152,156,160,164],
'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
'PLOT':[1,2,3,4,5,6,7,8,9,10],
'BT': ['Y','Y', 'N','N','N','N','Y','N','Y','Y'],
'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
'ID':[1,2,3,4,5,6,7,8,9,10]
}
# convert to a data frame
df = pd.DataFrame(data,columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
### inspecting your data
df.columns
list(df.columns) # untruncated list
sorted(list(df.columns)) # sorted list
df.head()
df.tail()
df.info()
df.shape # note: shape is an attribute, not a method
df.index
### misc things you can do to manipulate values and fields
df['ID'] = df['ID'].astype(str) # convert ID to string
df['PLOT'] = df['PLOT'].astype(str) # convert plot to string
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce', downcast ='integer')
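# quick sanity checks (a minimal sketch using the df defined above): confirm the coercion
# turned the 'NaN' string into a true missing value and left WYF numeric
df.dtypes # WYF should now be a numeric (float) dtype
df['WYF'].isnull().sum() # count of values coerced to NaN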
### example working with data types
df['xvar'] = df['GARST'] # create a new column 'xvar' as a copy of 'GARST'
df['xvar'] = df['xvar'].astype(str) # convert to string
# use pandas to convert back to numeric
df['xvar'] = pd.to_numeric(df['xvar'], errors='coerce')
# pad leading zeros
df['GARST0'] = df['GARST'].apply(lambda x:'{:0>6}'.format(x))
df.head() # check
# create unique ID for each line in data frame
df['ID2'] = list(range(1,11))
df.head() # check
# Print out type of GARST
print(type(df['GARST']))
# Print out length of var1
print(len(df['GARST']))
# drop variables
df = df.drop(['xvar', 'ID2'], axis=1) # drop() returns a new frame, so assign the result back
#-----------------------------------
# high level summary stats
#-----------------------------------
data = {'GARST' :[150,140,145,137,141,145,149,153,157,161],
'PIO':[160,150,146,138,142,146,150,154,158,162],
'MYC':[137,148,151,139,143,120,115,136,130,129],
'DEK':[150,149,145,140,144,148,152,156,160,164],
'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
'PLOT':[1,2,3,4,5,6,7,8,9,10],
'BT': ['Y','Y', 'N','N','N','N','Y','N','Y','Y'],
'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
'ID':[1,2,3,4,5,6,7,8,9,10]
}
# convert to a data frame
df = pd.DataFrame(data,columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
df.describe() # summary stats for all numerical colunms
df.GARST.mean() # or equivalently: df['GARST'].mean()
df.GARST.std()
df.GARST.var()
# Print the value counts for BT and RR
print(df['BT'].value_counts(dropna=False))
print(df['RR'].value_counts(dropna=False))
# Specify array of percentiles: percentiles
percentiles = [2.5,25,50,75,97.5]
# Compute percentiles: ptiles_vers
ptiles = np.percentile(df.GARST, percentiles)
# Print the result
print(ptiles)
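# an equivalent sketch using pandas itself: .quantile() takes fractions rather than percentages
df['GARST'].quantile([0.025, 0.25, 0.50, 0.75, 0.975])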
#-------------------------------------
# data visualization
#------------------------------------
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
### boxplot
# Create the boxplot
df.boxplot(column = 'GARST')
df.boxplot(column='GARST', by='RR', rot=90)
# Display the plot
plt.show()
### Box-and-whisker plot with seaborn
# Create box plot with Seaborn's default settings
_ = sns.boxplot(x='RR', y='GARST', data=df)
# Label the axes
_ = plt.xlabel('RR')
_ = plt.ylabel('yield')
# Show the plot
plt.show()
### histogram
# Plot histogram of versicolor petal lengths
_ = plt.hist(df.GARST)
# Label axes
_ = plt.xlabel('yield')
_ = plt.ylabel('count')
# Show histogram
plt.show()
### histogram with pandas
# Plot the PDF
df.GARST.plot(kind='hist', density=True, bins=30, range=(100,200)) # density replaces the deprecated normed argument
plt.show()
# Plot the CDF
df.GARST.plot(kind='hist', density=True, cumulative=True, bins=30, range=(100,200))
plt.show()
### scatter plot
_ = plt.plot(df.GARST, df.PIO, marker='.', linestyle='none')
# Label the axes
_ = plt.xlabel('garst')
_ = plt.ylabel('pioneer')
# Show the result
plt.show()
#------------------------------------
# sorting, subsetting and basic data frame manipulation
#------------------------------------
# sort by GARST yield
df= df.sort_values('GARST', ascending=False)
print(df)
df= df.sort_values('GARST')
print(df)
# sort ascending by trait (Y/N) and by descending GARST yield_data
df = df.sort_values(['BT', 'GARST'], ascending=[True, False])
print(df)
df = df.sort_values(['BT', 'GARST'], ascending=[False, False])
print(df)
df = df.sort_values(['BT', 'GARST'], ascending=[False, True])
print(df)
# sort by index to restore original order
df = df.sort_index()
print(df)
#---------------------------------------
# misc
#----------------------------------------
# Print out GARST column as Pandas Series
print(df['GARST'])
# Print out GARST column as Pandas DataFrame (use double brackets)
print(df[['GARST']])
# Print out DataFrame with GARST and PLOT columns
print(df[['GARST','PLOT']])
# subset data via variable selection
my_hybrids = df[['GARST','PIO']]
my_hybrids.head() # check
my_hybrids.info() # check
### example using .loc - not always necessary, but handy in loops
### where you iterate over rows and modify specific fields
# Create the list of column labels: cols and get rows
cols = ['GARST','PIO','PLOT']
rows = df.index # use the index from the original data frame (df)
# Create the new DataFrame
my_hybrids= df.loc[rows,cols]
print(my_hybrids)
# get only the first three rows
cols = ['GARST','PIO','PLOT']
rows = [0,1,2]
my_hybrids= df.loc[rows,cols]
print(my_hybrids)
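# a short sketch of the positional counterpart, .iloc, which selects by integer position
# rather than by label (here: the first three rows and first three columns)
df.iloc[0:3, 0:3]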
#-------------------------------------------
# creating flags and categorical variables
#-----------------------------------------
mask = df['GARST'] > 150
# create a new data set that is filtered by the definition above
hi_garst = df[mask]
print(hi_garst)
# create a binary flag in original data
df['flag'] = np.where(df['GARST'] > 150, 1,0)
print(df)
### create categories based on value ranges
conditions = [
(df['GARST'] < 140),
(df['GARST'] >= 140) & (df['GARST'] < 150),
(df['GARST'] >= 150)]
choices = ['low', 'med', 'high']
df['lvl'] = np.select(conditions, choices, default='na')
print(df)
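# an alternative sketch using pd.cut to bin the same yield ranges into ordered categories
# ('lvl2' is just an illustrative column name)
df['lvl2'] = pd.cut(df['GARST'], bins=[-np.inf, 140, 150, np.inf],
                    labels=['low', 'med', 'high'], right=False)
print(df[['GARST', 'lvl', 'lvl2']])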
#--------------------------------------
# group by operations
#--------------------------------------
df.groupby('BT')['GARST'].mean() # average yield for garst BT
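# a sketch of a fuller group-by summary: several statistics at once via .agg(),
# and grouping by more than one key
df.groupby('BT')['GARST'].agg(['count', 'mean', 'std', 'min', 'max'])
df.groupby(['BT', 'RR'])['GARST'].mean()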
#-------------------------------------------
# joining tables
#-------------------------------------------
#
# create toy database tables
#
# create customer table
data = {'id':[1,2,3,4,5,6,7,8,9],'acres':[1800,1970,980,960,970,1500,700,2500,2980]}
customer = pd.DataFrame(data,columns =['id','acres'])
# create product table
data = {'id':[1,2,3,4,5,6,7,8,9],'type':['G8590','G8590','G8484','P8787','P8787','G8590','G8590','P8787','G8590']}
product = pd.DataFrame(data,columns =['id','type'])
# create tech table
data = {'type':['G8590','G8484','P8787'],'trait':['BT','RR','RW']}
tech = pd.DataFrame(data,columns =['type','trait'])
# create customer yield table
data = {'id':[1,2,3,4,5,6,7,8,9],'yield':[160,165,180,200,175,149,168,300,170]}
customer_yield = pd.DataFrame(data,columns =['id','yield'])
#
# example merges
#
temp1_add_product = pd.merge(customer,product[['type','id']], on='id', how='left')
temp2_add_tech = pd.merge(temp1_add_product,tech[['type','trait']], on='type', how='left')
temp3_add_yield = pd.merge(temp2_add_tech,customer_yield[['id','yield']], on='id', how='left')
# we could do all of these joins in one step
temp1_add_all = pd.merge(pd.merge(pd.merge(customer,product[['type','id']],on='id',how='left'),
tech[['type','trait']],on='type',how = 'left'),
customer_yield[['id','yield']], on='id', how='left')
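# a hedged sketch for sanity-checking a join: indicator=True adds a _merge column
# showing whether each row matched ('chk' is just an illustrative name)
chk = pd.merge(customer, product[['type','id']], on='id', how='left', indicator=True)
chk['_merge'].value_counts() # all rows should be 'both' if every customer has a product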
#----------------------------------
# basic regression
#----------------------------------
# generate some data
data = {'id' :[1,2,3,4,5,6,7,8,9,10,11,12,13,14], # 14 ids to match the length of the other columns
'treat' :[1,1,1,1,1,1,0,1,1,0,0,0,0,0],
'lifestyle':['H','H','H','H','H','H','H','N','N','N','N','N','N','N'],
'wtchg':[-12,-10,-9,-11,-12,-10,-8,-8,-2,5,8,10,-5,-2],
'calories':[2000,2000,3000,3000,2000,2000,2000,3000,3000,5000,5000,5000,3000,3000],
'push':[1,0,0,1,1,0,0,1,1,0,0,1,0,0],
}
# convert to a data frame
df = pd.DataFrame(data,columns=['treat','lifestyle','wtchg','calories','push'])
print(df)
import statsmodels.api as sm # import statsmodels
import statsmodels.formula.api as smf # this allows us to use an explicit formulation
# raw comparisons
results = smf.ols('wtchg ~ treat', data=df).fit() # simple OLS: raw difference in mean weight change
results.summary()
results.summary2() # not a problem here, but one limitation of statsmodels I have read about
                   # is that you can't override scientific notation in summary(); summary2()
                   # will usually help with that
results.params # this gives you just the estimated coefficients
# regression controlling for historical calorie intake
results = smf.ols('wtchg ~ treat + calories', data=df).fit()
results.summary()
results.params
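# a minimal sketch of a few other useful pieces of the fitted OLS results object
results.conf_int() # 95% confidence intervals for the coefficients
results.rsquared # R-squared
results.predict(df) # fitted values for the estimation data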
#
# logistic regression
#
# simulate more data
df = pd.DataFrame(columns=['ID','treat','admit', 'miles'])
df['ID'] = list(range(1,501))
df['treat'] = np.random.choice([0, 1], p=[0.5, 0.5], size=(500))
df['admit'] = np.random.choice([0, 1], p=[0.7, 0.3], size=(500))
df['miles'] = np.random.uniform(10, 150, df.shape[0])
df.head() # check
df.describe()
# logistic regression
model = smf.glm('admit ~ treat + miles', data=df, family=sm.families.Binomial()) # logit is the default link for Binomial
result = model.fit()
print(result.summary())
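# a sketch for interpreting the logit coefficients: exponentiating gives odds ratios
# ('odds_ratios' is just an illustrative name)
odds_ratios = np.exp(result.params)
print(odds_ratios)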
#----------------------------------
# working with duplicated data
#----------------------------------
data = {'WT' :[150,148,145,200,198,196,191,175,175,161],
'HT':[72,72,72,68,68,68,68,69,69,71],
'EX': ['Y','Y','Y','Y','Y','Y','Y','N','N','N'],
'DIET':['N','N','N','Y','Y','Y','Y','N','N','N'],
'PLAN':['A','A','B','A','A','B','B','A','A','B'],
'ID':[1,1,1,2,2,2,2,3,3,4]
}
# convert to a data frame
tmp = pd.DataFrame(data,columns=['ID','WT','EX','DIET','PLAN'])
print(tmp)
# get unique members
tmp2 = tmp.drop_duplicates(['ID'])
print(tmp2)
# get unique id and plan combinations
tmp3 = tmp.drop_duplicates(['ID','PLAN'])
print(tmp3)
# identify duplicates (example)
tmp4 = tmp.groupby(['ID']).size().reset_index(name='count') # count duplicates
tmp4 = tmp4.sort_values(['count'], ascending=[False]) # sort
print(tmp4) # check
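# a hedged sketch using .duplicated() to flag repeat IDs directly
# (the 'dup' column name is just for illustration)
tmp['dup'] = tmp.duplicated(subset=['ID'], keep='first') # True for every row after the first per ID
print(tmp)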
#---------------------------------------------
# transposing or reshaping data
#---------------------------------------------
# use melt to convert columns into rows
# note: this example reuses the hybrid yield data frame (ID, GARST, ..., BT, RR) created earlier;
# re-run that block first so df has those columns
df2 = pd.melt(df, id_vars=['ID','BT','RR'], var_name='hybrid', value_name='yield')
print(df2) # check
df2.info() # check
df2['yield'] = pd.to_numeric(df2['yield'], errors='coerce') # fix loss of format
# pivot this data back into tidy form
df3 = df2.pivot_table(index = ['ID','BT','RR'],columns = 'hybrid',values= 'yield')
df3.head() # check
# convert indices back to columns
df3 = df3.reset_index(level=['ID','BT','RR'])
# df3.reset_index() would have worked too (remember to assign the result)
df3.info()
### example with toy panel data
# create toy panel (long) data
data = {'id' :[1,1,1,2,2,2,3,3,3],
'measure':["depth","temp","width","depth","temp","width","depth","temp","width"],
'values': [2,50,18,1.5,53,18,2.5,60,18],
}
# convert to a data frame
tmp = pd.DataFrame(data,columns=['id','measure','values'])
print(tmp)
# pivot this data back into tidy form (wide)
df1 = tmp.pivot_table(index = ['id'],columns = 'measure',values= 'values')
# convert indices back to columns
df1 = df1.reset_index()
print(df1) # check
# use melt to convert columns back into rows (panel or long)
df2 = pd.melt(df1, id_vars=['id'], var_name='measure', value_name='values')
print(df2)
#----------------------------------------
# working with dates
#----------------------------------------
### create data frame with date time index
data = {'date': ['2014-05-01 18:00:00', '2014-05-01 18:30:00', '2014-05-02 17:00:00', '2014-05-02 16:00:00', '2014-05-02 15:30:00', '2014-05-02 14:00:00', '2014-05-03 13:00:00', '2014-05-03 18:00:00', '2014-04-30 15:00:00', '2014-04-30 18:00:00'],
'aphids': [15, 25, 26, 12, 17, 14, 26, 32, 48, 41]}
df = pd.DataFrame(data, columns = ['date', 'aphids'])
print(df)
df.info()
# Convert df['date'] from string to datetime
# Prepare a format string: time_format
time_format = '%Y-%m-%d %H:%M:%S' # include seconds to match the strings above
# Convert date_list into a datetime object: my_datetimes
df['date'] = pd.to_datetime(df['date'], format=time_format)
df.info()
# function for parsing calendar date
import datetime as dt
def get_day(x): return dt.datetime(x.year, x.month, x.day)
# Create day column
df['day'] = df['date'].apply(get_day)
df.head()
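# a sketch of a built-in alternative: the .dt accessor can produce the same calendar-date result
df['date'].dt.normalize() # timestamps normalized to midnight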
# Set df['date'] as the index and delete the column
df.index = df['date']
del df['date'] # since it's the index now, delete the column
df # check
df.info() # check
### date operations
# Extract data from 2pm to 4pm on '2014-05-02': ts1
ts1 = df.loc['2014-05-02 14:00:00':'2014-05-02 16:00:00']
print(ts1)
# Extract '2014-05-02': ts2
ts2 = df.loc['2014-05-02']
print(ts2)
# Extract data from '2014-05-03' to '2014-05-05': ts3
ts3 = df.loc['2014-05-03':'2014-05-05']
print(ts3)
# Downsample to get total within 2 hours
df1 = df['aphids'].resample('2h').sum()
print(df) # compare
print(df1) # check
# Downsample to get daily total aphid counts
df1 = df['aphids'].resample('D').sum()
print(df) # compare
print(df1)
# get daily high counts
daily_highs = df['aphids'].resample('D').max()
print(daily_highs)
# get counts for april
april = df['aphids'].loc['2014-04'] # partial-string indexing on the DatetimeIndex
print(april)
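# a sketch of a rolling statistic on the datetime-indexed series (3-observation window)
df['aphids'].rolling(window=3).mean()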
#--------------------------------------
# missing data
#--------------------------------------
data = {'GARST' :[150,'Nan',145,137,'Nan',145,149,'Nan',157,161],
'PIO':[160,150,146,138,142,146,150,154,158,162],
'MYC':[137,148,151,139,143,'Nan',115,136,130,129],
'DEK':[150,149,145,140,144,148,152,156,160,164],
'WYF':[110,125,135,122,'Nan',132,130,'NaN',147,119],
'PLOT':[1,2,3,4,5,6,7,8,9,10],
'BT': ['Y','Y', 'N','N','N','N','Y','N','Y','Y'],
'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
'ID':[1,2,3,4,5,6,7,8,9,10]
}
# convert to a data frame
df = pd.DataFrame(data,columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
df.info()
# cleanup
df['GARST'] = pd.to_numeric(df['GARST'], errors='coerce')
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce')
df['MYC'] = pd.to_numeric(df['MYC'], errors='coerce')
df.describe() # check out our toy data
df.isnull().sum(axis=0) # count of missing values by variable in dataframe
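# sketches of two common ways to handle the missing values (which is appropriate depends
# on the analysis; 'df_drop' and 'df_fill' are just illustrative names)
df_drop = df.dropna() # drop rows with any missing value
df_fill = df.fillna(df.mean(numeric_only=True)) # fill numeric NaNs with column means
df_fill.isnull().sum(axis=0) # check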
#------------------------------------
# loops and functions
#-----------------------------------
### loop over a list
# ex
garst= [150,140,145,137,141]
for yields in garst:
    print(yields)
### basic loop
x = 1
while x < 4:
    print(x)
    x = x + 1
# define function
def square(num):
    new_value = num ** 2
    return new_value
square(5) # call function
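# a sketch tying functions back to pandas: apply a function element-wise to a Series
pd.Series(garst).apply(square) # squared yields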