# *-----------------------------------------------------------------
# | PROGRAM NAME: python basics.py
# | DATE: 1/25/21 (original: 12/28/18)
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: examples of data munging and analysis in python
# *----------------------------------------------------------------

# see also: https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_r.html

import pandas as pd  # key library for manipulating data
import numpy as np   # key library for numerical calculations

pd.set_option('display.float_format', '{:.2f}'.format)  # suppress sci notation and limit to 2 decimals
### create a toy pandas data frame

# define a dictionary
data = {'GARST':[150,140,145,137,141,145,149,153,157,161],
        'PIO':[160,150,146,138,142,146,150,154,158,162],
        'MYC':[137,148,151,139,143,120,115,136,130,129],
        'DEK':[150,149,145,140,144,148,152,156,160,164],
        'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
        'PLOT':[1,2,3,4,5,6,7,8,9,10],
        'BT':['Y','Y','N','N','N','N','Y','N','Y','Y'],
        'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
        'ID':[1,2,3,4,5,6,7,8,9,10]
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
### inspecting your data

df.columns
list(df.columns)          # untruncated list
sorted(list(df.columns))  # sorted list
df.head()
df.tail()
df.info()
df.shape  # an attribute, not a method
df.index
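
# a few more inspection helpers worth knowing - a minimal sketch, assuming only
# the toy df above
df.dtypes                                   # data type of each column
df.select_dtypes(include='number').columns  # just the numeric columns
df.memory_usage(deep=True)                  # per-column memory footprint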
### misc things you can do to manipulate values and fields

df['ID'] = df['ID'].astype(str)      # convert ID to string
df['PLOT'] = df['PLOT'].astype(str)  # convert PLOT to string
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce', downcast='integer')
### example working with data types

df['xvar'] = df['GARST']             # create a new column 'xvar' as a copy of 'GARST'
df['xvar'] = df['xvar'].astype(str)  # convert to string

# use pandas to convert back to numeric
df['xvar'] = pd.to_numeric(df['xvar'], errors='coerce')

# pad leading zeros
df['GARST0'] = df['GARST'].apply(lambda x: '{:0>6}'.format(x))
df.head()  # check
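
# an alternative sketch for zero padding: pandas string methods avoid the lambda
# (GARST0b is a throwaway name for illustration)
df['GARST0b'] = df['GARST'].astype(str).str.zfill(6)
df.head()  # check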
# create unique ID for each line in data frame
df['ID2'] = list(range(1,11))
df.head()  # check

# Print out type of GARST
print(type(df['GARST']))

# Print out length of GARST
print(len(df['GARST']))

# drop variables (assign the result; .drop returns a copy unless inplace=True)
df = df.drop(['xvar', 'ID2'], axis=1)
#-----------------------------------
# high level summary stats
#-----------------------------------

data = {'GARST':[150,140,145,137,141,145,149,153,157,161],
        'PIO':[160,150,146,138,142,146,150,154,158,162],
        'MYC':[137,148,151,139,143,120,115,136,130,129],
        'DEK':[150,149,145,140,144,148,152,156,160,164],
        'WYF':[110,125,135,122,127,132,130,'NaN',147,119],
        'PLOT':[1,2,3,4,5,6,7,8,9,10],
        'BT':['Y','Y','N','N','N','N','Y','N','Y','Y'],
        'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
        'ID':[1,2,3,4,5,6,7,8,9,10]
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)

df.describe()    # summary stats for all numerical columns
df.GARST.mean()  # equivalently: df['GARST'].mean()
df.GARST.std()
df.GARST.var()

# Print the value counts for BT and RR
print(df['BT'].value_counts(dropna=False))
print(df['RR'].value_counts(dropna=False))
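
# a two-way tabulation of the traits is often the next question; a quick sketch
# with pd.crosstab (margins=True adds row and column totals)
print(pd.crosstab(df['BT'], df['RR'], margins=True))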
# Specify array of percentiles: percentiles
percentiles = [2.5, 25, 50, 75, 97.5]

# Compute percentiles: ptiles
ptiles = np.percentile(df.GARST, percentiles)

# Print the result
print(ptiles)
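
# the same percentiles straight from pandas; a sketch - note that .quantile
# takes fractions (0-1), not 0-100 like np.percentile
print(df['GARST'].quantile([q / 100 for q in percentiles]))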
#-------------------------------------
# data visualization
#-------------------------------------

# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### boxplot

# Create the boxplots
df.boxplot(column='GARST')
df.boxplot(column='GARST', by='RR', rot=90)

# Display the plot
plt.show()

### box-and-whisker plot with seaborn

# Create box plot with seaborn's default settings
_ = sns.boxplot(x='RR', y='GARST', data=df)

# Label the axes
_ = plt.xlabel('RR')
_ = plt.ylabel('yield')

# Show the plot
plt.show()
### histogram

# Plot histogram of GARST yields
_ = plt.hist(df.GARST)

# Label axes
_ = plt.xlabel('yield')
_ = plt.ylabel('count')

# Show histogram
plt.show()

### histogram with pandas

# Plot the PDF (density=True replaces the old normed=True argument)
df.GARST.plot(kind='hist', density=True, bins=30, range=(100,200))
plt.show()

# Plot the CDF
df.GARST.plot(kind='hist', density=True, cumulative=True, bins=30, range=(100,200))
plt.show()
### scatter plot

_ = plt.plot(df.GARST, df.PIO, marker='.', linestyle='none')

# Label the axes
_ = plt.xlabel('garst')
_ = plt.ylabel('pioneer')

# Show the result
plt.show()
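
# a scatter plot usually begs for the correlation; a quick sketch with pandas
print(df['GARST'].corr(df['PIO']))  # Pearson correlation by default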
#------------------------------------
# sorting, subsetting and basic data frame manipulation
#------------------------------------

# sort by GARST yield
df = df.sort_values('GARST', ascending=False)
print(df)

df = df.sort_values('GARST')
print(df)

# sort ascending by trait (Y/N) and by descending GARST yield
df = df.sort_values(['BT', 'GARST'], ascending=[True, False])
print(df)

df = df.sort_values(['BT', 'GARST'], ascending=[False, False])
print(df)

df = df.sort_values(['BT', 'GARST'], ascending=[False, True])
print(df)

# sort by index to restore original order
df = df.sort_index()
print(df)
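
# nlargest/nsmallest are handy shortcuts when you only sort to grab the extremes;
# a minimal sketch
print(df.nlargest(3, 'GARST'))   # top 3 plots by GARST yield
print(df.nsmallest(3, 'GARST'))  # bottom 3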
#---------------------------------------
# misc
#---------------------------------------

# Print out GARST column as pandas Series
print(df['GARST'])

# Print out GARST column as pandas DataFrame (use double brackets)
print(df[['GARST']])

# Print out DataFrame with GARST and PLOT columns
print(df[['GARST','PLOT']])

# subset data via variable selection
my_hybrids = df[['GARST','PIO']]
my_hybrids.head()  # check
my_hybrids.info()  # check
### example using .loc - not always necessary, but useful in loops
### that walk over rows and modify specific fields

# Create the list of column labels (cols) and get rows
cols = ['GARST','PIO','PLOT']
rows = df.index  # use the index from the original data frame (df)

# Create the new DataFrame
my_hybrids = df.loc[rows, cols]
print(my_hybrids)

# get only the first three rows
cols = ['GARST','PIO','PLOT']
rows = [0,1,2]
my_hybrids = df.loc[rows, cols]
print(my_hybrids)
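
# .iloc is the position-based counterpart to label-based .loc; a minimal sketch
my_hybrids = df.iloc[0:3, 0:3]  # first three rows, first three columns by position
print(my_hybrids)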
#-------------------------------------------
# creating flags and categorical variables
#-------------------------------------------

mask = df['GARST'] > 150

# create a new data set that is filtered by the definition above
hi_garst = df[mask]
print(hi_garst)

# create a binary flag in original data
df['flag'] = np.where(df['GARST'] > 150, 1, 0)
print(df)

### create categories based on value ranges
conditions = [
    (df['GARST'] < 140),
    (df['GARST'] >= 140) & (df['GARST'] < 150),
    (df['GARST'] >= 150)]
choices = ['low', 'med', 'high']
df['lvl'] = np.select(conditions, choices, default='na')
print(df)
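
# pd.cut is a common alternative for value-range categories; a sketch that should
# reproduce the lvl column above (the bin edges are an assumption matching the
# conditions; right=False makes bins closed on the left)
df['lvl2'] = pd.cut(df['GARST'], bins=[-np.inf, 140, 150, np.inf],
                    labels=['low', 'med', 'high'], right=False)
print(df[['GARST', 'lvl', 'lvl2']])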
#--------------------------------------
# group by operations
#--------------------------------------

df.groupby('BT')['GARST'].mean()  # average GARST yield by BT trait (Y/N)
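
# groupby can return several statistics at once; a quick sketch with .agg
print(df.groupby(['BT', 'RR'])['GARST'].agg(['mean', 'std', 'count']))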
#-------------------------------------------
# joining tables
#-------------------------------------------

#
# create toy database tables
#

# create customer table
data = {'id':[1,2,3,4,5,6,7,8,9], 'acres':[1800,1970,980,960,970,1500,700,2500,2980]}
customer = pd.DataFrame(data, columns=['id','acres'])

# create product table
data = {'id':[1,2,3,4,5,6,7,8,9], 'type':['G8590','G8590','G8484','P8787','P8787','G8590','G8590','P8787','G8590']}
product = pd.DataFrame(data, columns=['id','type'])

# create tech table
data = {'type':['G8590','G8484','P8787'], 'trait':['BT','RR','RW']}
tech = pd.DataFrame(data, columns=['type','trait'])

# create customer yield table
data = {'id':[1,2,3,4,5,6,7,8,9], 'yield':[160,165,180,200,175,149,168,300,170]}
customer_yield = pd.DataFrame(data, columns=['id','yield'])

#
# example merges
#

temp1_add_product = pd.merge(customer, product[['type','id']], on='id', how='left')
temp2_add_tech = pd.merge(temp1_add_product, tech[['type','trait']], on='type', how='left')
temp3_add_yield = pd.merge(temp2_add_tech, customer_yield[['id','yield']], on='id', how='left')

# we could do all of these joins in one step
temp1_add_all = pd.merge(pd.merge(pd.merge(customer, product[['type','id']], on='id', how='left'),
                                  tech[['type','trait']], on='type', how='left'),
                         customer_yield[['id','yield']], on='id', how='left')
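
# when a join might drop or miss rows, indicator=True flags where each row came
# from; a minimal sketch for auditing the customer/product merge
check = pd.merge(customer, product[['type','id']], on='id', how='left', indicator=True)
print(check['_merge'].value_counts())  # 'both' vs 'left_only' reveals unmatched ids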
#----------------------------------
# basic regression
#----------------------------------

# generate some data (all lists must be the same length: 14 observations)
data = {'id':[1,2,3,4,5,6,7,8,9,10,11,12,13,14],
        'treat':[1,1,1,1,1,1,0,1,1,0,0,0,0,0],
        'lifestyle':['H','H','H','H','H','H','H','N','N','N','N','N','N','N'],
        'wtchg':[-12,-10,-9,-11,-12,-10,-8,-8,-2,5,8,10,-5,-2],
        'calories':[2000,2000,3000,3000,2000,2000,2000,3000,3000,5000,5000,5000,3000,3000],
        'push':[1,0,0,1,1,0,0,1,1,0,0,1,0,0],
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['treat','lifestyle','wtchg','calories','push'])
print(df)

import statsmodels.api as sm           # import statsmodels
import statsmodels.formula.api as smf  # this allows us to use an explicit formula

# raw comparison of mean weight change between treatment groups
results = smf.ols('wtchg ~ treat', data=df).fit()
results.summary()
results.summary2()  # not a problem here, but one reported limitation of statsmodels
                    # is that you can't override scientific notation in summary();
                    # summary2() will usually help with that
results.params  # this gives you just the estimated coefficients

# regression controlling for historical calorie intake
results = smf.ols('wtchg ~ treat + calories', data=df).fit()
results.summary()
results.params
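
# if heteroskedasticity is a worry, robust (HC1) standard errors are a one-liner
# in statsmodels; a sketch, not a claim about this toy data
results_robust = smf.ols('wtchg ~ treat + calories', data=df).fit(cov_type='HC1')
results_robust.summary()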
#
# logistic regression
#

# simulate more data
df = pd.DataFrame(columns=['ID','treat','admit','miles'])
df['ID'] = list(range(1,501))
df['treat'] = np.random.choice([0, 1], p=[0.5, 0.5], size=500)
df['admit'] = np.random.choice([0, 1], p=[0.7, 0.3], size=500)
df['miles'] = np.random.uniform(10, 150, df.shape[0])
df.head()      # check
df.describe()

# logistic regression (Binomial defaults to the logit link)
model = smf.glm('admit ~ treat + miles', data=df, family=sm.families.Binomial())
result = model.fit()
print(result.summary())
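
# logit coefficients are log odds; average marginal effects are often easier to
# read. a sketch using statsmodels' discrete-choice Logit entry point (an
# equivalent model to the GLM above)
logit_res = smf.logit('admit ~ treat + miles', data=df).fit()
print(logit_res.get_margeff().summary())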
#----------------------------------
# working with duplicated data
#----------------------------------

data = {'WT':[150,148,145,200,198,196,191,175,175,161],
        'HT':[72,72,72,68,68,68,68,69,69,71],
        'EX':['Y','Y','Y','Y','Y','Y','Y','N','N','N'],
        'DIET':['N','N','N','Y','Y','Y','Y','N','N','N'],
        'PLAN':['A','A','B','A','A','B','B','A','A','B'],
        'ID':[1,1,1,2,2,2,2,3,3,4]
       }

# convert to a data frame
tmp = pd.DataFrame(data, columns=['ID','WT','EX','DIET','PLAN'])
print(tmp)

# get unique members
tmp2 = tmp.drop_duplicates(['ID'])
print(tmp2)

# get unique ID and PLAN combinations
tmp3 = tmp.drop_duplicates(['ID','PLAN'])
print(tmp3)

# identify duplicates (example)
tmp4 = tmp.groupby(['ID']).size().reset_index(name='count')  # count rows per ID
tmp4 = tmp4.sort_values(['count'], ascending=[False])        # sort
print(tmp4)  # check
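
# .duplicated() gives a boolean flag instead; keep=False marks every row in a
# duplicated group, which is handy for eyeballing the offenders
print(tmp[tmp.duplicated(['ID'], keep=False)])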
#---------------------------------------------
# transposing or reshaping data
#---------------------------------------------

# note: this assumes df is the hybrid yield data frame from the summary stats
# section above (ID, GARST, PIO, MYC, DEK, WYF, PLOT, BT, RR); re-run that block first

# use melt to convert columns into rows
df2 = pd.melt(df, id_vars=['ID','BT','RR'], var_name='hybrid', value_name='yield')
print(df2)  # check
df2.info()  # check
df2['yield'] = pd.to_numeric(df2['yield'], errors='coerce')  # fix loss of format

# pivot this data back into tidy form
df3 = df2.pivot_table(index=['ID','BT','RR'], columns='hybrid', values='yield')
df3.head()  # check

# convert indices back to columns
df3 = df3.reset_index(level=['ID','BT','RR'])
# df3 = df3.reset_index() would have worked too
df3.info()
### example with toy panel data

# create toy panel (long) data
data = {'id':[1,1,1,2,2,2,3,3,3],
        'measure':["depth","temp","width","depth","temp","width","depth","temp","width"],
        'values':[2,50,18,1.5,53,18,2.5,60,18],
       }

# convert to a data frame
tmp = pd.DataFrame(data, columns=['id','measure','values'])
print(tmp)

# pivot this data into tidy (wide) form
df1 = tmp.pivot_table(index=['id'], columns='measure', values='values')

# convert indices back to columns
df1 = df1.reset_index()
print(df1)  # check

# use melt to convert columns back into rows (panel or long form)
df2 = pd.melt(df1, id_vars=['id'], var_name='measure', value_name='values')
print(df2)
#----------------------------------------
# working with dates
#----------------------------------------

### create data frame with a date/time field

data = {'date': ['2014-05-01 18:00:00', '2014-05-01 18:30:00', '2014-05-02 17:00:00', '2014-05-02 16:00:00', '2014-05-02 15:30:00', '2014-05-02 14:00:00', '2014-05-03 13:00:00', '2014-05-03 18:00:00', '2014-04-30 15:00:00', '2014-04-30 18:00:00'],
        'aphids': [15, 25, 26, 12, 17, 14, 26, 32, 48, 41]}
df = pd.DataFrame(data, columns=['date', 'aphids'])
print(df)
df.info()

# Convert df['date'] from string to datetime
# Prepare a format string that matches the data, including seconds: time_format
time_format = '%Y-%m-%d %H:%M:%S'

# Convert the date strings into datetime objects
df['date'] = pd.to_datetime(df['date'], format=time_format)
df.info()

# function for parsing calendar date
import datetime as dt

def get_day(x):
    return dt.datetime(x.year, x.month, x.day)

# Create day column
df['day'] = df['date'].apply(get_day)
df.head()
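
# the .dt accessor can extract date parts without a custom function; a quick
# sketch (day2 and month are throwaway names for illustration)
df['day2'] = df['date'].dt.normalize()  # midnight-stamped date, comparable to get_day
df['month'] = df['date'].dt.month
df.head()  # check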
# Set df['date'] as the index and delete the column
df.index = df['date']
del df['date']  # since it's an index now, delete the column
df          # check
df.info()   # check
### date operations

# slicing by date requires a sorted (monotonic) DatetimeIndex
df = df.sort_index()

# Extract data from 2pm to 4pm on '2014-05-02': ts1
ts1 = df.loc['2014-05-02 14:00:00':'2014-05-02 16:00:00']
print(ts1)

# Extract '2014-05-02': ts2
ts2 = df.loc['2014-05-02']
print(ts2)

# Extract data from '2014-05-03' to '2014-05-05': ts3
ts3 = df.loc['2014-05-03':'2014-05-05']
print(ts3)
# Downsample to get totals within 2 hours
df1 = df['aphids'].resample('2h').sum()
print(df)   # compare
print(df1)  # check

# Downsample to get daily total aphid counts
df1 = df['aphids'].resample('D').sum()
print(df)   # compare
print(df1)

# get daily high counts
daily_highs = df['aphids'].resample('D').max()
print(daily_highs)

# get counts for April
april = df['aphids']['2014-Apr']
print(april)
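
# monthly totals via the index's period; a sketch that sidesteps resample
# frequency aliases entirely
monthly = df['aphids'].groupby(df.index.to_period('M')).sum()
print(monthly)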
#--------------------------------------
# missing data
#--------------------------------------

data = {'GARST':[150,'NaN',145,137,'NaN',145,149,'NaN',157,161],
        'PIO':[160,150,146,138,142,146,150,154,158,162],
        'MYC':[137,148,151,139,143,'NaN',115,136,130,129],
        'DEK':[150,149,145,140,144,148,152,156,160,164],
        'WYF':[110,125,135,122,'NaN',132,130,'NaN',147,119],
        'PLOT':[1,2,3,4,5,6,7,8,9,10],
        'BT':['Y','Y','N','N','N','N','Y','N','Y','Y'],
        'RR':['Y','N','Y','N','N','N','N','Y','Y','N'],
        'ID':[1,2,3,4,5,6,7,8,9,10]
       }

# convert to a data frame
df = pd.DataFrame(data, columns=['ID','GARST','PIO','MYC','DEK','WYF','PLOT','BT','RR'])
print(df)
df.info()

# cleanup: coerce the 'NaN' strings to true missing values
df['GARST'] = pd.to_numeric(df['GARST'], errors='coerce')
df['WYF'] = pd.to_numeric(df['WYF'], errors='coerce')
df['MYC'] = pd.to_numeric(df['MYC'], errors='coerce')

df.describe()  # check out our toy data
df.isnull().sum(axis=0)  # count of missing values by variable in the data frame
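
# common next steps for missing values; a sketch, not a recommendation for any
# real analysis (GARST_f is a throwaway name for illustration)
df_complete = df.dropna()                               # drop rows with any NaN
df['GARST_f'] = df['GARST'].fillna(df['GARST'].mean())  # mean-impute a single column
df[['GARST', 'GARST_f']]  # check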
#------------------------------------
# loops and functions
#------------------------------------

### loop over a list
garst = [150, 140, 145, 137, 141]
for yields in garst:
    print(yields)

### basic while loop
x = 1
while x < 4:
    print(x)
    x = x + 1

# define a function
def square(num):
    new_value = num ** 2
    return new_value

square(5)  # call function
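
# a user-defined function applies to a data frame column via .apply; a quick
# sketch (re-uses the missing-data df above, so NaNs stay NaN)
df['GARST_sq'] = df['GARST'].apply(square)
df[['GARST', 'GARST_sq']].head()  # check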