Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created February 22, 2017 02:32
Show Gist options
  • Save BioSciEconomist/904ace29e204ca5308ade5ed0cd68920 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/904ace29e204ca5308ade5ed0cd68920 to your computer and use it in GitHub Desktop.
Basic Statistics in Python
#------------------------------------------------------------------
# PROGRAM NAME: python basic stats.py
# DATE: 2/20/16
# CREATED BY: MATT BOGARD
# PROJECT FILE:
#----------------------------------------------------------------
# PURPOSE: BASIC STATS IN PYTHON
#---------------------------------------------------------------
import pandas as pd #primary package for data manipulation
# make a data frame manually: one row per plot, with a yield column for each
# of GARST, PIO, MYC and DEK plus Y/N flags for the BT and RR traits
data = {
    'GARST': [150, 140, 145, 137, 141, 145, 149, 153, 157, 161],
    'PIO': [160, 150, 146, 138, 142, 146, 150, 154, 158, 162],
    'MYC': [137, 148, 151, 139, 143, 120, 115, 136, 130, 129],
    'DEK': [150, 149, 145, 140, 144, 148, 152, 156, 160, 164],
    'PLOT': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'BT': ['Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'N', 'Y', 'Y'],
    'RR': ['Y', 'N', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'N'],
}
yield_data = pd.DataFrame(
    data, columns=['GARST', 'PIO', 'MYC', 'DEK', 'PLOT', 'BT', 'RR'])
# create gmo trait field
# set a default value
yield_data['GMO'] = 'Non-GMO '
# Boolean masks for each trait. Assignments below go through .loc so they
# write to yield_data itself; the chained form df['GMO'][mask] = ... assigns
# to a temporary and is not guaranteed to update the frame.
has_rr = yield_data['RR'] == 'Y'
has_bt = yield_data['BT'] == 'Y'
# check for RR or BT traits (at least one present)
yield_data.loc[has_rr | has_bt, 'GMO'] = 'Single Trait '
# check for double stacked traits (both present) - must come last so it
# overrides the single-trait label
yield_data.loc[has_rr & has_bt, 'GMO'] = 'Stacked Trait'
#----------------------------------
# descriptive statistics
#----------------------------------
#from pandas
#descriptives (summary statistics for the numeric columns)
yield_data.describe()
# descriptives by group
groupby_gmo = yield_data.groupby('GMO')
# numeric_only=True skips the Y/N string columns (BT, RR); without it
# pandas >= 2.0 raises a TypeError when it tries to average them
groupby_gmo.mean(numeric_only=True)
#frequencies of each GMO category, ordered by label
yield_data.GMO.value_counts().sort_index()
#------------------------------
# graphics
#------------------------------
import numpy as np
import matplotlib.pyplot as plt
# histogram of GARST yields in 3 bins; hist must be called through plt,
# since a bare hist() is never imported and raises NameError
plt.hist(yield_data['GARST'], bins=3, color='green')
# scatter plot of GARST (x axis) against PIO (y axis)
yield_data.plot(x='GARST', y='PIO', kind='scatter')
#-----------------------------------
# t-tests
#-----------------------------------
from scipy import stats
# independent two-sample test, GARST vs PIO; result holds the t-stat and pval
stats.ttest_ind(yield_data['GARST'], yield_data['PIO'])
# Welch's two-sample test (variances not assumed equal)
stats.ttest_ind(yield_data['GARST'], yield_data['PIO'], equal_var=False)
# same two columns, treated as paired observations
stats.ttest_rel(yield_data['GARST'], yield_data['PIO'])
#-----------------------------------
# correlations
#-----------------------------------
# from pandas: correlation matrix for the GARST and PIO columns
print(yield_data.loc[:, ['GARST', 'PIO']].corr())
#-----------------------------------
# regression
#-----------------------------------
import statsmodels.api as sm
import statsmodels.formula.api as smf
# regression using smf (formula interface): GARST regressed on PIO
results = smf.ols('GARST ~ PIO', data=yield_data).fit()
# inspect results
# print() call form works on both Python 2 and 3; the statement form
# `print results.summary()` is a SyntaxError on Python 3
print(results.summary())
# regression using sm (array interface)
X = yield_data[['PIO']]
y = yield_data[['GARST']]
X.head()
y.head()
## fit an OLS model of GARST on PIO with an intercept
# sm.OLS only fits an intercept if a constant column is added to X
X = sm.add_constant(X)
X.head()
est = sm.OLS(y, X).fit()
est.summary()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment