panicpotatoe · May 21, 2019 17:59
diff --git a/chi_square_example_01.py b/chi_square_example_01.py
 # -*- coding: utf-8 -*-
 """
 Created on Mon Mar 11 10:57:18 2019

 @author: Nhan Tran
 """

 import numpy as np
 import pandas as pd
 import scipy.stats as stats

 """ METHODOLOGY 01: MANUAL CALCULATION """
 # STEP 1: GENERATE A RANDOM DATASET
 # Fix the seed of NumPy's global random generator so that every run draws
 # the same sample.
 # https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.seed.html
 np.random.seed(10)

 # Draw 1000 voter races at fixed probabilities
 voter_race = np.random.choice(a=["asian", "black", "hispanic", "other", "white"],
                               p=[0.05, 0.15, 0.25, 0.05, 0.5],
                               size=1000)

 # Draw 1000 party affiliations at fixed probabilities (a separate draw, so
 # it does not depend on voter_race)
 voter_party = np.random.choice(a=["democrat", "independent", "republican"],
                                p=[0.4, 0.2, 0.4],
                                size=1000)

 # Bind the 2 arrays (voter_race and voter_party) into a DataFrame
 voters = pd.DataFrame({"race": voter_race,
                        "party": voter_party})
 # In an interactive session, evaluating the name displays the DataFrame
 voters

 # Cross-tabulate race against party. margins=True appends an "All" row and
 # an "All" column holding the totals. Only those two labels are renamed
 # (instead of overwriting every label by position), so the labels always
 # match the categories that were actually drawn.
 voter_tab = pd.crosstab(voters.race, voters.party, margins=True)
 voter_tab = voter_tab.rename(index={"All": "col_totals"},
                              columns={"All": "row_totals"})
 # In an interactive session, evaluating the name displays the CrossTab
 voter_tab

 # STEP 2: GET THE "OBSERVED" TABLE AND "EXPECTED" TABLE
 # "Observed" table: the CrossTab without its totals.
 # margins=True put the row totals in the last column and the column totals
 # in the last row, so dropping the last row and the last column
 # (iloc[:-1, :-1]) leaves only the race x party counts.
 observed = voter_tab.iloc[:-1, :-1]
 # In an interactive session, evaluating the name displays the table
 observed

 # "Expected" table: the counts expected if race and party were independent:
 #     expected[i, j] = row_total[i] x column_total[j] / total_observations
 # where
 #     - the row totals are the column voter_tab["row_totals"]
 #     - the column totals are the row voter_tab.loc["col_totals"]
 #       (.loc selects a row by its label)
 #     - total_observations is the grand total in the bottom-right cell
 # np.outer multiplies every row total by every column total in one call.
 total_observations = voter_tab.loc["col_totals", "row_totals"]
 expected = np.outer(voter_tab["row_totals"].iloc[:-1],
                     voter_tab.loc["col_totals"].iloc[:-1]) / total_observations
 # Convert into a DataFrame that carries the same labels as "observed", so
 # the element-wise arithmetic below lines the cells up correctly
 expected = pd.DataFrame(expected,
                         index=observed.index,
                         columns=observed.columns)
 # In an interactive session, evaluating the name displays the table
 expected

 # STEP 3: CALCULATE THE CHI SQUARE VALUE and CRITICAL VALUE
 # Chi square formula:
 #     chi square = total of [(observed - expected)^2 / expected]
 # Note: .sum() is called twice: once to get the column sums and a second
 #     time to add the column sums together, giving the sum of the whole
 #     2D table.
 chi_squared_stat = (((observed - expected) ** 2) / expected).sum().sum()
 print(chi_squared_stat)

 # Degrees of freedom (df) = (number of rows - 1) x (number of columns - 1)
 #                         = (5 - 1) x (3 - 1) = 8 for this table.
 # It is derived from the table shape so it stays right if categories change.
 degrees_of_freedom = (observed.shape[0] - 1) * (observed.shape[1] - 1)

 # Critical value for a confidence level of 95%
 crit = stats.chi2.ppf(q=0.95, df=degrees_of_freedom)
 print(crit)

 # Calculate p-value (just additional calculation).
 # sf is the survival function (1 - cdf), evaluated directly.
 p_value = stats.chi2.sf(chi_squared_stat, df=degrees_of_freedom)
 print(p_value)

 # STEP 4: MAKE THE CONCLUSION
 # Null hypothesis (H0): race and party are INDEPENDENT.
 # Because chi_squared_stat < crit, we FAIL TO REJECT the null hypothesis:
 # the data give no evidence that the two factors are related.

 """ METHODOLOGY 02: CALCULATE USING SCIPY.STATS LIBRARY"""
 # Keep the result under its own name: rebinding `stats` here would shadow
 # the scipy.stats module and break any later stats.* call.
 chi2_result = stats.chi2_contingency(observed=observed)
 # The returned data includes: chi_squared_stat, p_value, df, expected_crosstab
 print(chi2_result)
	# -*- coding: utf-8 -*-
	"""
	Created on Mon Mar 11 10:57:18 2019

	@author: Nhan Tran
	"""

	import numpy as np
	import pandas as pd
	import scipy.stats as stats

	""" METHODOLOGY 01: MANUAL CALCULATION """
	# STEP 1: GENERATE A RANDOM DATASET
	# Fix the seed of NumPy's global random generator so that every run draws
	# the same sample.
	# https://docs.scipy.org/doc/numpy/reference/generated/numpy.random.seed.html
	np.random.seed(10)

	# Draw 1000 voter races at fixed probabilities
	voter_race = np.random.choice(a=["asian", "black", "hispanic", "other", "white"],
	                              p=[0.05, 0.15, 0.25, 0.05, 0.5],
	                              size=1000)

	# Draw 1000 party affiliations at fixed probabilities (a separate draw, so
	# it does not depend on voter_race)
	voter_party = np.random.choice(a=["democrat", "independent", "republican"],
	                               p=[0.4, 0.2, 0.4],
	                               size=1000)

	# Bind the 2 arrays (voter_race and voter_party) into a DataFrame
	voters = pd.DataFrame({"race": voter_race,
	                       "party": voter_party})
	# In an interactive session, evaluating the name displays the DataFrame
	voters

	# Cross-tabulate race against party. margins=True appends an "All" row and
	# an "All" column holding the totals. Only those two labels are renamed
	# (instead of overwriting every label by position), so the labels always
	# match the categories that were actually drawn.
	voter_tab = pd.crosstab(voters.race, voters.party, margins=True)
	voter_tab = voter_tab.rename(index={"All": "col_totals"},
	                             columns={"All": "row_totals"})
	# In an interactive session, evaluating the name displays the CrossTab
	voter_tab

	# STEP 2: GET THE "OBSERVED" TABLE AND "EXPECTED" TABLE
	# "Observed" table: the CrossTab without its totals.
	# margins=True put the row totals in the last column and the column totals
	# in the last row, so dropping the last row and the last column
	# (iloc[:-1, :-1]) leaves only the race x party counts.
	observed = voter_tab.iloc[:-1, :-1]
	# In an interactive session, evaluating the name displays the table
	observed

	# "Expected" table: the counts expected if race and party were independent:
	#     expected[i, j] = row_total[i] x column_total[j] / total_observations
	# where
	#     - the row totals are the column voter_tab["row_totals"]
	#     - the column totals are the row voter_tab.loc["col_totals"]
	#       (.loc selects a row by its label)
	#     - total_observations is the grand total in the bottom-right cell
	# np.outer multiplies every row total by every column total in one call.
	total_observations = voter_tab.loc["col_totals", "row_totals"]
	expected = np.outer(voter_tab["row_totals"].iloc[:-1],
	                    voter_tab.loc["col_totals"].iloc[:-1]) / total_observations
	# Convert into a DataFrame that carries the same labels as "observed", so
	# the element-wise arithmetic below lines the cells up correctly
	expected = pd.DataFrame(expected,
	                        index=observed.index,
	                        columns=observed.columns)
	# In an interactive session, evaluating the name displays the table
	expected

	# STEP 3: CALCULATE THE CHI SQUARE VALUE and CRITICAL VALUE
	# Chi square formula:
	#     chi square = total of [(observed - expected)^2 / expected]
	# Note: .sum() is called twice: once to get the column sums and a second
	#     time to add the column sums together, giving the sum of the whole
	#     2D table.
	chi_squared_stat = (((observed - expected) ** 2) / expected).sum().sum()
	print(chi_squared_stat)

	# Degrees of freedom (df) = (number of rows - 1) x (number of columns - 1)
	#                         = (5 - 1) x (3 - 1) = 8 for this table.
	# It is derived from the table shape so it stays right if categories change.
	degrees_of_freedom = (observed.shape[0] - 1) * (observed.shape[1] - 1)

	# Critical value for a confidence level of 95%
	crit = stats.chi2.ppf(q=0.95, df=degrees_of_freedom)
	print(crit)

	# Calculate p-value (just additional calculation).
	# sf is the survival function (1 - cdf), evaluated directly.
	p_value = stats.chi2.sf(chi_squared_stat, df=degrees_of_freedom)
	print(p_value)

	# STEP 4: MAKE THE CONCLUSION
	# Null hypothesis (H0): race and party are INDEPENDENT.
	# Because chi_squared_stat < crit, we FAIL TO REJECT the null hypothesis:
	# the data give no evidence that the two factors are related.

	""" METHODOLOGY 02: CALCULATE USING SCIPY.STATS LIBRARY"""
	# Keep the result under its own name: rebinding `stats` here would shadow
	# the scipy.stats module and break any later stats.* call.
	chi2_result = stats.chi2_contingency(observed=observed)
	# The returned data includes: chi_squared_stat, p_value, df, expected_crosstab
	print(chi2_result)
No results found