gsampath127 · October 14, 2019 17:35
diff --git a/chi.py b/chi.py
 #!/usr/bin/env python
 # coding: utf-8

 # ## Chi-Square test for bank churn prediction (finding patterns in which customers leave the bank). Only a few columns are considered here to keep things clear.

 # ### Import libraries

 # In[2]:


 import numpy as numpy
 import pandas as pd
 import seaborn as sns
 from sklearn.preprocessing import LabelEncoder


 # ### Load the data

 # In[6]:


 churn_df = pd.read_csv('bank.csv')


 # In[7]:


 churn_df.head()


 # ### The data has 4 categorical predictors and one categorical response. The response column, Exited, represents whether the customer left the bank or not.

 # ## Before performing the Chi-Square test, make sure the data is label encoded.

 # In[9]:


 # One encoder is reused: it is refit for each column, so after this cell
 # it holds the classes of the last column encoded (Gender).
 label_encoder = LabelEncoder()
 geography_codes = label_encoder.fit_transform(churn_df['Geography'])
 churn_df['Geography'] = geography_codes
 gender_codes = label_encoder.fit_transform(churn_df['Gender'])
 churn_df['Gender'] = gender_codes


 # In[11]:


 churn_df.head()


 # ## Chi-Square test

 # In[13]:


 from sklearn.feature_selection import chi2


 # In[14]:


 # Response column on one side, every remaining column as a predictor.
 y = churn_df['Exited']
 X = churn_df.drop(columns='Exited')


 # In[15]:


 chi_scores = chi2(X, y)


 # In[16]:


 chi_scores


 # ### The first array holds the chi-square values and the second array holds the p-values

 # In[17]:


 # Label each p-value with its column name and order them from largest to smallest.
 p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)


 # In[19]:


 p_values.plot.bar()


 # ### Since HasCrCard has a high p-value, the test indicates that this variable is independent of the response, so it need not be considered for model training

 # In[ ]:




diff --git a/gistfile1.txt b/gistfile1.txt
 #!/usr/bin/env python
 # coding: utf-8

 # ## Chi-Square test for bank churn prediction (finding patterns in which customers leave the bank). Only a few columns are considered here to keep things clear.

 # ### Import libraries

 # In[2]:


 import numpy as numpy
 import pandas as pd
 import seaborn as sns
 from sklearn.preprocessing import LabelEncoder


 # ### Load the data

 # In[6]:


 churn_df = pd.read_csv('bank.csv')


 # In[7]:


 churn_df.head()


 # ### The data has 4 categorical predictors and one categorical response. The response column, Exited, represents whether the customer left the bank or not.

 # ## Before performing the Chi-Square test, make sure the data is label encoded.

 # In[9]:


 # One encoder is reused: it is refit for each column, so after this cell
 # it holds the classes of the last column encoded (Gender).
 label_encoder = LabelEncoder()
 geography_codes = label_encoder.fit_transform(churn_df['Geography'])
 churn_df['Geography'] = geography_codes
 gender_codes = label_encoder.fit_transform(churn_df['Gender'])
 churn_df['Gender'] = gender_codes


 # In[11]:


 churn_df.head()


 # ## Chi-Square test

 # In[13]:


 from sklearn.feature_selection import chi2


 # In[14]:


 # Response column on one side, every remaining column as a predictor.
 y = churn_df['Exited']
 X = churn_df.drop(columns='Exited')


 # In[15]:


 chi_scores = chi2(X, y)


 # In[16]:


 chi_scores


 # ### The first array holds the chi-square values and the second array holds the p-values

 # In[17]:


 # Label each p-value with its column name and order them from largest to smallest.
 p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)


 # In[19]:


 p_values.plot.bar()


 # ### Since HasCrCard has a high p-value, the test indicates that this variable is independent of the response, so it need not be considered for model training

 # In[ ]:
	#!/usr/bin/env python
	# coding: utf-8

	# ## Chi-Square test for bank churn prediction (finding patterns in which customers leave the bank). Only a few columns are considered here to keep things clear.

	# ### Import libraries

	# In[2]:


	import numpy as numpy
	import pandas as pd
	import seaborn as sns
	from sklearn.preprocessing import LabelEncoder


	# ### Load the data

	# In[6]:


	churn_df = pd.read_csv('bank.csv')


	# In[7]:


	churn_df.head()


	# ### The data has 4 categorical predictors and one categorical response. The response column, Exited, represents whether the customer left the bank or not.

	# ## Before performing the Chi-Square test, make sure the data is label encoded.

	# In[9]:


	# One encoder is reused: it is refit for each column, so after this cell
	# it holds the classes of the last column encoded (Gender).
	label_encoder = LabelEncoder()
	geography_codes = label_encoder.fit_transform(churn_df['Geography'])
	churn_df['Geography'] = geography_codes
	gender_codes = label_encoder.fit_transform(churn_df['Gender'])
	churn_df['Gender'] = gender_codes


	# In[11]:


	churn_df.head()


	# ## Chi-Square test

	# In[13]:


	from sklearn.feature_selection import chi2


	# In[14]:


	# Response column on one side, every remaining column as a predictor.
	y = churn_df['Exited']
	X = churn_df.drop(columns='Exited')


	# In[15]:


	chi_scores = chi2(X, y)


	# In[16]:


	chi_scores


	# ### The first array holds the chi-square values and the second array holds the p-values

	# In[17]:


	# Label each p-value with its column name and order them from largest to smallest.
	p_values = pd.Series(chi_scores[1], index=X.columns).sort_values(ascending=False)


	# In[19]:


	p_values.plot.bar()


	# ### Since HasCrCard has a high p-value, the test indicates that this variable is independent of the response, so it need not be considered for model training

	# In[ ]: