# Kaggle Survey 2020: mining association rules among respondents' tool choices with apyori
import numpy as np
import pandas as pd
from apyori import apriori
from termcolor import colored
data = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv",header=1)
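# header=1 makes pandas use the second CSV row (the full question text) as column names;
# those names carry the '- Selected Choice -' marker that the filtering below relies on.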
#Excluding Columns having 'None' and 'Other' as responses. Also excluding questions about social media, courses and future plans.
cols = pd.Series(data.columns)
product_data = data.loc[:, cols[(cols.str.contains('- Selected Choice -', regex=False)) &
                                (cols.str.contains('(None$)|(Other$)|activities|courses|(media sources)|(you hope)', regex=True) == False)]].copy()
#Identifying column names having duplicates. For example MATLAB is listed under IDEs and Languages
dupes = pd.DataFrame()
for i in product_data.columns:
    dupes = pd.concat([dupes, pd.DataFrame({"Duplicate": pd.Series(product_data.loc[:, i].dropna().unique()).apply(lambda x: x.strip()),
                                            "Column Name": i})])
dupes[dupes.duplicated(subset="Duplicate")]
#Identifying column numbers having duplicates, for use with iloc below.
dupes = pd.DataFrame()
for i in range(product_data.shape[1]):
    dupes = pd.concat([dupes, pd.DataFrame({"Duplicate": pd.Series(product_data.iloc[:, i].dropna().unique()).apply(lambda x: x.strip()),
                                            "Column Number": i})])
dupes[dupes.duplicated(subset="Duplicate")]
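# Renaming the duplicated items found above so they stay distinguishable across questions:
# column 20 lists MATLAB among IDEs, and column 168 lists Shiny among publicly-shared tools.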
product_data.iloc[:,20] = product_data.iloc[:,20].str.replace("MATLAB","MATLAB IDE",regex=False)
product_data.iloc[:,168] = product_data.iloc[:,168].str.replace("Shiny","Shiny (Publicly share)",regex=False)
# Converting the data frame of transactions to a list of lists of transactions
product_list = list()
for t in range(len(product_data)):
    product_list.append([str(product_data.values[t, c]) for c in range(product_data.shape[1])])
print(product_list[0])
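# Unanswered questions appear as the string 'nan' in each transaction; rules involving
# 'nan' are filtered out of rules_df after mining.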
MIN_SUPPORT = 0.02
MIN_CONFIDENCE = 0.6
MIN_LIFT = 3
MAX_LENGTH = 2
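# support: share of respondents whose answers contain every item in a rule;
# confidence: P(consequent | antecedent); lift: confidence / support(consequent).
# MAX_LENGTH = 2 restricts itemsets to pairs, i.e. one antecedent item and one consequent item.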
rules = apriori(product_list,min_support=MIN_SUPPORT,min_confidence=MIN_CONFIDENCE,min_lift=MIN_LIFT,max_length=MAX_LENGTH)
rules = list(rules)
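# Each entry in `rules` is an apyori RelationRecord of the form
# (items, support, ordered_statistics), where ordered_statistics[0] holds
# (items_base, items_add, confidence, lift); the indexing below unpacks these fields.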
rules_df = pd.DataFrame()
for i in range(len(rules)):
    rules_df = pd.concat([rules_df,
                          pd.DataFrame({"Antecedent": " AND ".join(list(rules[i][2][0][0])),
                                        "Consequent": list(rules[i][2][0][1])[0],
                                        "Support": np.round(float(rules[i][1]), 3),
                                        "Confidence": np.round(float(rules[i][2][0][2]), 3),
                                        "Lift": np.round(float(rules[i][2][0][3]), 3)}, index=[i])])
rules_df = rules_df.loc[(rules_df["Antecedent"].str.contains("( nan)|(nan )",regex=True)==False) & \
(rules_df["Consequent"] != "nan")].sort_values(by=["Lift","Confidence"],ascending=[False,False]).drop_duplicates().reset_index(drop=True).copy()
rules_df.head()
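# Optional sanity check: recompute the top rule's support, confidence and lift directly
# from the raw transactions and compare them with apyori's numbers. A minimal sketch;
# it assumes rules_df has at least one row and that item names never contain " AND ".
if len(rules_df) > 0:
    ante = set(rules_df.iloc[0, 0].split(" AND "))  # antecedent items of the top rule
    cons = rules_df.iloc[0, 1]                      # consequent item of the top rule
    n_total = len(product_list)
    n_ante = sum(ante.issubset(t) for t in map(set, product_list))
    n_both = sum(ante.issubset(t) and cons in t for t in map(set, product_list))
    n_cons = sum(cons in t for t in product_list)
    print("support:", round(n_both / n_total, 3),
          "confidence:", round(n_both / n_ante, 3),
          "lift:", round((n_both / n_ante) / (n_cons / n_total), 3))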
# Printing rules whose confidence is at least 80%
rules_df.sort_values(by='Confidence', ascending=False, inplace=True)
n = 1
for i in range(len(rules_df)):
    if (rules_df.iloc[i, 3] * 100) < 80:
        break
    print(f'{n}. {colored(np.round(rules_df.iloc[i,3]*100,1),"grey","on_cyan")}{colored("%","grey","on_cyan")} of the respondents who use {colored(rules_df.iloc[i,0],"grey","on_cyan")} also use {colored(rules_df.iloc[i,1],"grey","on_cyan")}\n')
    n += 1