Last active
February 20, 2021 14:02
-
-
Save ksv-muralidhar/15d249f093c222c05014f24dff69d323 to your computer and use it in GitHub Desktop.
kaggle survey 2020
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from apyori import apriori | |
from termcolor import colored | |
# Load the survey responses; header=1 takes the column names from the second
# row of the CSV file.
data = pd.read_csv(
    filepath_or_buffer="../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv",
    header=1,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Keep only the multiple-choice columns (header contains "- Selected Choice -"),
# excluding columns whose header ends in "None" or "Other" and the questions
# about activities, courses, media sources and future plans ("you hope").
column_names = data.columns.astype(str)
is_choice_column = np.asarray(
    column_names.str.contains("- Selected Choice -", regex=False), dtype=bool
)
# Non-capturing groups (?:...) are used because capturing groups make
# str.contains emit a "pattern has match groups" UserWarning.
is_excluded_column = np.asarray(
    column_names.str.contains(
        r"(?:None$)|(?:Other$)|activities|courses|(?:media sources)|(?:you hope)",
        regex=True,
    ),
    dtype=bool,
)
# .copy() so that later in-place edits do not touch (or warn about) `data`.
product_data = data.loc[:, is_choice_column & ~is_excluded_column].copy()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Identify responses that are listed under more than one question (for example
# MATLAB appears under both IDEs and languages). Each row of `dupes` holds one
# distinct response together with the name and the position of its column.
# The per-column frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
dupe_frames = []
for col_number, col_name in enumerate(product_data.columns):
    responses = pd.Series(product_data.iloc[:, col_number].dropna().unique())
    dupe_frames.append(
        pd.DataFrame(
            {
                "Duplicate": responses.apply(lambda x: x.strip()),
                "Column Name": col_name,
                "Column Number": col_number,
            }
        )
    )
if dupe_frames:
    dupes = pd.concat(dupe_frames)
else:
    # No columns selected: keep the expected schema so the lookup below works.
    dupes = pd.DataFrame(columns=["Duplicate", "Column Name", "Column Number"])

# Rows whose response text was already seen in an earlier column.
duplicated_responses = dupes[dupes.duplicated(subset="Duplicate")]

# Rename the duplicated responses so they stay distinguishable.
# NOTE(review): column positions 20 and 168 are hard-coded; confirm them
# against duplicated_responses["Column Number"] if the column selection changes.
product_data.iloc[:, 20] = product_data.iloc[:, 20].str.replace("MATLAB", "MATLAB IDE", regex=False)
product_data.iloc[:, 168] = product_data.iloc[:, 168].str.replace("Shiny", "Shiny (Publicly share)", regex=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert the data frame of transactions to a list of lists of transactions:
# one inner list per row, every cell converted with str() (so a NaN cell
# becomes the string "nan").
# The underlying array is fetched once; the previous version re-evaluated
# product_data.values for every single cell.
cell_values = product_data.values
product_list = [[str(cell) for cell in row] for row in cell_values]
if product_list:  # nothing to preview when the data frame has no rows
    print(product_list[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Thresholds passed to the Apriori association-rule search.
MIN_SUPPORT = 0.02
MIN_CONFIDENCE = 0.6
MIN_LIFT = 3
MAX_LENGTH = 2
rules = apriori(
    product_list,
    min_support=MIN_SUPPORT,
    min_confidence=MIN_CONFIDENCE,
    min_lift=MIN_LIFT,
    max_length=MAX_LENGTH,
)
rules = list(rules)

# Flatten the result into one row per rule. Rows are gathered in a list and
# turned into a data frame once, because DataFrame.append was removed in
# pandas 2.0.
RULE_COLUMNS = ["Antecedent", "Consequent", "Support", "Confidence", "Lift"]
rule_rows = []
for record in rules:
    # A record can carry several ordered statistics (e.g. A -> B and B -> A);
    # reading only the first one silently dropped the others.
    for stat in record.ordered_statistics:
        # "nan" is the stringified missing answer, not a product. Testing the
        # item sets directly also catches a single-item antecedent that is
        # exactly "nan", which a " nan"/"nan " substring match misses.
        if "nan" in stat.items_base or "nan" in stat.items_add:
            continue
        rule_rows.append(
            {
                "Antecedent": " AND ".join(sorted(stat.items_base)),
                "Consequent": " AND ".join(sorted(stat.items_add)),
                "Support": np.round(float(record.support), 3),
                "Confidence": np.round(float(stat.confidence), 3),
                "Lift": np.round(float(stat.lift), 3),
            }
        )

# Passing the columns explicitly keeps the schema when no rule qualifies.
rules_df = (
    pd.DataFrame(rule_rows, columns=RULE_COLUMNS)
    .sort_values(by=["Lift", "Confidence"], ascending=[False, False])
    .drop_duplicates()
    .reset_index(drop=True)
)
rules_df.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print, strongest first, every rule whose confidence is at least 80%.
MIN_REPORTED_CONFIDENCE_PCT = 80
HIGHLIGHT = ("grey", "on_cyan")  # text colour, background colour

rules_df.sort_values(by='Confidence', ascending=False, inplace=True)
# Columns are read by name rather than by position so the report does not
# depend on the column order of rules_df.
rule_fields = zip(rules_df["Antecedent"], rules_df["Consequent"], rules_df["Confidence"])
for n, (antecedent, consequent, confidence) in enumerate(rule_fields, start=1):
    confidence_pct = confidence * 100
    if confidence_pct < MIN_REPORTED_CONFIDENCE_PCT:
        # Sorted by descending confidence: every remaining rule is weaker too.
        break
    print(f'{n}. {colored(np.round(confidence_pct,1),*HIGHLIGHT)}{colored("%",*HIGHLIGHT)} of the respondents who use {colored(antecedent,*HIGHLIGHT)} also use {colored(consequent,*HIGHLIGHT)}\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment