# Kaggle Survey 2020: mining association rules among respondents' tool choices with apyori
import numpy as np
import pandas as pd
from apyori import apriori
from termcolor import colored
data = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv",header=1)
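# header=1 makes pandas use the second CSV row (the full question text) as column names;
# those names carry the '- Selected Choice -' marker that the filtering below relies on.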
#Excluding Columns having 'None' and 'Other' as responses. Also excluding questions about social media, courses and future plans.
cols = pd.Series(data.columns)
product_data = data.loc[:, cols[(cols.str.contains('- Selected Choice -', regex=False)) &
                                (cols.str.contains('(None$)|(Other$)|activities|courses|(media sources)|(you hope)', regex=True) == False)]].copy()
#Identifying column names having duplicates. For example MATLAB is listed under IDEs and Languages
dupes = pd.DataFrame()
for i in product_data.columns:
    dupes = pd.concat([dupes, pd.DataFrame({"Duplicate": pd.Series(product_data.loc[:, i].dropna().unique()).apply(lambda x: x.strip()),
                                            "Column Name": i})])
dupes[dupes.duplicated(subset="Duplicate")]
#Identifying column numbers having duplicates, for use with iloc below.
dupes = pd.DataFrame()
for i in range(product_data.shape[1]):
    dupes = pd.concat([dupes, pd.DataFrame({"Duplicate": pd.Series(product_data.iloc[:, i].dropna().unique()).apply(lambda x: x.strip()),
                                            "Column Number": i})])
dupes[dupes.duplicated(subset="Duplicate")]
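# Renaming the duplicated items found above so they stay distinguishable across questions:
# column 20 lists MATLAB among IDEs, and column 168 lists Shiny among publicly-shared tools.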
product_data.iloc[:,20] = product_data.iloc[:,20].str.replace("MATLAB","MATLAB IDE",regex=False)
product_data.iloc[:,168] = product_data.iloc[:,168].str.replace("Shiny","Shiny (Publicly share)",regex=False)
# Converting the data frame of transactions to a list of lists of transactions
product_list = list()
for t in range(len(product_data)):
    product_list.append([str(product_data.values[t, c]) for c in range(product_data.shape[1])])
print(product_list[0])
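# Unanswered questions appear as the string 'nan' in each transaction; rules involving
# 'nan' are filtered out of rules_df after mining.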
MIN_SUPPORT = 0.02
MIN_CONFIDENCE = 0.6
MIN_LIFT = 3
MAX_LENGTH = 2
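# support: share of respondents whose answers contain every item in a rule;
# confidence: P(consequent | antecedent); lift: confidence / support(consequent).
# MAX_LENGTH = 2 restricts itemsets to pairs, i.e. one antecedent item and one consequent item.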
rules = apriori(product_list,min_support=MIN_SUPPORT,min_confidence=MIN_CONFIDENCE,min_lift=MIN_LIFT,max_length=MAX_LENGTH)
rules = list(rules)
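# Each entry in `rules` is an apyori RelationRecord of the form
# (items, support, ordered_statistics), where ordered_statistics[0] holds
# (items_base, items_add, confidence, lift); the indexing below unpacks these fields.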
rules_df = pd.DataFrame()
for i in range(len(rules)):
    rules_df = pd.concat([rules_df,
                          pd.DataFrame({"Antecedent": " AND ".join(list(rules[i][2][0][0])),
                                        "Consequent": list(rules[i][2][0][1])[0],
                                        "Support": np.round(float(rules[i][1]), 3),
                                        "Confidence": np.round(float(rules[i][2][0][2]), 3),
                                        "Lift": np.round(float(rules[i][2][0][3]), 3)}, index=[i])])
rules_df = rules_df.loc[(rules_df["Antecedent"].str.contains("( nan)|(nan )",regex=True)==False) & \
(rules_df["Consequent"] != "nan")].sort_values(by=["Lift","Confidence"],ascending=[False,False]).drop_duplicates().reset_index(drop=True).copy()
rules_df.head()
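# Optional sanity check: recompute the top rule's support, confidence and lift directly
# from the raw transactions and compare them with apyori's numbers. A minimal sketch;
# it assumes rules_df has at least one row and that item names never contain " AND ".
if len(rules_df) > 0:
    ante = set(rules_df.iloc[0, 0].split(" AND "))  # antecedent items of the top rule
    cons = rules_df.iloc[0, 1]                      # consequent item of the top rule
    n_total = len(product_list)
    n_ante = sum(ante.issubset(t) for t in map(set, product_list))
    n_both = sum(ante.issubset(t) and cons in t for t in map(set, product_list))
    n_cons = sum(cons in t for t in product_list)
    print("support:", round(n_both / n_total, 3),
          "confidence:", round(n_both / n_ante, 3),
          "lift:", round((n_both / n_ante) / (n_cons / n_total), 3))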
# Printing rules whose confidence is at least 80%
rules_df.sort_values(by='Confidence', ascending=False, inplace=True)
n = 1
for i in range(len(rules_df)):
    if (rules_df.iloc[i, 3] * 100) < 80:
        break
    print(f'{n}. {colored(np.round(rules_df.iloc[i,3]*100,1),"grey","on_cyan")}{colored("%","grey","on_cyan")} of the respondents who use {colored(rules_df.iloc[i,0],"grey","on_cyan")} also use {colored(rules_df.iloc[i,1],"grey","on_cyan")}\n')
    n += 1