Skip to content

Instantly share code, notes, and snippets.

@arditobryan
Created May 4, 2020 23:06
Show Gist options
  • Save arditobryan/01fb3fcbb126230c2929ff840f3dfe48 to your computer and use it in GitHub Desktop.
Save arditobryan/01fb3fcbb126230c2929ff840f3dfe48 to your computer and use it in GitHub Desktop.
Harry Potter Sorting Hat
#Generating the Data
#convert boxplots to mean, sd, alpha
def features_creator(boxplots):
    """Convert box-plot quartiles into ``[mean, sd, alpha]`` feature rows.

    Args:
        boxplots: sequence of rows whose first three items are the
            quartiles ``Q1, Q2, Q3`` of one box plot.

    Returns:
        A new list with one ``[mean, sd, alpha, ...]`` row per input row.
        Items after the third are carried over unchanged. The input is
        not modified.

    Raises:
        ZeroDivisionError: if a row has ``Q2 == Q1`` (alpha is undefined).
    """
    def boxplot_to_normal(Q1, Q2, Q3):
        iqr = Q3 - Q1
        # For a normal distribution the IQR is roughly 1.35 sd, so 3/4 of
        # the IQR is used here as an approximation of the sd.
        sd = iqr * (3 / 4)
        # Midpoint between the first and third quartiles.
        mean = Q1 + iqr / 2
        # Ratio of the upper half-box to the lower half-box: 1 for a
        # symmetric box, above 1 when the upper half is longer.
        alpha = (Q3 - Q2) / (Q2 - Q1)
        return mean, sd, alpha

    # Build fresh rows instead of aliasing ``boxplots``, so the caller's
    # quartile data is not overwritten.
    features = []
    for row in boxplots:
        mean, sd, alpha = boxplot_to_normal(row[0], row[1], row[2])
        features.append([mean, sd, alpha, *row[3:]])
    return features
# Box plots with the reference value at .4; a .1 step on the axis spans 206 pixels.
boxplots = [  # each row is [Q1, Q2, Q3], measured in pixels
    # Gryffindor: O, C, E, A, N
    [-64, 21, 85], [-167, -83, -19], [-311, -206, -60], [-144, 22, 85], [-211, -186, -81],
    # Hufflepuff: O, C, E, A, N
    [-41, 42, 105], [-147, -83, -19], [-373, -228, -101], [-106, 22, 85], [-333, -206, -81],
    # Ravenclaw: O, C, E, A, N
    [-64, 21, 85], [-147, -83, -19], [-351, -228, -123], [-44, 41, 126], [-333, -227, -101],
    # Slytherin: O, C, E, A, N
    [-188, -83, 23], [-188, -123, -60], [-373, -228, -141], [-64, 21, 85], [-352, -227, -101],
]

# Convert pixels to real numbers on the [0, .5] scale, updating each row in place.
for row in boxplots:
    row[:] = [(4+(px/206))/10 for px in row]

# Independent snapshot of the scaled quartiles.
boxplots_copy = [list(row) for row in boxplots]

# Turn each (Q1, Q2, Q3) row into (mean, sd, alpha).
features = features_creator(boxplots)
features
#we create the distributions based on the means and standard deviations
import pandas as pd
import numpy as np
from scipy.stats import skewnorm
def algo(features, classes, size):
    """Sample a labelled synthetic dataset from skew-normal distributions.

    Args:
        features: flat sequence of ``[mean, sd, alpha]`` rows, grouped by
            class: the first ``len(features) / len(classes)`` rows belong
            to ``classes[0]``, the next group to ``classes[1]``, and so on.
            ``alpha`` is the skew-normal shape; ``mean`` and ``sd`` are
            applied as shift and scale (``draw * sd + mean``).
        classes: sequence of class labels, one per group of rows.
        size: number of samples drawn per feature (rows per class).

    Returns:
        ``(X, y, all_norm)`` where ``X`` is a DataFrame with ``size`` rows
        per class and one column per feature of a class, ``y`` is a
        one-column DataFrame holding the matching class label for each row
        of ``X``, and ``all_norm`` is the list of raw sample arrays, one
        per row of ``features``.

    Raises:
        ValueError: if ``classes`` is empty or ``len(features)`` is not a
            multiple of ``len(classes)``.
    """
    classes_n = len(classes)
    if classes_n == 0:
        raise ValueError("classes must not be empty")
    column_n, leftover = divmod(len(features), classes_n)
    if leftover:
        raise ValueError(
            "len(features) must be a multiple of len(classes)"
        )

    # One array of `size` draws per feature row, in input order.
    all_norm = [
        skewnorm.rvs(row[2], size=size) * row[1] + row[0]
        for row in features
    ]

    # Features of one class sit side by side; classes are stacked
    # vertically in a single concat.
    frames = [pd.DataFrame(sample) for sample in all_norm]
    per_class = [
        pd.concat(frames[c * column_n:(c + 1) * column_n], axis=1)
        for c in range(classes_n)
    ]
    X = pd.concat(per_class, axis=0)

    # Each label repeated `size` times, in class order, matching the row
    # order of X. Built directly so no labels are written into an
    # int-typed placeholder column.
    y = pd.Series(list(classes)).repeat(size).reset_index(drop=True).to_frame()
    return X, y, all_norm
# House labels, in the same order as the per-house groups in `features`.
labels = ['Griffindor', 'Hufflepuff', 'Ravenclow', 'Slytherin']

# Column names of the final dataset: the five traits O, C, E, A, N.
columns = ['Openness', 'Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism']

# One single-item list per distribution (trait, house), in the same order as `features`.
# NOTE: the trait is spelled 'Agreebleness' here but 'Agreeableness' in `columns`.
names = [
    # Gryffindor: O, C, E, A, N
    ['Openness, Griffindor'],
    ['Conscientiousness, Griffindor'],
    ['Extroversion, Griffindor'],
    ['Agreebleness, Griffindor'],
    ['Neuroticism, Griffindor'],
    # Hufflepuff: O, C, E, A, N
    ['Openness, Hufflepuff'],
    ['Conscientiousness, Hufflepuff'],
    ['Extroversion, Hufflepuff'],
    ['Agreebleness, Hufflepuff'],
    ['Neuroticism, Hufflepuff'],
    # Ravenclaw: O, C, E, A, N
    ['Openness, Ravenclow'],
    ['Conscientiousness, Ravenclow'],
    ['Extroversion, Ravenclow'],
    ['Agreebleness, Ravenclow'],
    ['Neuroticism, Ravenclow'],
    # Slytherin: O, C, E, A, N
    ['Openness, Slytherin'],
    ['Conscientiousness, Slytherin'],
    ['Extroversion, Slytherin'],
    ['Agreebleness, Slytherin'],
    ['Neuroticism, Slytherin'],
]

# Each features row is read by algo as [mean, sd, alpha].
X, y, all_norm_copy = algo(features, labels, 1000000)

# Entire dataset: give X and y a fresh 0..n-1 index before joining them,
# otherwise the column-wise concat raises an error.
X = X.reset_index(drop=True)
X.columns = columns
y = y.reset_index(drop=True)
y.columns = ['House']
a = pd.concat([X, y], axis=1)
a.head()
#Graphing
import matplotlib.pyplot as plt
import seaborn as sns
# Pairwise scatter/density plots of the five traits, coloured by house.
sns.pairplot(a, hue="House", height=5)

#graphing all distributions
# Wrap every raw sample array in a one-column DataFrame titled with its
# (trait, house) name; the list entries are replaced in place.
for k, samples in enumerate(all_norm_copy):
    frame = pd.DataFrame(samples)
    frame.columns = names[k]
    all_norm_copy[k] = frame

# Join all columns with a single concat instead of growing the frame inside
# the loop, which would re-copy every accumulated column on each pass.
distributions_combined = pd.concat(all_norm_copy, axis=1) if all_norm_copy else pd.DataFrame()

# One histogram per distribution.
distributions_combined.hist(figsize=(15, 15), grid=False, bins=100)
#Training the Model
#scaling of my personality traits
values = [85, 111, 78, 47, 74]  # raw scores, one per trait (O, C, E, A, N)


def calc(x, max, scale=0.5):
    """Rescale a raw score ``x`` out of ``max`` onto the range ``[0, scale]``.

    The default ``scale`` of 0.5 matches the [0, .5] scale the box-plot
    quartiles are converted to before the training data is generated.
    NOTE(review): the previous 0-50 scale put these scores about 100 times
    above the training features; pass ``scale=50`` to get the old numbers.
    """
    return (x / max) * scale


# Raw scores are out of 120.
values = [calc(score, 120) for score in values]
values
#splitting the dataset
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=11, shuffle=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
#Gaussian Naive Bayes
import numpy as np
from sklearn.naive_bayes import GaussianNB
#creating the model
clf = GaussianNB()
#training the model
# y is a one-column DataFrame; flatten it to the 1-D target the estimator expects.
clf.fit(X_train, y_train.to_numpy().ravel())
#evaluating the model
print(clf.score(X_test, y_test.to_numpy().ravel()))
#predicting my results
# Give the single sample the same column names the model was fitted with.
print(clf.predict(pd.DataFrame([values], columns=X_train.columns)))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment