Created
May 4, 2020 23:06
-
-
Save arditobryan/01fb3fcbb126230c2929ff840f3dfe48 to your computer and use it in GitHub Desktop.
Harry Potter Sorting Hat
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Generating the Data | |
#convert boxplots to mean, sd, alpha | |
def features_creator(boxplots):
    """Convert box-plot quartiles into ``[mean, sd, alpha]`` rows.

    Each row of ``boxplots`` is read as ``[Q1, Q2, Q3]``. Its first three
    entries are replaced by:

    * ``mean``  -- the midpoint between Q1 and Q3,
    * ``sd``    -- three quarters of the inter-quartile range ``Q3 - Q1``,
    * ``alpha`` -- the ratio ``(Q3 - Q2) / (Q2 - Q1)``.

    The input is left untouched: a new list of new rows is returned, so the
    caller's quartile data stays usable after the call. Any entries beyond
    the third in a row are carried over unchanged.

    NOTE(review): a box whose median sits exactly between Q1 and Q3 yields
    ``alpha == 1``, not 0. If alpha is later used as a skew-normal shape
    parameter (where 0 means "no skew"), symmetric boxes will still be
    sampled as skewed -- confirm this is intended.

    Args:
        boxplots: sequence of rows, each holding at least Q1, Q2, Q3.

    Returns:
        A new list of lists, one ``[mean, sd, alpha, ...]`` row per input row.

    Raises:
        ZeroDivisionError: if a row has ``Q2 == Q1``.
        IndexError: if a row has fewer than three entries.
    """
    def boxplot_to_normal(Q1, Q2, Q3):
        iqr = Q3 - Q1
        sd = iqr * (3 / 4)
        mean = Q1 + iqr / 2
        alpha = (Q3 - Q2) / (Q2 - Q1)
        return mean, sd, alpha

    features = []
    for row in boxplots:
        # Work on a copy so the caller's row is never overwritten.
        converted = list(row)
        converted[0], converted[1], converted[2] = boxplot_to_normal(
            row[0], row[1], row[2]
        )
        features.append(converted)
    return features
# Box-plot readings in pixels, measured from the reference line at .4;
# 206 pixels correspond to a change of .1 on the score axis.
boxplots = [  # each entry is [Q1, Q2, Q3] in pixels
    # Griffindor: O, C, E, A, N
    [-64, 21, 85], [-167, -83, -19], [-311, -206, -60], [-144, 22, 85], [-211, -186, -81],
    # Hufflepuff: O, C, E, A, N
    [-41, 42, 105], [-147, -83, -19], [-373, -228, -101], [-106, 22, 85], [-333, -206, -81],
    # Ravenclow: O, C, E, A, N
    [-64, 21, 85], [-147, -83, -19], [-351, -228, -123], [-44, 41, 126], [-333, -227, -101],
    # Slytherin: O, C, E, A, N
    [-188, -83, 23], [-188, -123, -60], [-373, -228, -141], [-64, 21, 85], [-352, -227, -101],
]

# Turn every pixel offset into a real-valued score (scale [0, .5]),
# rewriting each row in place.
for quartiles in boxplots:
    quartiles[:] = [(4 + (pixel / 206)) / 10 for pixel in quartiles]

# Independent row-by-row snapshot of the scaled quartiles.
boxplots_copy = [list(quartiles) for quartiles in boxplots]

# Quartiles (Q1, Q2, Q3) -> distribution parameters (mean, sd, alpha).
features = features_creator(boxplots)
features
#we create the distributions based on the means, standard deviations and alphas
import pandas as pd | |
import numpy as np | |
from scipy.stats import skewnorm | |
def algo(features, classes, size):
    """Sample a labelled synthetic dataset from skew-normal distributions.

    ``features`` is read as ``len(classes)`` consecutive, equally sized groups
    of rows -- one group per class, in the order of ``classes``. Every row is
    used as ``[mean, sd, alpha]``: ``size`` values are drawn from
    ``skewnorm.rvs(alpha, size=size)``, then multiplied by ``sd`` and shifted
    by ``mean``. One draw is made per row, in row order.

    Args:
        features: sequence of ``[mean, sd, alpha]`` rows; its length must be a
            multiple of ``len(classes)``.
        classes: sequence of class labels, one per group of feature rows.
        size: number of samples drawn for every feature row.

    Returns:
        A tuple ``(X, y, all_norm_copy)``:

        * ``X`` -- DataFrame with ``size * len(classes)`` rows and
          ``len(features) // len(classes)`` columns; the rows of the first
          class come first, then the second class, and so on. Rows and
          columns carry default positional labels (0, 1, 2, ...).
        * ``y`` -- single-column DataFrame (column label ``0``) holding the
          class label of each row of ``X``, in the same order.
        * ``all_norm_copy`` -- list with one 1-D sample array per feature
          row, in the order of ``features``.

    Raises:
        ValueError: if ``features`` is empty or its length is not a multiple
            of ``len(classes)``.
        ZeroDivisionError: if ``classes`` is empty.
    """
    feature_n = len(features)
    classes_n = len(classes)

    # Integer split of the feature rows across classes; a remainder would
    # silently drop or misalign features, so it is rejected up front.
    column_n, leftover = divmod(feature_n, classes_n)
    if feature_n == 0 or leftover:
        raise ValueError(
            "len(features) must be a non-zero multiple of len(classes); "
            "got %d features for %d classes" % (feature_n, classes_n)
        )

    # One sample vector per feature row: scale by sd, shift by mean.
    all_norm = [
        (skewnorm.rvs(values[2], size=size) * values[1]) + values[0]
        for values in features
    ]

    # Per class: its feature vectors side by side -> (size, column_n).
    # Stacking the class blocks vertically gives the full design matrix in a
    # single allocation instead of growing a DataFrame inside a loop.
    class_blocks = [
        np.column_stack(all_norm[c * column_n:(c + 1) * column_n])
        for c in range(classes_n)
    ]
    X = pd.DataFrame(np.vstack(class_blocks))

    # Each label repeated `size` times, in class order, built directly with
    # the labels' own dtype (no int placeholder that is later overwritten).
    y = pd.DataFrame(pd.Series(list(classes)).repeat(size).to_numpy())

    return X, y, all_norm
labels = ['Griffindor', 'Hufflepuff', 'Ravenclow', 'Slytherin']
columns = ['Openness', 'Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism']

# One single-element title list per (house, trait) pair, house by house and
# in O, C, E, A, N order within each house. The trait spellings used for the
# titles are kept exactly as they were.
names = [
    [trait + ', ' + house]
    for house in labels
    for trait in ('Openness', 'Conscientiousness', 'Extroversion', 'Agreebleness', 'Neuroticism')
]

# Each features row is consumed by algo as [mean, sd, alpha].
X, y, all_norm_copy = algo(features, labels, 1000000)

# Entire dataset: indexes are reset first so that X and y line up
# row-for-row when they are joined column-wise.
X = X.reset_index(drop=True)
X.columns = columns
y = y.reset_index(drop=True)
y.columns = ['House']
a = pd.concat([X, y], axis=1)
a.head()
#Graphing
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise scatter/density plots of the five traits, coloured by house.
sns.pairplot(a, hue="House", height=5)

#graphing all distributions
# Wrap every sample vector in a one-column DataFrame titled with its
# "trait, house" name. The entries of all_norm_copy are replaced by these
# frames, as before.
frames = []
for k, samples in enumerate(all_norm_copy):
    frame = pd.DataFrame(samples)
    frame.columns = names[k]
    all_norm_copy[k] = frame
    frames.append(frame)

# Join all columns in ONE concat call: concatenating inside the loop would
# re-copy the ever-growing frame on each iteration.
distributions_combined = pd.concat(frames, axis=1) if frames else pd.DataFrame()

# One histogram per distribution.
distributions_combined.hist(figsize=(15, 15), grid=False, bins=100)
#Training the Model | |
#scaling of my personality traits
# Raw questionnaire scores, one per trait, each out of a maximum of 120.
values = [85, 111, 78, 47, 74]


def calc(x, max, scale=50):
    """Rescale ``x`` from the range [0, max] onto the range [0, scale].

    Args:
        x: raw score.
        max: largest possible raw score (the parameter name shadows the
            builtin; it is kept so existing callers keep working).
        scale: upper end of the target range. Defaults to 50, the value that
            was previously hard-coded.

    Returns:
        ``(x / max) * scale``.

    Raises:
        ZeroDivisionError: if ``max`` is 0.
    """
    return (x / max) * scale


# The features built from the box plots in this file are on a [0, .5] scale,
# so the personal scores are mapped onto that same range; scaling them to
# [0, 50] would put them about 100x outside the data the model is trained on.
values = [calc(score, 120, scale=0.5) for score in values]
values
#splitting the dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=11, shuffle=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#Gaussian Naive Bayes
#creating the model
clf = GaussianNB()

#training the model
# y is a single-column DataFrame; the estimator expects a 1-D target, so the
# column is flattened before fitting and scoring.
clf.fit(X_train, y_train.values.ravel())

#evaluating the model
print(clf.score(X_test, y_test.values.ravel()))

#predicting my results
# The query is passed as a one-row frame with the training column names so it
# matches the named features the model was fitted on.
print(clf.predict(pd.DataFrame([values], columns=X_train.columns)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment