arditobryan · May 4, 2020 23:06
diff --git a/Harry Potter Sorting Hat b/Harry Potter Sorting Hat
 #Generating the Data

 #convert boxplots to mean, sd, alpha
 def features_creator(boxplots):
     """Convert boxplot quartiles into (mean, sd, alpha) feature triples.

     Every row of ``boxplots`` starts with ``Q1, Q2, Q3``. In the returned
     row those three positions are replaced by:

     * mean  -- the midpoint of the box, ``Q1 + (Q3 - Q1) / 2``
     * sd    -- ``(Q3 - Q1) * 3 / 4`` (for a normal distribution the IQR is
       about 1.35 standard deviations, which 3/4 approximates)
     * alpha -- ``(Q3 - Q2) / (Q2 - Q1)``, the ratio of the upper to the
       lower half of the box

     Entries after the third, if any, are carried over unchanged.

     Args:
         boxplots: sequence of ``[Q1, Q2, Q3, ...]`` sequences.

     Returns:
         A new list of lists. ``boxplots`` and its rows are left untouched.

     Raises:
         ZeroDivisionError: if ``Q2 == Q1`` in any row.
         IndexError: if a row has fewer than three entries.
     """

     def boxplot_to_normal(Q1, Q2, Q3):
         box_height = Q3 - Q1
         sd = box_height * (3/4)
         mean = Q1 + box_height / 2
         # NOTE(review): a symmetric box yields alpha == 1; if alpha is later
         # used as a skew-normal shape parameter (where 0 means symmetric),
         # confirm this mapping is intended.
         alpha = (Q3 - Q2) / (Q2 - Q1)
         return mean, sd, alpha

     features = []
     for quartiles in boxplots:
         mean, sd, alpha = boxplot_to_normal(quartiles[0], quartiles[1], quartiles[2])
         # work on a copy of the row so the caller's data is not overwritten
         row = list(quartiles)
         row[:3] = [mean, sd, alpha]
         features.append(row)
     return features

 # Boxplots measured in pixels from the .4 reference line; a change of .1 spans 206 pixels
 boxplots = [  # Q1, Q2, Q3 in pixels
     # Griffindor: O, C, E, A, N
     [-64, 21, 85], [-167, -83, -19], [-311, -206, -60], [-144, 22, 85], [-211, -186, -81],
     # Hufflepuff: O, C, E, A, N
     [-41, 42, 105], [-147, -83, -19], [-373, -228, -101], [-106, 22, 85], [-333, -206, -81],
     # Ravenclow: O, C, E, A, N
     [-64, 21, 85], [-147, -83, -19], [-351, -228, -123], [-44, 41, 126], [-333, -227, -101],
     # Slytherin: O, C, E, A, N
     [-188, -83, 23], [-188, -123, -60], [-373, -228, -141], [-64, 21, 85], [-352, -227, -101],
 ]

 # convert the pixel offsets, in place, to real numbers on a [0, .5] scale
 for quartiles in boxplots:
     for position, pixels in enumerate(quartiles):
         quartiles[position] = (4+(pixels/206))/10

 # independent copy of the converted quartiles
 boxplots_copy = [list(quartiles) for quartiles in boxplots]
 # turn each (Q1, Q2, Q3) row into a (mean, sd, alpha) feature triple
 features = features_creator(boxplots)
 features

 #we create the distributions based on the means and standard deviations
 import pandas as pd
 import numpy as np
 from scipy.stats import skewnorm
 def algo(features, classes, size):
     """Draw a labelled synthetic dataset from skew-normal feature triples.

     ``features`` holds one ``(mean, sd, alpha)`` triple per (class, column)
     pair, grouped by class: the first ``len(features) // len(classes)``
     triples belong to ``classes[0]``, the next group to ``classes[1]``, and
     so on. For every triple, ``size`` values are drawn with
     ``skewnorm.rvs(alpha, size=size)`` and mapped through ``x * sd + mean``.

     NOTE(review): ``mean`` and ``sd`` act as location and scale of the draw;
     for ``alpha != 0`` the sample mean / standard deviation will differ from
     them. Confirm this is the intended parameterisation.

     Args:
         features: sequence of ``(mean, sd, alpha)`` triples.
         classes: sequence of class labels.
         size: number of rows to generate per class.

     Returns:
         A tuple ``(X, y, all_norm)``:

         * ``X`` -- DataFrame with ``size * len(classes)`` rows and
           ``len(features) // len(classes)`` columns; the rows of class ``c``
           occupy positions ``c*size`` to ``(c+1)*size - 1``. Both axes carry
           a default integer index.
         * ``y`` -- single-column DataFrame with the class label of each row
           of ``X``.
         * ``all_norm`` -- list with the raw 1-D sample array of every
           triple, in the order of ``features``.

     Raises:
         ValueError: if ``len(features)`` is not a multiple of
             ``len(classes)`` (the triples could not be split evenly).
         ZeroDivisionError: if ``classes`` is empty.
     """
     # one array of `size` draws per (mean, sd, alpha) triple
     all_norm = []
     for values in features:
         mean, sd, alpha = values[0], values[1], values[2]
         draws = skewnorm.rvs(alpha, size=size)
         all_norm.append(draws * sd + mean)

     feature_n = len(features)
     classes_n = len(classes)
     if feature_n % classes_n:
         raise ValueError(
             "len(features) must be a multiple of len(classes), got %d and %d"
             % (feature_n, classes_n)
         )
     column_n = feature_n // classes_n

     # Fill one preallocated matrix: the band of rows for class c receives the
     # c-th group of sample arrays, one array per column.
     data = np.empty((size * classes_n, column_n))
     for c in range(classes_n):
         band = slice(c * size, (c + 1) * size)
         for j in range(column_n):
             data[band, j] = all_norm[c * column_n + j]
     X = pd.DataFrame(data)

     # each label repeated `size` times, in class order; dtype=object keeps
     # the labels exactly as they were passed in
     y = pd.DataFrame(np.repeat(np.asarray(classes, dtype=object), size))

     return X, y, all_norm

 labels = ['Griffindor', 'Hufflepuff', 'Ravenclow', 'Slytherin']
 columns = ['Openness', 'Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism']

 # One single-item list per distribution, grouped by house and ordered
 # O, C, E, A, N within each house. Derived from `labels` and `columns` so the
 # plot titles cannot drift from the dataset column names.
 names = [[trait + ', ' + house] for house in labels for trait in columns]

 # each feature triple is (mean, sd, alpha)
 X, y, all_norm_copy = algo(features, labels, 1000000)

 # entire dataset
 X.reset_index(drop=True, inplace=True)
 X.columns = columns
 y.reset_index(drop=True, inplace=True)
 y.columns = ['House']
 # both indexes are reset first so the rows of X and y line up in the concat
 a = pd.concat([X, y], axis=1)
 a.head()

 #Graphing

 import matplotlib.pyplot as plt
 import seaborn as sns

 sns.pairplot(a, hue="House", height=5)

 # graphing all distributions

 # wrap every sample array in a one-column DataFrame titled from `names`
 for position, samples in enumerate(all_norm_copy):
     titled = pd.DataFrame(samples)
     titled.columns = names[position]
     all_norm_copy[position] = titled

 # put the titled columns side by side in a single DataFrame
 distributions_combined = pd.concat([pd.DataFrame()] + all_norm_copy, axis=1)

 # one histogram per distribution
 distributions_combined.hist(figsize=(15, 15), grid=False, bins=100)

 #Training the Model

 #scaling of my personality traits
 values = [85, 111, 78, 47, 74]


 def calc(x, max, scale=50):
     """Rescale a raw score onto ``[0, scale]``.

     Args:
         x: raw score.
         max: highest attainable raw score (name kept for compatibility even
             though it shadows the builtin inside this function).
         scale: upper end of the target range; defaults to 50.

     Returns:
         ``(x / max) * scale``.
     """
     return (x/max)*scale


 # The training features are converted to a [0, .5] scale, so the personal
 # scores must land on that same scale (not [0, 50]) before being fed to a
 # model fitted on those features.
 values = [calc(score, 120, 0.5) for score in values]
 values

 #splitting the dataset
 from sklearn.model_selection import train_test_split
 # hold out 10% of the rows for evaluation; fixed seed for reproducibility
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=11, shuffle=True)
 print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

 #Gaussian Naive Bayes
 import numpy as np
 from sklearn.naive_bayes import GaussianNB

 #creating the model
 clf = GaussianNB()

 #training the model
 # y_train is a single-column DataFrame; the estimator expects a 1-D target,
 # so flatten it instead of relying on an implicit conversion
 clf.fit(X_train, y_train.values.ravel())

 #evaluating the model
 print(clf.score(X_test, y_test.values.ravel()))

 #predicting my results
 # the model was fitted on a labelled frame, so the query row carries the
 # same column labels
 print(clf.predict(pd.DataFrame([values], columns=X_train.columns)))
	#Generating the Data

	#convert boxplots to mean, sd, alpha
	def features_creator(boxplots):
	    """Convert boxplot quartiles into (mean, sd, alpha) feature triples.

	    Every row of ``boxplots`` starts with ``Q1, Q2, Q3``. In the returned
	    row those three positions are replaced by:

	    * mean  -- the midpoint of the box, ``Q1 + (Q3 - Q1) / 2``
	    * sd    -- ``(Q3 - Q1) * 3 / 4`` (for a normal distribution the IQR is
	      about 1.35 standard deviations, which 3/4 approximates)
	    * alpha -- ``(Q3 - Q2) / (Q2 - Q1)``, the ratio of the upper to the
	      lower half of the box

	    Entries after the third, if any, are carried over unchanged.

	    Args:
	        boxplots: sequence of ``[Q1, Q2, Q3, ...]`` sequences.

	    Returns:
	        A new list of lists. ``boxplots`` and its rows are left untouched.

	    Raises:
	        ZeroDivisionError: if ``Q2 == Q1`` in any row.
	        IndexError: if a row has fewer than three entries.
	    """

	    def boxplot_to_normal(Q1, Q2, Q3):
	        box_height = Q3 - Q1
	        sd = box_height * (3/4)
	        mean = Q1 + box_height / 2
	        # NOTE(review): a symmetric box yields alpha == 1; if alpha is later
	        # used as a skew-normal shape parameter (where 0 means symmetric),
	        # confirm this mapping is intended.
	        alpha = (Q3 - Q2) / (Q2 - Q1)
	        return mean, sd, alpha

	    features = []
	    for quartiles in boxplots:
	        mean, sd, alpha = boxplot_to_normal(quartiles[0], quartiles[1], quartiles[2])
	        # work on a copy of the row so the caller's data is not overwritten
	        row = list(quartiles)
	        row[:3] = [mean, sd, alpha]
	        features.append(row)
	    return features

	# Boxplots measured in pixels from the .4 reference line; a change of .1 spans 206 pixels
	boxplots = [  # Q1, Q2, Q3 in pixels
	    # Griffindor: O, C, E, A, N
	    [-64, 21, 85], [-167, -83, -19], [-311, -206, -60], [-144, 22, 85], [-211, -186, -81],
	    # Hufflepuff: O, C, E, A, N
	    [-41, 42, 105], [-147, -83, -19], [-373, -228, -101], [-106, 22, 85], [-333, -206, -81],
	    # Ravenclow: O, C, E, A, N
	    [-64, 21, 85], [-147, -83, -19], [-351, -228, -123], [-44, 41, 126], [-333, -227, -101],
	    # Slytherin: O, C, E, A, N
	    [-188, -83, 23], [-188, -123, -60], [-373, -228, -141], [-64, 21, 85], [-352, -227, -101],
	]

	# convert the pixel offsets, in place, to real numbers on a [0, .5] scale
	for i in range(len(boxplots)):
	    for a in range(len(boxplots[i])):
	        boxplots[i][a] = (4+(boxplots[i][a]/206))/10

	# independent copy of the converted quartiles
	boxplots_copy = [x[:] for x in boxplots]
	# turn each (Q1, Q2, Q3) row into a (mean, sd, alpha) feature triple
	features = features_creator(boxplots)
	features

	#we create the distributions based on the means and standard deviations
	import pandas as pd
	import numpy as np
	from scipy.stats import skewnorm
	def algo(features, classes, size):
	    """Draw a labelled synthetic dataset from skew-normal feature triples.

	    ``features`` holds one ``(mean, sd, alpha)`` triple per (class, column)
	    pair, grouped by class: the first ``len(features) // len(classes)``
	    triples belong to ``classes[0]``, the next group to ``classes[1]``, and
	    so on. For every triple, ``size`` values are drawn with
	    ``skewnorm.rvs(alpha, size=size)`` and mapped through ``x * sd + mean``.

	    NOTE(review): ``mean`` and ``sd`` act as location and scale of the draw;
	    for ``alpha != 0`` the sample mean / standard deviation will differ from
	    them. Confirm this is the intended parameterisation.

	    Args:
	        features: sequence of ``(mean, sd, alpha)`` triples.
	        classes: sequence of class labels.
	        size: number of rows to generate per class.

	    Returns:
	        A tuple ``(X, y, all_norm)``:

	        * ``X`` -- DataFrame with ``size * len(classes)`` rows and
	          ``len(features) // len(classes)`` columns; the rows of class ``c``
	          occupy positions ``c*size`` to ``(c+1)*size - 1``. Both axes carry
	          a default integer index.
	        * ``y`` -- single-column DataFrame with the class label of each row
	          of ``X``.
	        * ``all_norm`` -- list with the raw 1-D sample array of every
	          triple, in the order of ``features``.

	    Raises:
	        ValueError: if ``len(features)`` is not a multiple of
	            ``len(classes)`` (the triples could not be split evenly).
	        ZeroDivisionError: if ``classes`` is empty.
	    """
	    # one array of `size` draws per (mean, sd, alpha) triple
	    all_norm = []
	    for values in features:
	        mean, sd, alpha = values[0], values[1], values[2]
	        draws = skewnorm.rvs(alpha, size=size)
	        all_norm.append(draws * sd + mean)

	    feature_n = len(features)
	    classes_n = len(classes)
	    if feature_n % classes_n:
	        raise ValueError(
	            "len(features) must be a multiple of len(classes), got %d and %d"
	            % (feature_n, classes_n)
	        )
	    column_n = feature_n // classes_n

	    # Fill one preallocated matrix: the band of rows for class c receives the
	    # c-th group of sample arrays, one array per column.
	    data = np.empty((size * classes_n, column_n))
	    for c in range(classes_n):
	        band = slice(c * size, (c + 1) * size)
	        for j in range(column_n):
	            data[band, j] = all_norm[c * column_n + j]
	    X = pd.DataFrame(data)

	    # each label repeated `size` times, in class order; dtype=object keeps
	    # the labels exactly as they were passed in
	    y = pd.DataFrame(np.repeat(np.asarray(classes, dtype=object), size))

	    return X, y, all_norm

	labels = ['Griffindor', 'Hufflepuff', 'Ravenclow', 'Slytherin']
	columns = ['Openness', 'Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism']

	# One single-item list per distribution, grouped by house and ordered
	# O, C, E, A, N within each house. Derived from `labels` and `columns` so the
	# plot titles cannot drift from the dataset column names.
	names = [[trait + ', ' + house] for house in labels for trait in columns]

	# each feature triple is (mean, sd, alpha)
	X, y, all_norm_copy = algo(features, labels, 1000000)

	# entire dataset
	X.reset_index(drop=True, inplace=True)
	X.columns = columns
	y.reset_index(drop=True, inplace=True)
	y.columns = ['House']
	# both indexes are reset first so the rows of X and y line up in the concat
	a = pd.concat([X, y], axis=1)
	a.head()

	#Graphing

	import matplotlib.pyplot as plt
	import seaborn as sns

	sns.pairplot(a, hue="House", height=5)

	# graphing all distributions

	# converting the list into a single DataFrame: each sample array becomes a
	# one-column frame titled from `names` and is appended as a new column
	distributions_combined = pd.DataFrame()
	for k in range(len(all_norm_copy)):
	    all_norm_copy[k] = pd.DataFrame(all_norm_copy[k])
	    all_norm_copy[k].columns = names[k]
	    distributions_combined = pd.concat([distributions_combined, all_norm_copy[k]], axis=1)

	# one histogram per distribution
	distributions_combined.hist(figsize=(15, 15), grid=False, bins=100)

	#Training the Model

	#scaling of my personality traits
	values = [85, 111, 78, 47, 74]


	def calc(x, max, scale=50):
	    """Rescale a raw score onto ``[0, scale]``.

	    Args:
	        x: raw score.
	        max: highest attainable raw score (name kept for compatibility even
	            though it shadows the builtin inside this function).
	        scale: upper end of the target range; defaults to 50.

	    Returns:
	        ``(x / max) * scale``.
	    """
	    return (x/max)*scale


	# The training features are converted to a [0, .5] scale, so the personal
	# scores must land on that same scale (not [0, 50]) before being fed to a
	# model fitted on those features.
	values = [calc(score, 120, 0.5) for score in values]
	values

	#splitting the dataset
	from sklearn.model_selection import train_test_split
	# hold out 10% of the rows for evaluation; fixed seed for reproducibility
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=11, shuffle=True)
	print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

	#Gaussian Naive Bayes
	import numpy as np
	from sklearn.naive_bayes import GaussianNB

	#creating the model
	clf = GaussianNB()

	#training the model
	# y_train is a single-column DataFrame; the estimator expects a 1-D target,
	# so flatten it instead of relying on an implicit conversion
	clf.fit(X_train, y_train.values.ravel())

	#evaluating the model
	print(clf.score(X_test, y_test.values.ravel()))

	#predicting my results
	# the model was fitted on a labelled frame, so the query row carries the
	# same column labels
	print(clf.predict(pd.DataFrame([values], columns=X_train.columns)))