import umap
import matplotlib.pyplot as plt

# run the dimensionality reduction on the preprocessed data
embedding = umap.UMAP(n_neighbors=55).fit_transform(df_preprocessed.values)

# plot the resulting two-dimensional embedding
plt.figure(figsize=(16, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], color='b', s=10,
            alpha=.2, label="Customers", marker='o')
plt.legend(fontsize=16, numpoints=3)
plt.show()
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, Imputer, MinMaxScaler, QuantileTransformer

# loading the data
df = pd.read_csv("train.csv")

# counting the null features per user
df["N/A"] = np.sum(df.isna().values, axis=1)
import concurrent.futures

def __shift(points, sigma, beg, end):
    dists = dist_batch(points, beg, end)        # distances of every point to the current batch
    weights = gaussian(dists, sigma)            # Gaussian kernel weights
    expd_w = np.dot(weights.T, points)          # weighted sum of the points for each batch element
    summed_weight = np.sum(weights, 0)          # normalization factors
    # we also return the positions of the starting and ending vectors
    return expd_w / np.expand_dims(summed_weight, 1), beg, end

def meanshift_parallel(points, batches):
    # sigma and n_samples are assumed to be defined globally
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        future_shifting = {executor.submit(__shift, points, sigma, d_beg, min(d_beg + batches, n_samples)):
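                           # NOTE: the embedded snippet is truncated at this point; the lines below are an
                           # assumed continuation (key each future on its batch offset, then gather the
                           # results), not necessarily the author's original code.
                           d_beg for d_beg in range(0, n_samples, batches)}
        shifted = np.empty_like(points)
        for future in concurrent.futures.as_completed(future_shifting):
            batch_shifted, beg, end = future.result()   # __shift returns the batch plus its bounds
            shifted[beg:end] = batch_shifted            # put the shifted batch back in place
    return shifted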
def __shift(points, sigma, beg, end):
    dists = dist_batch(points, beg, end)        # the batched distance function defined below
    weights = gaussian(dists, sigma)            # Gaussian kernel weights
    expd_w = np.dot(weights.T, points)          # weighted sum of the points for each batch element
    summed_weight = np.sum(weights, 0)          # normalization factors
    return expd_w / np.expand_dims(summed_weight, 1)   # the shifted points of the batch

def meanshift_batch(points, batches):
    # sigma and n_samples are assumed to be defined globally
    for i in range(0, n_samples, batches):      # loop batch by batch until the end of the dataset
        last = min(i + batches, n_samples)      # index of the last vector of the current batch
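        # NOTE: the embedded snippet is truncated at this point; the remaining lines are an
        # assumed completion of the loop, not necessarily the author's original code.
        if i == 0:
            shifted = np.empty_like(points)     # output buffer, allocated on the first batch
        shifted[i:last] = __shift(points, sigma, i, last)   # shift the current batch
    return shifted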
def dist_batch(points, n_begin, n_end):
    expd = np.expand_dims(points, 2)
    tiled = np.tile(expd, n_end - n_begin)      # we tile up only (n_end - n_begin) times
    selected = points[n_begin:n_end]            # we select only a part (a batch) of the whole dataset
    trans = np.transpose(selected)
    diff = trans - tiled
    num = np.sum(np.square(diff), axis=1)
    den_sq_norm = 1 - np.sum(np.square(points), 1)
    den_selected = den_sq_norm[n_begin:n_end]   # we select only a part (a batch) of the whole dataset
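    # NOTE: the embedded snippet is truncated at this point; the lines below are an assumed
    # completion, mirroring the full-matrix num()/den()/poinc_dist_vec() functions shown further down.
    den = np.expand_dims(den_sq_norm, 1) * den_selected   # pairwise products of the (1 - squared norm) factors
    return np.arccosh(1 + 2 * num / den)                  # Poincaré distance of every point to the batch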
def gaussian(d, bw):
    return np.exp(-0.5 * (d / bw)**2) / (bw * np.sqrt(2 * np.pi))

def meanshift_vec(points):
    dists = poinc_dist_vec(points)              # the matrix of the distances
    weights = gaussian(dists, sigma)            # the matrix of the weights
    expd_w = np.dot(weights, points)            # the weighted vectors
    summed_weight = np.sum(weights, 0)          # the array of the summed weights, to normalize the weighted vectors
    shifted_pts = expd_w / np.expand_dims(summed_weight, 1)   # the normalized vectors
    return shifted_pts
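A minimal usage sketch, assuming a bandwidth sigma, a point set pts lying inside the unit Poincaré disk, and a stopping tolerance that are not part of the original snippets: the shift is applied repeatedly until the points stop moving.

sigma = 0.1                                     # assumed kernel bandwidth
pts = np.copy(points)                           # points assumed to lie inside the unit Poincaré disk
for _ in range(50):                             # cap on the number of mean shift iterations
    new_pts = meanshift_vec(pts)
    if np.max(np.abs(new_pts - pts)) < 1e-5:    # stop when the largest move becomes negligible
        break
    pts = new_pts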
def poinc_dist_vec(points):
    numerator = num(points)        # squared norms of all the pairwise differences
    denominator = den(points)      # pairwise products of the (1 - squared norm) factors
    return np.arccosh(1 + 2 * numerator / denominator)
def den(points):
    sq_norm = 1 - np.sum(np.square(points), 1)  # subtracting the squared norm of each vector from 1
    expd = np.expand_dims(sq_norm, 1)           # needed to obtain a correctly transposed version of the vector
    den_all = expd * expd.T                     # multiply the column vector by its transpose
    return den_all
def num(points):
    expd = np.expand_dims(points, 2)            # we need another dimension...
    tiled = np.tile(expd, points.shape[0])      # ...to tile up the vectors
    trans = np.transpose(points)                # transpose the points matrix so that broadcasting works
    diff = trans - tiled                        # take the difference, exploiting NumPy broadcasting
    num = np.sum(np.square(diff), axis=1)       # and then obtain the squared norms of the differences
    return num
def _dist_poinc(a, b):
    num = np.dot(a - b, a - b)
    den1 = 1 - np.dot(a, a)
    den2 = 1 - np.dot(b, b)
    return np.arccosh(1 + 2 * num / (den1 * den2))

def dist_poinc(a, A):
    res = np.empty(A.shape[0])
    for i, el in enumerate(A):
        res[i] = _dist_poinc(a, el)
    return res
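A small self-contained check, assuming nothing beyond NumPy and the functions above: the looped distances from one point should match the corresponding row of the vectorized distance matrix.

rng = np.random.RandomState(0)
A = rng.uniform(-0.5, 0.5, size=(5, 2))        # a handful of points inside the unit Poincaré disk
row = dist_poinc(A[0], A)                      # distances from the first point, computed in a loop
matrix = poinc_dist_vec(A)                     # full distance matrix, computed in a vectorized way
print(np.allclose(row, matrix[0]))             # the two implementations should agree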