Vincenzo Lavorini (vlavorini)
import umap
import matplotlib.pyplot as plt

# run the dimensionality reduction (the embedding is 2-D, UMAP's default)
embedding = umap.UMAP(n_neighbors=55).fit_transform(df_preprocessed.values)

# plot the results
plt.figure(figsize=(16, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], color='b', s=10,
            alpha=.2, label="Customers", marker='o')
plt.legend(fontsize=16, numpoints=3)
plt.show()
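# A minimal follow-up sketch, not from the original gist: once the mean-shift
# clustering below has assigned each customer to a cluster, the same 2-D
# embedding can be coloured by cluster. `labels` is a hypothetical integer
# array, one entry per row of df_preprocessed.
plt.figure(figsize=(16, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='tab10', s=10, alpha=.2)
plt.colorbar(label="cluster")
plt.show()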
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, Imputer, MinMaxScaler, QuantileTransformer

# load the data
df = pd.read_csv("train.csv")

# count the null features per user (one missing-value count per row)
df["N/A"] = np.sum(df.isna().values, axis=1)
import concurrent.futures

def __shift(points, sigma, beg, end):
    dists = dist_batch(points, beg, end)
    weights = gaussian(dists, sigma)
    expd_w = np.dot(weights.T, points)
    summed_weight = np.sum(weights, 0)
    return expd_w / np.expand_dims(summed_weight, 1), beg, end  # also return where this batch belongs

def meanshift_parallel(points, batches):  # sigma is the global bandwidth, as in the other gists
    n_samples = points.shape[0]
    shifted = np.empty_like(points)
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # one future per batch of vectors
        future_shifting = {executor.submit(__shift, points, sigma, d_beg, min(d_beg + batches, n_samples)): d_beg
                           for d_beg in range(0, n_samples, batches)}
        for future in concurrent.futures.as_completed(future_shifting):  # collect batches as they finish
            block, beg, end = future.result()
            shifted[beg:end] = block
    return shifted
def __shift(points, sigma, beg, end):
    dists = dist_batch(points, beg, end)  # the batched distance function defined below
    weights = gaussian(dists, sigma)
    expd_w = np.dot(weights.T, points)
    summed_weight = np.sum(weights, 0)
    return expd_w / np.expand_dims(summed_weight, 1)  # the shifted points of this batch

def meanshift_batch(points, batches):  # sigma is again the global bandwidth
    n_samples = points.shape[0]
    shifted = np.empty_like(points)
    for i in range(0, n_samples, batches):  # loop over the batches until the end is reached
        last = min(i + batches, n_samples)  # index of the last vector of this batch
        shifted[i:last] = __shift(points, sigma, i, last)
    return shifted
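# A minimal driver sketch, not in the original gists: mean shift repeats the
# shifting step until the points stop moving. `max_iter` and `tol` are
# hypothetical stopping parameters.
pts = points.copy()
for _ in range(max_iter):
    new_pts = meanshift_batch(pts, batches)
    if np.max(np.abs(new_pts - pts)) < tol:  # converged: no point moved more than tol
        break
    pts = new_pts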
def dist_batch(points, n_begin, n_end):
    expd = np.expand_dims(points, 2)
    tiled = np.tile(expd, n_end - n_begin)  # we tile up only (n_end - n_begin) times
    selected = points[n_begin:n_end]  # we select only a part (a batch) of the whole dataset
    trans = np.transpose(selected)
    diff = trans - tiled
    num = np.sum(np.square(diff), axis=1)
    den_sq_norm = 1 - np.sum(np.square(points), 1)
    den_selected = den_sq_norm[n_begin:n_end]  # again, only the batch
    den = np.expand_dims(den_sq_norm, 1) * den_selected  # denominator, broadcast to (n_samples, batch)
    return np.arccosh(1 + 2 * num / den)  # Poincare distances of every point to the batch
def gaussian(d, bw):
    return np.exp(-0.5 * (d / bw) ** 2) / (bw * np.sqrt(2 * np.pi))
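# Quick sanity check, not in the original gist: at zero distance the kernel
# equals the peak of the normal density, 1/(bw*sqrt(2*pi)), roughly 0.3989 for
# bw=1, and it applies element-wise to arrays of distances.
print(gaussian(0.0, 1.0))                        # ~0.39894
print(gaussian(np.array([0.5, 1.0, 2.0]), 1.0))  # vectorized over distances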
def meanshift_vec(points):
    dists = poinc_dist_vec(points)  # the matrix of the distances
    weights = gaussian(dists, sigma)  # the matrix of the weights
    expd_w = np.dot(weights, points)  # the weighted vectors
    summed_weight = np.sum(weights, 0)  # the summed weights, used to normalize the weighted vectors
    shifted_pts = expd_w / np.expand_dims(summed_weight, 1)  # the normalized (shifted) vectors
    return shifted_pts
def poinc_dist_vec(points):
    n = num(points)  # numerator: matrix of squared norms of the differences
    d = den(points)  # denominator: matrix built from the squared norms of the points
    return np.arccosh(1 + 2 * n / d)
def den(points):
    sq_norm = 1 - np.sum(np.square(points), 1)  # subtract the squared norm of each vector from 1
    expd = np.expand_dims(sq_norm, 1)  # needed to obtain a correctly transposed version of the vector
    den_all = expd * expd.T  # multiply the object by its transpose
    return den_all
def num(points):
    expd = np.expand_dims(points, 2)  # need another dimension...
    tiled = np.tile(expd, points.shape[0])  # ...to tile up the vectors
    trans = np.transpose(points)  # transpose the points matrix so it broadcasts against the tiles
    diff = trans - tiled  # take the difference, exploiting NumPy broadcasting
    num = np.sum(np.square(diff), axis=1)  # and then obtain the squared norm of the difference
    return num
def _dist_poinc(a, b):
    # Poincare distance between two single points a and b
    num = np.dot(a - b, a - b)
    den1 = 1 - np.dot(a, a)
    den2 = 1 - np.dot(b, b)
    return np.arccosh(1 + 2 * num / (den1 * den2))

def dist_poinc(a, A):
    # Poincare distances from point a to every row of A
    res = np.empty(A.shape[0])
    for i, el in enumerate(A):
        res[i] = _dist_poinc(a, el)
    return res
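# A small cross-check sketch, not in the original gists: the vectorized matrix
# version and the pairwise loop should agree on random points drawn well inside
# the Poincare ball (norm < 1).
rng = np.random.default_rng(0)
pts = rng.uniform(-0.5, 0.5, size=(10, 2))  # every point has norm < 1
D = poinc_dist_vec(pts)  # full 10x10 distance matrix
assert np.allclose(D[0], dist_poinc(pts[0], pts))  # row 0 matches the pairwise loop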