Vincenzo Lavorini (vlavorini)
import umap
import matplotlib.pyplot as plt

# run the dimensionality reduction (the embedding is 2-D, UMAP's default)
embedding = umap.UMAP(n_neighbors=55).fit_transform(df_preprocessed.values)

# plot the results
plt.figure(figsize=(16, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], color='b', s=10,
            alpha=.2, label="Customers", marker='o')
plt.legend(fontsize=16, numpoints=3)
plt.show()
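# A minimal follow-up sketch, not from the original gist: once the mean-shift
# clustering below has assigned each customer to a cluster, the same 2-D
# embedding can be coloured by cluster. `labels` is a hypothetical integer
# array, one entry per row of df_preprocessed.
plt.figure(figsize=(16, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='tab10', s=10, alpha=.2)
plt.colorbar(label="cluster")
plt.show()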
import numpy as np
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, Imputer, MinMaxScaler, QuantileTransformer

# load the data
df = pd.read_csv("train.csv")

# count the null features per user (one missing-value count per row)
df["N/A"] = np.sum(df.isna().values, axis=1)
import concurrent.futures

def __shift(points, sigma, beg, end):
    dists = dist_batch(points, beg, end)
    weights = gaussian(dists, sigma)
    expd_w = np.dot(weights.T, points)
    summed_weight = np.sum(weights, 0)
    return expd_w / np.expand_dims(summed_weight, 1), beg, end  # also return where this batch belongs

def meanshift_parallel(points, batches):  # sigma is the global bandwidth, as in the other gists
    n_samples = points.shape[0]
    shifted = np.empty_like(points)
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # one future per batch of vectors
        future_shifting = {executor.submit(__shift, points, sigma, d_beg, min(d_beg + batches, n_samples)): d_beg
                           for d_beg in range(0, n_samples, batches)}
        for future in concurrent.futures.as_completed(future_shifting):  # collect batches as they finish
            block, beg, end = future.result()
            shifted[beg:end] = block
    return shifted
def __shift(points, sigma, beg, end):
    dists = dist_batch(points, beg, end)  # the batched distance function defined below
    weights = gaussian(dists, sigma)
    expd_w = np.dot(weights.T, points)
    summed_weight = np.sum(weights, 0)
    return expd_w / np.expand_dims(summed_weight, 1)  # the shifted points of this batch

def meanshift_batch(points, batches):  # sigma is again the global bandwidth
    n_samples = points.shape[0]
    shifted = np.empty_like(points)
    for i in range(0, n_samples, batches):  # loop over the batches until the end is reached
        last = min(i + batches, n_samples)  # index of the last vector of this batch
        shifted[i:last] = __shift(points, sigma, i, last)
    return shifted
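# A minimal driver sketch, not in the original gists: mean shift repeats the
# shifting step until the points stop moving. `max_iter` and `tol` are
# hypothetical stopping parameters.
pts = points.copy()
for _ in range(max_iter):
    new_pts = meanshift_batch(pts, batches)
    if np.max(np.abs(new_pts - pts)) < tol:  # converged: no point moved more than tol
        break
    pts = new_pts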
def dist_batch(points, n_begin, n_end):
    expd = np.expand_dims(points, 2)
    tiled = np.tile(expd, n_end - n_begin)  # we tile up only (n_end - n_begin) times
    selected = points[n_begin:n_end]  # we select only a part (a batch) of the whole dataset
    trans = np.transpose(selected)
    diff = trans - tiled
    num = np.sum(np.square(diff), axis=1)
    den_sq_norm = 1 - np.sum(np.square(points), 1)
    den_selected = den_sq_norm[n_begin:n_end]  # again, only the batch
    den = np.expand_dims(den_sq_norm, 1) * den_selected  # denominator, broadcast to (n_samples, batch)
    return np.arccosh(1 + 2 * num / den)  # Poincare distances of every point to the batch
def gaussian(d, bw):
    return np.exp(-0.5 * (d / bw) ** 2) / (bw * np.sqrt(2 * np.pi))
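# Quick sanity check, not in the original gist: at zero distance the kernel
# equals the peak of the normal density, 1/(bw*sqrt(2*pi)), roughly 0.3989 for
# bw=1, and it applies element-wise to arrays of distances.
print(gaussian(0.0, 1.0))                        # ~0.39894
print(gaussian(np.array([0.5, 1.0, 2.0]), 1.0))  # vectorized over distances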
def meanshift_vec(points):
    dists = poinc_dist_vec(points)  # the matrix of the distances
    weights = gaussian(dists, sigma)  # the matrix of the weights
    expd_w = np.dot(weights, points)  # the weighted vectors
    summed_weight = np.sum(weights, 0)  # the summed weights, used to normalize the weighted vectors
    shifted_pts = expd_w / np.expand_dims(summed_weight, 1)  # the normalized (shifted) vectors
    return shifted_pts
def poinc_dist_vec(points):
    n = num(points)  # numerator: matrix of squared norms of the differences
    d = den(points)  # denominator: matrix built from the squared norms of the points
    return np.arccosh(1 + 2 * n / d)
def den(points):
    sq_norm = 1 - np.sum(np.square(points), 1)  # subtract the squared norm of each vector from 1
    expd = np.expand_dims(sq_norm, 1)  # needed to obtain a correctly transposed version of the vector
    den_all = expd * expd.T  # multiply the object by its transpose
    return den_all
def num(points):
    expd = np.expand_dims(points, 2)  # need another dimension...
    tiled = np.tile(expd, points.shape[0])  # ...to tile up the vectors
    trans = np.transpose(points)  # transpose the points matrix so it broadcasts against the tiles
    diff = trans - tiled  # take the difference, exploiting NumPy broadcasting
    num = np.sum(np.square(diff), axis=1)  # and then obtain the squared norm of the difference
    return num
def _dist_poinc(a, b):
    # Poincare distance between two single points a and b
    num = np.dot(a - b, a - b)
    den1 = 1 - np.dot(a, a)
    den2 = 1 - np.dot(b, b)
    return np.arccosh(1 + 2 * num / (den1 * den2))

def dist_poinc(a, A):
    # Poincare distances from point a to every row of A
    res = np.empty(A.shape[0])
    for i, el in enumerate(A):
        res[i] = _dist_poinc(a, el)
    return res
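# A small cross-check sketch, not in the original gists: the vectorized matrix
# version and the pairwise loop should agree on random points drawn well inside
# the Poincare ball (norm < 1).
rng = np.random.default_rng(0)
pts = rng.uniform(-0.5, 0.5, size=(10, 2))  # every point has norm < 1
D = poinc_dist_vec(pts)  # full 10x10 distance matrix
assert np.allclose(D[0], dist_poinc(pts[0], pts))  # row 0 matches the pairwise loop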