Vincenzo Lavorini vlavorini

Solving PyRFR issue (SMAC or auto-sklearn)

If you already have a version of GCC newer than 7.2, you can skip this step. Otherwise, install it with: brew install gcc
Then make sure the symbolic links are created, i.e. /usr/local/bin/gcc -> gcc-7 and /usr/local/bin/g++ -> g++-7.
- I used GCC version 7, but a newer version will probably work too.
Tell pip to compile the package from source instead of using the pre-compiled binaries: pip install --force-reinstall --ignore-installed --no-binary :all: pyrfr

	def gaussian(d, bw):
	    """Evaluate a zero-mean Gaussian density with bandwidth ``bw`` at ``d``.

	    Args:
	        d: Distance(s) at which to evaluate the kernel; a scalar or a
	            NumPy array (evaluated element-wise).
	        bw: Bandwidth (standard deviation) of the kernel.

	    Returns:
	        ``exp(-0.5 * (d / bw) ** 2) / (bw * sqrt(2 * pi))``, with the same
	        shape as ``d``.
	    """
	    return np.exp(-0.5 * (d / bw) ** 2) / (bw * np.sqrt(2 * np.pi))
	def meanshift_vec(points):
	    """Shift every point to the kernel-weighted mean of all points.

	    Relies on ``poinc_dist_vec``, ``gaussian`` and the bandwidth ``sigma``
	    being available at module level.

	    Args:
	        points: Array with one point per row (assumed 2-D).

	    Returns:
	        Array with one shifted point per row of ``points``.
	    """
	    dists = poinc_dist_vec(points)  # the matrix of the distances
	    weights = gaussian(dists, sigma)  # the matrix of the weights
	    weighted_sum = np.dot(weights, points)  # the weighted vectors
	    # Row i of weighted_sum is sum_j weights[i, j] * points[j], so the
	    # matching normaliser is the sum of row i of the weights (axis 1).
	    # For a symmetric weight matrix this equals the column sum.
	    row_weight = np.sum(weights, axis=1, keepdims=True)
	    return weighted_sum / row_weight

	# Builds the ingredients of a batched pairwise distance between every row of
	# `points` and the rows points[n_begin:n_end].
	# Assumes `points` is a 2-D array (n_samples, n_dims) -- TODO confirm.
	# NOTE(review): this excerpt ends after the denominators are computed; no
	# return statement or final distance formula is visible, and the body is not
	# indented under the `def` line.
	def dist_batch(points, n_begin, n_end):
	# Append a trailing axis so the array can be repeated once per batch element.
	expd=np.expand_dims(points,2)
	tiled=np.tile(expd, n_end-n_begin) # repeat along the last axis only (n_end-n_begin) times
	selected=points[n_begin:n_end] # only a part (a batch) of the whole dataset
	# Transpose the batch so it broadcasts against the last two axes of `tiled`.
	trans=np.transpose(selected)
	diff=trans-tiled
	# Sum of squared coordinate differences, reduced over axis 1.
	num=np.sum(np.square(diff), axis=1)

	# One minus the squared norm of each row of `points`.
	den_sq_norm=1-np.sum(np.square(points),1)
	den_selected=den_sq_norm[n_begin:n_end] # the same quantity restricted to the batch

	def __shift(points, sigma, beg, end):
	    """Mean-shift the batch ``points[beg:end]`` against the whole dataset.

	    Args:
	        points: Array with one point per row.
	        sigma: Bandwidth passed to ``gaussian``.
	        beg: Index of the first row of the batch.
	        end: Index one past the last row of the batch.

	    Returns:
	        The shifted points, one row per batch element.
	    """
	    batch_dists = dist_batch(points, beg, end)  # distances for this batch only
	    kernel = gaussian(batch_dists, sigma)
	    # Column sums of the kernel normalise the rows of kernel.T @ points.
	    norm = np.expand_dims(np.sum(kernel, 0), 1)
	    weighted = np.dot(kernel.T, points)
	    return weighted / norm

	# Walks over the dataset in consecutive batches of `batches` rows.
	# NOTE(review): the excerpt is cut off after `last` is computed, so what is
	# done with each batch is not visible here. `n_samples` is not defined in this
	# snippet -- presumably the number of rows in `points`; confirm. The body is
	# also not indented under the `def` line.
	def meanshift_batch(points, batches):
	for i in range(0,n_samples,batches): # step through the data one batch at a time until the end is reached
	last=min(i+batches,n_samples) # end index of this batch, clamped to n_samples

	def __shift(points, sigma, beg, end):
	    """Mean-shift the batch ``points[beg:end]`` and report which rows it covers.

	    Args:
	        points: Array with one point per row.
	        sigma: Bandwidth passed to ``gaussian``.
	        beg: Index of the first row of the batch.
	        end: Index one past the last row of the batch.

	    Returns:
	        A tuple ``(shifted, beg, end)``: the shifted batch plus the start
	        and end indices it was computed for, so the caller can place the
	        result back into the full array.
	    """
	    batch_dists = dist_batch(points, beg, end)
	    kernel = gaussian(batch_dists, sigma)
	    # Column sums of the kernel normalise the rows of kernel.T @ points.
	    norm = np.expand_dims(np.sum(kernel, 0), 1)
	    shifted = np.dot(kernel.T, points) / norm
	    return shifted, beg, end

	# Submits one `__shift` call per batch to a pool of 4 worker threads.
	# NOTE(review): the `def` line below is missing its trailing ':' and the dict
	# comprehension that starts on the `future_shifting` line is cut off (no value,
	# no `for` clause, no closing brace), so this excerpt is not runnable as shown.
	# `sigma`, `n_samples` and `d_beg` are not defined in the visible part --
	# presumably module-level values and the comprehension's loop variable; confirm.
	def meanshift_parallel(points, batches)
	with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
	# Each key is the future for one batch, from d_beg up to the clamped batch end.
	future_shifting = {executor.submit(__shift, points, sigma, d_beg, min(d_beg+batches, n_samples)):

	import pandas as pd
	from sklearn_pandas import DataFrameMapper
	from sklearn.preprocessing import StandardScaler, Imputer, MinMaxScaler, QuantileTransformer

	# Load the training data.
	df = pd.read_csv("train.csv")

	# Count, per row (user), how many of the selected features are null.
	df["N/A"] = df[list(df_best_usrs_prv_dummy.columns)].isna().values.sum(axis=1)

	# Run the dimensionality reduction on the preprocessed feature matrix.
	embedding = umap.UMAP(n_neighbors=55).fit_transform(df_preprocessed.values)

	# Plot the 2-D embedding as a scatter plot.
	# The x and y coordinates must come from the same embedding array: column 0
	# and column 1 of `embedding`, as returned by the dimensionality reduction.
	plt.figure(figsize=(16, 6))
	plt.scatter(embedding[:, 0], embedding[:, 1], color='b', s=10,
	            alpha=.2, label="Customers", marker='o')

	plt.legend(fontsize=16, numpoints=3)
	plt.show()

	from numba import jit

	@jit
	def example_function(arguments):
	    """Placeholder showing where ``@jit`` goes; Numba compiles this function."""
	    ...

	# Numba-compiled variant of the batched distance helper.
	# NOTE(review): this excerpt ends after `trans` is computed; the rest of the
	# function (difference, reduction, return) is not visible, and the body is not
	# indented under the `def` line.
	@jit(nopython=True, nogil=True)
	def dist_batch(points, n_begin, n_end):
	# The two commented-out lines below are the np.expand_dims / np.tile version
	# that the explicit fill loop replaces -- presumably because those calls are
	# not usable in nopython mode; confirm.
	#expd=np.expand_dims(points,2)
	#tiled=np.tile(expd, n_end-n_begin) #we tile up only (n_end-n_begin) times
	# Allocate (rows, columns, batch size) and copy `points` into every slice
	# along the last axis.
	tiled = np.zeros((points.shape[0], points.shape[1], n_end-n_begin))
	for dim in range(n_end-n_begin):
	tiled[:,:,dim]=points

	selected=points[n_begin:n_end] # only a part (a batch) of the whole dataset
	trans=np.transpose(selected)

	import matplotlib.pyplot as plt

	def calc_beta_mode(a, b):
	    """Return the mode (peak) of the Beta(a, b) distribution.

	    Uses the closed form ``(a - 1) / (a + b - 2)``.

	    Args:
	        a: First shape parameter.
	        b: Second shape parameter.

	    Returns:
	        The value of ``(a - 1) / (a + b - 2)``.

	    Raises:
	        ZeroDivisionError: If ``a`` and ``b`` are plain numbers with
	            ``a + b == 2``.
	    """
	    return (a - 1) / (a + b - 2)

	def plot(betas, names, linf=0, lsup=0.01):
	'''this function plots the Beta distribution'''
	x=np.linspace(linf,lsup, 100)
	for f, name in zip(betas,names) :