Andrea D'Agostino andrea-dagostino

Data scientist. Founder of diariodiunanalista.it and writer @ Medium

andrea-dagostino / overfitting_example_eng1.py

Created August 23, 2022 12:14

	import pandas as pd

	df = pd.read_csv('wineQualityReds.csv') # download dataset -> https://www.kaggle.com/datasets/piyushgoyal443/red-wine-dataset

	# since the dataset contains the target variable in a range between 3 and 8, we map the values to the range 0 to 5.
	quality_mapping = {
	3: 0,
	4: 1,
	5: 2,
	6: 3,

andrea-dagostino / overfitting_example_ita2.py

Created August 22, 2022 11:17

overfitting_example_ita

	train_accs = []
	test_accs = []

	cols = [
	'fixed.acidity', 'volatile.acidity', 'citric.acid','residual.sugar', 'chlorides', 'free.sulfur.dioxide',
	'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol',
	]

	# start a loop that varies the max depth value from 1 to 24 (range(1, 25) excludes 25)
	for depth in range(1, 25):

andrea-dagostino / overfitting_example_ita.py

Created August 22, 2022 10:45

overfitting_example

	import pandas as pd

	df = pd.read_csv('wineQualityReds.csv') # download dataset -> https://www.kaggle.com/datasets/piyushgoyal443/red-wine-dataset

	# since the dataset only contains the quality values 3 to 8, we remap them to the numbers 0 to 5.
	quality_mapping = {
	3: 0,
	4: 1,
	5: 2,
	6: 3,

andrea-dagostino / ts_clustering_ita1.py

Last active August 6, 2022 17:42

ts_clustering

	# Cut the closing-price series into windows, then split every window into
	# a leading part (collected in A) and a trailing part (collected in B).
	N = 15 # window size --> we can change this parameter to experiment
	K = 0.70 # split size --> 70% of the data is in A, 30% in B

	SEQS = split_time_series(list(data['Close'].values), N) # creates sequences of length N

	SPLIT_SEQS = split_sequences(SEQS, K) # splits the sequences in two

	# first element of every split goes to A, second element to B
	A = [seq[0] for seq in SPLIT_SEQS]
	B = [seq[1] for seq in SPLIT_SEQS]

andrea-dagostino / ts_clustering_eng_trends.py

Last active August 1, 2022 22:42

ts_clustering

	def classify_trend(b, threshold=0.05):
	"""
	Classify the trend of a vector
	"""
	# compute slope
	slope = np.mean(np.diff(b) / np.diff(np.arange(len(b))))
	# if slope is positive, the trend is upward
	if slope + (slope * threshold) > 0:
	return 1
	# if slope is negative, the trend is downward

andrea-dagostino / ts_clustering_eng_groups.py

Created August 1, 2022 21:18

ts_clustering

	# populate G: for every index i of S, list the other indices j whose entry
	# S[i, j] is below THRESHOLD
	G = {}
	THRESHOLD = 6 # arbitrary value - tweak this to get different results

	for i in range(len(S)):
	    # The earlier version also tested `(i, j) not in G`, `(j, i) not in G`
	    # and `j not in G[i]`. G is keyed by integers and G[i] starts empty with
	    # each j visited once, so those tests were always true and are dropped.
	    # No pair de-duplication happens: j is stored in G[i] whenever S[i, j]
	    # qualifies, independently of whether i is stored in G[j].
	    G[i] = [j for j in range(len(S)) if S[i, j] < THRESHOLD and i != j]

andrea-dagostino / ts_clustering_eng_heatmap.py

Last active August 1, 2022 19:02

ts_clustering

	# plot heatmap of S
	# One 20x10 figure with a single axes that the heatmap is drawn onto.
	fig, ax = plt.subplots(figsize=(20, 10))
	# square=True asks seaborn for square cells; colours come from the
	# reversed 'nipy_spectral' colormap.
	sns.heatmap(S, cmap='nipy_spectral_r', square=True, ax=ax)
	plt.title("Heatmap of sequence similarities", fontsize=20, fontweight='bold')
	# Put a tick at every integer position 0..len(A)-1 on both axes and label
	# it with that same index.
	# NOTE(review): presumably S has one row/column per element of A - confirm.
	plt.xticks(range(len(A)), range(len(A)))
	plt.yticks(range(len(A)), range(len(A)))
	plt.show()

andrea-dagostino / ts_clustering_similarity_eng.py

Last active August 1, 2022 13:26

ts_clustering

	def compute_correlation(a1, a2):
	    """
	    Return the correlation coefficient between the vectors a1 and a2.
	    The value is read from the off-diagonal entry of the 2x2 matrix
	    produced by np.corrcoef(a1, a2).
	    """
	    corr_matrix = np.corrcoef(a1, a2)
	    # entry [0, 1] is the coefficient of a1 against a2
	    return corr_matrix[0, 1]

	def compute_dynamic_time_warping(a1, a2):
	"""
	Compute the dynamic time warping between two sequences
	"""

andrea-dagostino / ts_clustering_eng4.py

Last active August 1, 2022 18:58

ts_clustering

	# Cut the closing-price series into windows, then split every window into
	# a leading part (collected in A) and a trailing part (collected in B).
	N = 15 # window size --> we can tweak it and test different options
	K = 0.70 # split size --> 70% of the data is in A, 30% in B

	SEQS = split_time_series(list(data['Close'].values), N) # creates sequences of length N

	SPLIT_SEQS = split_sequences(SEQS, K) # splits each sequence into two parts

	# first element of every split goes to A, second element to B
	A = [seq[0] for seq in SPLIT_SEQS]
	B = [seq[1] for seq in SPLIT_SEQS]

andrea-dagostino / ts_clustering_eng3.py

Last active August 1, 2022 09:45

ts_clustering

	def split_sequence(sequence, k):
	    """
	    Cut a sequence into a head and a tail.
	    k is the fraction of the sequence that goes into the head: the cut
	    index is int(len(sequence) * k). Returns the tuple (head, tail), both
	    converted to NumPy arrays.
	    """
	    cut = int(len(sequence) * k)
	    head = np.array(sequence[:cut])
	    tail = np.array(sequence[cut:])
	    return head, tail


	def split_sequences(sequences, k=0.80):
	"""
	Applies split_sequence on all elements of a list or array