Skip to content

Instantly share code, notes, and snippets.

View andrea-dagostino's full-sized avatar

Andrea D'Agostino andrea-dagostino

View GitHub Profile
import pandas as pd
df = pd.read_csv('wineQualityReds.csv') # download dataset -> https://www.kaggle.com/datasets/piyushgoyal443/red-wine-dataset
# the target variable only takes values from 3 to 8, so we remap it to consecutive zero-based labels (3 -> 0, 4 -> 1, ...).
quality_mapping = {
3: 0,
4: 1,
5: 2,
6: 3,
@andrea-dagostino
andrea-dagostino / overfitting_example_ita2.py
Created August 22, 2022 11:17
overfitting_example_ita
# Accuracy histories, one list for the training split and one for the test
# split (presumably appended to by the max-depth loop that follows).
train_accs, test_accs = [], []

# Names of the dataset columns selected here (presumably the model's input
# features -- confirm against the training code).
cols = [
    'fixed.acidity',
    'volatile.acidity',
    'citric.acid',
    'residual.sugar',
    'chlorides',
    'free.sulfur.dioxide',
    'total.sulfur.dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]
# loop over the max depth value, from 1 to 24 (range(1, 25) excludes the upper bound)
for depth in range(1, 25):
import pandas as pd
df = pd.read_csv('wineQualityReds.csv') # download dataset -> https://www.kaggle.com/datasets/piyushgoyal443/red-wine-dataset
# since the dataset only contains the values from 3 to 8, we remap them to consecutive zero-based labels (3 -> 0, 4 -> 1, ...).
quality_mapping = {
3: 0,
4: 1,
5: 2,
6: 3,
# Build fixed-length windows from the 'Close' column and split each window
# into two parts. split_time_series and split_sequences are helpers defined
# elsewhere in the file.
N = 15 # window size --> tweak this parameter to experiment
K = 0.70 # split size --> 70% of the data is in A, 30% in B
SEQS = split_time_series(list(data['Close'].values), N) # creates sequences of length N
SPLIT_SEQS = split_sequences(SEQS, K) # splits each sequence in two
# A holds the first part of every split sequence, B the second part.
A = [seq[0] for seq in SPLIT_SEQS]
B = [seq[1] for seq in SPLIT_SEQS]
# Populate G: an adjacency mapping from each row index of S to the list of
# other indices j whose entry S[i, j] is below THRESHOLD.
# NOTE(review): S is defined elsewhere; it is indexed as a square 2-D array
# here -- presumably a pairwise distance matrix, confirm against its producer.
G = {}
THRESHOLD = 6 # arbitrary value - tweak this to get different results
for i in range(len(S)):
    # The original condition also tested `(i, j) not in G`, `(j, i) not in G`
    # and `j not in G[i]`. G is keyed by ints, so the tuple checks were always
    # true, and each j is visited once per i, so the last check was too.
    # Dropping them leaves the resulting G unchanged.
    G[i] = [j for j in range(len(S)) if j != i and S[i, j] < THRESHOLD]
# Draw S as a square-celled heatmap on a wide figure, then label both axes
# with one integer tick per element of A.
tick_positions = range(len(A))
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(S, ax=ax, square=True, cmap='nipy_spectral_r')
plt.title("Heatmap of sequence similarities", fontweight='bold', fontsize=20)
# The same range serves as both the tick locations and the tick labels.
plt.xticks(tick_positions, tick_positions)
plt.yticks(tick_positions, tick_positions)
plt.show()
def compute_correlation(a1, a2):
    """
    Calculate the Pearson correlation coefficient between two vectors.

    Parameters
    ----------
    a1, a2 : array-like
        The two 1-D sequences of observations to compare; they must have
        the same length for np.corrcoef to accept them.

    Returns
    -------
    float
        The off-diagonal entry [0, 1] of the 2x2 correlation matrix that
        np.corrcoef builds from the two inputs.
    """
    return np.corrcoef(a1, a2)[0, 1]
def compute_dynamic_time_warping(a1, a2):
"""
Compute the dynamic time warping between two sequences
"""
# Build fixed-length windows from the 'Close' column and split each window
# into two parts. split_time_series is a helper defined elsewhere in the file.
N = 15 # window size --> we can tweak it and test different options
K = 0.70 # split size --> 70% of the data is in A, 30% in B
SEQS = split_time_series(list(data['Close'].values), N) # creates sequences of length N
SPLIT_SEQS = split_sequences(SEQS, K) # splits each sequence in two
# A holds the first part of every split sequence, B the second part.
A = [seq[0] for seq in SPLIT_SEQS]
B = [seq[1] for seq in SPLIT_SEQS]
def split_sequence(sequence, k):
    """
    Split a sequence in two at the fraction k of its length.

    Parameters
    ----------
    sequence : sliceable sequence with a length (e.g. list or array)
        The values to split.
    k : float
        Fraction of the sequence that goes into the first part. The cut
        index is int(len(sequence) * k), i.e. truncated toward zero.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The first `cut` elements and the remaining elements, each wrapped
        in a NumPy array.
    """
    # Compute the cut index once so both halves use the same boundary.
    cut = int(len(sequence) * k)
    return np.array(sequence[:cut]), np.array(sequence[cut:])
def split_sequences(sequences, k=0.80):
"""
Applies split_sequence on all elements of a list or array