Skip to content

Instantly share code, notes, and snippets.

@andrea-dagostino
Created August 23, 2022 12:14
Show Gist options
  • Select an option

  • Save andrea-dagostino/d0608b8273bc2cdcc418ab43c8e5d97a to your computer and use it in GitHub Desktop.

Select an option

Save andrea-dagostino/d0608b8273bc2cdcc418ab43c8e5d97a to your computer and use it in GitHub Desktop.
import pandas as pd
# Load the red-wine quality dataset.
# Download it from https://www.kaggle.com/datasets/piyushgoyal443/red-wine-dataset
df = pd.read_csv('wineQualityReds.csv')

# The target column "quality" takes values between 3 and 8; remap them to the
# contiguous class labels 0..5.
quality_mapping = {
    3: 0,
    4: 1,
    5: 2,
    6: 3,
    7: 4,
    8: 5,
}
df['quality'] = df['quality'].map(quality_mapping)

# Series.map yields NaN for any value missing from the mapping; fail here with
# a clear message instead of letting NaN labels reach the model.
if df['quality'].isna().any():
    raise ValueError("'quality' contains values outside the expected 3-8 range")

# Shuffle the rows (frac=1 samples every row) and reset the index.
# A fixed seed keeps the split, and therefore the reported scores, reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into training and test sets: the first 1000 shuffled rows are used for
# training and all remaining rows for testing (599 rows for the expected
# 1599-row file). Slicing at a single cut point guarantees the two sets never
# overlap and no row is dropped, whatever the actual row count is.
n_train = 1000
df_train = df.iloc[:n_train]
df_test = df.iloc[n_train:]
# now we train a decision tree on the columns of interest
from sklearn import tree
from sklearn import metrics
# Feature columns fed to the model (every predictor except the target).
cols = [
    'fixed.acidity',
    'volatile.acidity',
    'citric.acid',
    'residual.sugar',
    'chlorides',
    'free.sulfur.dioxide',
    'total.sulfur.dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Train a shallow decision tree (at most 3 levels deep).
# random_state is fixed because scikit-learn permutes the features at every
# split, so ties between equally good splits could otherwise produce a
# different tree on each run.
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(df_train[cols], df_train['quality'])

# Predict on both the data the tree was fitted on and the held-out rows.
train_predictions = clf.predict(df_train[cols])
test_predictions = clf.predict(df_test[cols])

# Accuracy = fraction of rows whose predicted class equals the true class.
train_accuracy = metrics.accuracy_score(df_train['quality'], train_predictions)
test_accuracy = metrics.accuracy_score(df_test['quality'], test_predictions)

print(f"Train accuracy: {round(train_accuracy, 3)}")
print(f"Test accuracy: {round(test_accuracy, 3)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment