Skip to content

Instantly share code, notes, and snippets.

@kwinkunks
Created June 17, 2022 18:16
Show Gist options
  • Save kwinkunks/d35a7668cec471ca7e3640639cd77dbb to your computer and use it in GitHub Desktop.
Save kwinkunks/d35a7668cec471ca7e3640639cd77dbb to your computer and use it in GitHub Desktop.
Why does adding features sometimes make a worse classifier?
import numpy as np

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Experiment: does adding features ever make the classifier worse?
# For each random seed, train a logistic regression on the first
# n_features columns of the iris data (n_features = 1..4), with noise
# injected into the training set, and record the weighted F1 score.
# Count how many seeds produce a score sequence that is NOT
# monotonically increasing as features are added.
iris = datasets.load_iris()

count = 0
for seed in range(0, 200):
    scores = []
    for n_features in range(1, 5):
        X_train, X_test, y_train, y_test = train_test_split(
            iris.data[:, :n_features],
            iris.target,
            random_state=seed,
        )

        # Make noisy data that will train an imperfect model.
        # Noise is added to the (scaled) training set only; the test
        # set is left clean so scores reflect generalization.
        rng = np.random.default_rng(seed)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train) + rng.normal(size=X_train.shape)
        X_test = scaler.transform(X_test)

        # Fit a model with whatever hyperparameters.
        clf = LogisticRegression(penalty='l1', solver='liblinear', C=0.01)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        scores.append(f1_score(y_test, y_pred, average='weighted'))

    # Count the occasions on which scores do not monotonically increase.
    if any(np.diff(scores) < 0):
        count += 1

# For L1 regularization and small C, count is 0.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment