
@blakewest
Created February 5, 2018 22:38
A starter implementation for logistic regression.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

def load_data(path):
    # Load the data into a Pandas dataframe.
    raw_data = pd.read_csv(path, header=0)
    print('Loaded data from', path)
    return raw_data

def show_samples(data, num_samples=5):
    print(data[:num_samples])
    print('')
    print('Count by target feature:')
    print(data[TARGET_FEATURE].value_counts())
    print('')

def get_usable_data(data, add_intercept=True, categorical_features=[]):
    # Separate the X and Y values.
    y_data = data[TARGET_FEATURE]
    x_data = data.drop(TARGET_FEATURE, axis=1)

    # To include an intercept, add a new column with a constant.
    if add_intercept:
        x_data['intercept'] = 1.0

    for feature in categorical_features:
        dummy_data = pd.get_dummies(x_data[feature], prefix=feature)
        # We need to remove at least one of the dummy features to avoid
        # perfect collinearity. It doesn't matter which one; to stay
        # consistent, we usually remove the most common value.
        most_common_value = x_data[feature].value_counts().index[0]
        dummy_to_exclude = feature + '_' + str(most_common_value)
        dummy_data_to_use = dummy_data.drop(dummy_to_exclude, axis=1)
        x_data[dummy_data_to_use.columns] = dummy_data_to_use
        # Remove the original feature, so just the dummy data remains.
        x_data = x_data.drop(feature, axis=1)

    return x_data, y_data

def evaluate(model, x_test):
    # Get prediction probabilities for the test set.
    y_predict_proba_raw = model.predict_proba(x_test)
    # The .predict_proba() function returns the probabilities of Y
    # equaling 0 and of Y equaling 1, so we keep just the probability
    # that Y equals 1.
    y_predict_proba = y_predict_proba_raw[:, 1]
    # Convert the probabilities to predictions of 1 or 0, based on
    # a threshold of 50%.
    y_predict = [0 if y < 0.5 else 1 for y in y_predict_proba]
    return y_predict

def get_confusion_matrix_stats(y_test, y_predict):
    # Get the confusion matrix and calculate the results.
    # sklearn lays the matrix out with actual classes as rows and
    # predicted classes as columns, so matrix[1][1] is the true positives.
    matrix = confusion_matrix(y_test, y_predict)
    n_samples = float(len(y_test))
    accuracy = float(matrix[0][0] + matrix[1][1]) / n_samples
    precision = matrix[1][1] / float(matrix[0][1] + matrix[1][1])
    recall = matrix[1][1] / float(matrix[1][0] + matrix[1][1])
    print('Accuracy: %.2f' % accuracy)
    print('Precision: %.2f' % precision)
    print('Recall: %.2f' % recall)
# Usage:
# The target feature is whether or not the employee left.
TARGET_FEATURE = 'left'
TEST_SET_SIZE = 0.2
CATEGORICAL_FEATURES = ['sales', 'salary']
raw_data = load_data('./HR_comma_sep.csv')
# It's helpful to take a quick look at the data.
show_samples(raw_data)
# Preprocess
x_data, y_data = get_usable_data(raw_data, categorical_features=CATEGORICAL_FEATURES)
# It's good to eyeball the processed features too.
print(x_data[:5])
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=TEST_SET_SIZE)
model = LogisticRegression().fit(x_train, y_train)
predictions = evaluate(model, x_test)
get_confusion_matrix_stats(y_test, predictions)
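
As an optional sanity check, it can help to look at the learned coefficients of the fitted model. This is a small sketch that is not part of the original gist; it assumes the `model` and `x_train` objects created above and uses scikit-learn's `coef_` attribute (shape `(1, n_features)` for binary classification).

# Optional: inspect which features the model weights most heavily.
# Assumes the fitted `model` and the `x_train` dataframe from above.
coefficients = pd.Series(model.coef_[0], index=x_train.columns).sort_values()
print(coefficients)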