Created
February 5, 2018 22:38
-
-
Save blakewest/c712e302bd0fe1d60d23bd26f7be40d0 to your computer and use it in GitHub Desktop.
A starter implementation for logistic regression.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.metrics import confusion_matrix | |
from sklearn.model_selection import train_test_split | |
def load_data(path):
    """Read the CSV file at ``path`` into a pandas DataFrame.

    The first row of the file is treated as the column header. A short
    confirmation message is printed once the file has been read.
    """
    frame = pd.read_csv(path, header=0)
    print('Loaded data from', path)
    return frame
def show_samples(data, num_samples=5, target_feature=None):
    """Print the first rows of ``data`` and the class balance of the target.

    Args:
        data: DataFrame to preview.
        num_samples: Number of leading rows to print.
        target_feature: Name of the target column whose value counts are
            printed. Defaults to the module-level ``TARGET_FEATURE``.
            The count section is skipped when ``data`` has no such column
            (for example a feature-only frame).
    """
    if target_feature is None:
        target_feature = TARGET_FEATURE

    # Preview the frame that was passed in rather than the module-level
    # ``raw_data``, so the function works for any DataFrame.
    print(data[:num_samples])
    print('')

    if target_feature in data.columns:
        print('Count by target feature:')
        print(data[target_feature].value_counts())
        print('')
def get_usable_data(data, add_intercept=True, categorical_features=None,
                    target_feature=None):
    """Split ``data`` into model-ready features and the target column.

    Args:
        data: DataFrame holding both the features and the target column.
            It is not modified; the returned feature frame is a copy.
        add_intercept: When true, add a constant ``'intercept'`` column
            of 1.0 to the features.
        categorical_features: Names of columns to replace with dummy
            (one-hot) columns. ``None`` means no categorical columns.
        target_feature: Name of the target column. Defaults to the
            module-level ``TARGET_FEATURE``.

    Returns:
        A ``(x_data, y_data)`` tuple: the feature DataFrame and the
        target Series.
    """
    if target_feature is None:
        target_feature = TARGET_FEATURE
    if categorical_features is None:
        categorical_features = []

    # Separate the X and Y values, reading from the frame that was passed
    # in rather than the module-level ``raw_data``.
    y_data = data[target_feature]
    x_data = data.drop(target_feature, axis=1)

    # To include an intercept, add a new column with a constant.
    if add_intercept:
        x_data['intercept'] = 1.0

    for feature in categorical_features:
        dummy_data = pd.get_dummies(x_data[feature], prefix=feature)
        # We need to remove at least one of the dummy features.
        # It doesn't matter which one. To stay consistent we
        # remove the most common value (value_counts sorts descending).
        most_common_value = x_data[feature].value_counts().index[0]
        dummy_to_exclude = feature + '_' + str(most_common_value)
        dummy_data_to_use = dummy_data.drop(dummy_to_exclude, axis=1)
        # Swap the original feature for its dummy columns, appended at
        # the end, so just the dummy data remains.
        x_data = pd.concat(
            [x_data.drop(feature, axis=1), dummy_data_to_use], axis=1)

    return x_data, y_data
def evaluate(model, x_test, threshold=0.5):
    """Return 0/1 predictions for ``x_test`` from a fitted classifier.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        x_test: Feature rows to score.
        threshold: Probability at or above which a row is predicted as 1.

    Returns:
        A list of 0/1 predictions, one per row of ``x_test``.
    """
    # .predict_proba() returns one probability column per class; column 1
    # is taken as P(Y == 1), which assumes a binary 0/1 target.
    y_predict_proba = model.predict_proba(x_test)[:, 1]

    # Scale the probabilities to predictions of 1 or 0. The result is
    # returned so the caller can score it.
    return [0 if proba < threshold else 1 for proba in y_predict_proba]
def get_confusion_matrix_stats(y_test, y_predict):
    """Print and return accuracy, precision and recall for 0/1 predictions.

    Args:
        y_test: True labels (0 or 1).
        y_predict: Predicted labels (0 or 1), aligned with ``y_test``.

    Returns:
        A ``(accuracy, precision, recall)`` tuple of floats. A ratio whose
        denominator is zero (e.g. precision with no predicted positives)
        is reported as 0.0.
    """
    def _ratio(numerator, denominator):
        # Guard against empty input / no positives instead of dividing by 0.
        return float(numerator) / float(denominator) if denominator else 0.0

    # Pin the label order so the matrix is always 2x2, even when only one
    # class shows up in the test split or in the predictions.
    matrix = confusion_matrix(y_test, y_predict, labels=[0, 1])
    true_neg, false_pos = matrix[0]
    false_neg, true_pos = matrix[1]

    accuracy = _ratio(true_neg + true_pos, len(y_test))
    precision = _ratio(true_pos, false_pos + true_pos)
    recall = _ratio(true_pos, false_neg + true_pos)

    print('Accuracy: %.2f' % accuracy)
    print('Precision: %.2f' % precision)
    print('Recall: %.2f' % recall)
    return accuracy, precision, recall
# Usage:

# The target feature is whether or not the employee left.
TARGET_FEATURE = 'left'
TEST_SET_SIZE = 0.2
CATEGORICAL_FEATURES = ['sales', 'salary']

raw_data = load_data('./HR_comma_sep.csv')

# It's helpful to take a quick look at the data.
show_samples(raw_data)

# Preprocess
x_data, y_data = get_usable_data(raw_data, categorical_features=CATEGORICAL_FEATURES)

# Good to eye-ball the processed data too. The target column has been
# split off into y_data, so preview the feature frame directly.
print(x_data.head())
print('')

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=TEST_SET_SIZE)
model = LogisticRegression().fit(x_train, y_train)
predictions = evaluate(model, x_test)

# Report accuracy / precision / recall on the held-out split.
get_confusion_matrix_stats(y_test, predictions)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment