blakewest · February 5, 2018 22:38
diff --git a/logistic_regression_example.py b/logistic_regression_example.py
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import confusion_matrix
 from sklearn.model_selection import train_test_split

 def load_data(path):
     """Read the CSV file at ``path`` into a pandas DataFrame.

     The first row of the file is used as the column header. A short
     confirmation message is printed once the file has been read.
     """
     frame = pd.read_csv(path, header=0)
     print('Loaded data from', path)
     return frame

 def show_samples(data, num_samples=5, target_feature=None):
     """Print a quick preview of ``data``.

     Prints the first ``num_samples`` rows, followed by the number of rows
     for each distinct value of the target column.

     Args:
         data: DataFrame to preview; it must contain the target column.
         num_samples: How many leading rows to print.
         target_feature: Name of the target column. When omitted, the
             module-level ``TARGET_FEATURE`` is used.
     """
     if target_feature is None:
         target_feature = TARGET_FEATURE

     # Preview the frame that was passed in, not the global ``raw_data``,
     # so the helper works for whichever DataFrame the caller supplies.
     print(data.head(num_samples))
     print('')
     print('Count by target feature:')
     print(data[target_feature].value_counts())
     print('')
    
 def get_usable_data(data, add_intercept=True, categorical_features=None,
                     target_feature=None):
     """Split ``data`` into model-ready features and the target column.

     Args:
         data: DataFrame holding the target column and all feature columns.
         add_intercept: When true, add a constant ``intercept`` column of 1.0.
         categorical_features: Names of columns to replace with dummy
             (one-hot) columns. ``None`` means there are none.
         target_feature: Name of the target column. When omitted, the
             module-level ``TARGET_FEATURE`` is used.

     Returns:
         A ``(x_data, y_data)`` tuple: the feature DataFrame and the target
         column. ``data`` itself is left unmodified.
     """
     if target_feature is None:
         target_feature = TARGET_FEATURE
     if categorical_features is None:
         categorical_features = []

     # Separate the X and Y values, reading from the ``data`` argument
     # rather than the global ``raw_data``.
     y_data = data[target_feature]
     x_data = data.drop(target_feature, axis=1)

     # To include an intercept, add a new column with a constant.
     if add_intercept:
         x_data['intercept'] = 1.0

     for feature in categorical_features:
         dummy_data = pd.get_dummies(x_data[feature], prefix=feature)

         # We need to remove at least one of the dummy features.
         # It doesn't matter which one. To stay consistent we
         # usually remove the most common value.
         most_common_value = x_data[feature].value_counts().index[0]
         dummy_to_exclude = feature + '_' + str(most_common_value)
         dummy_data_to_use = dummy_data.drop(dummy_to_exclude, axis=1)

         # Replace the original feature with its dummy columns, which are
         # appended after the existing columns.
         x_data = pd.concat(
             [x_data.drop(feature, axis=1), dummy_data_to_use], axis=1)

     return x_data, y_data

 def evaluate(model, x_test):
     """Predict binary class labels for ``x_test`` with a fitted model.

     Args:
         model: Object exposing ``predict_proba(x)``; column 1 of its result
             is taken as the probability that Y equals 1.
         x_test: Feature rows to score.

     Returns:
         A list with one entry per row: 0 when the probability is below
         0.5, otherwise 1.
     """
     # Get prediction probabilities for the test set.
     y_predict_proba_raw = model.predict_proba(x_test)

     # The .predict_proba() function returns the probabilities of Y
     # equaling both 1 and 0, so we just want the probability that
     # Y equals 1.
     y_predict_proba = y_predict_proba_raw[:, 1]

     # Scale the probabilities to predictions of 1 or 0, based on
     # a threshold of 50%.
     y_predict = [0 if y < 0.5 else 1 for y in y_predict_proba]

     # Hand the predictions back; without this the caller receives None.
     return y_predict
 
 def get_confusion_matrix_stats(y_test, y_predict):
     """Print accuracy, precision and recall for a set of predictions.

     The figures come from the confusion matrix of ``y_test`` (true labels)
     against ``y_predict`` (predicted labels). Only the first two rows and
     columns are read, as [[TN, FP], [FN, TP]].
     """
     cm = confusion_matrix(y_test, y_predict)
     true_neg, false_pos = cm[0][0], cm[0][1]
     false_neg, true_pos = cm[1][0], cm[1][1]
     total = float(len(y_test))

     accuracy = float(true_neg + true_pos) / total
     precision = true_pos / float(false_pos + true_pos)
     recall = true_pos / float(false_neg + true_pos)

     report = (
         ('Accuracy: %.2f', accuracy),
         ('Precision: %.2f', precision),
         ('Recall: %.2f', recall),
     )
     for template, value in report:
         print(template % value)


 # Usage:

 # The target feature is whether or not the employee left.
 TARGET_FEATURE = 'left'
 TEST_SET_SIZE = 0.2
 CATEGORICAL_FEATURES = ['sales', 'salary']

 raw_data = load_data('./HR_comma_sep.csv')
 # It's helpful to take a quick look at the data.
 show_samples(raw_data)

 # Preprocess
 x_data, y_data = get_usable_data(raw_data, categorical_features=CATEGORICAL_FEATURES)
 # Good to eye-ball the processed data too. The processed frame no longer
 # has the target column that show_samples() counts, so preview it directly.
 print(x_data.head())

 x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=TEST_SET_SIZE)
 model = LogisticRegression().fit(x_train, y_train)
 predictions = evaluate(model, x_test)
 # The reporting helper defined above is get_confusion_matrix_stats; no
 # function named get_confusion_matrix exists in this file.
 get_confusion_matrix_stats(y_test, predictions)
	import pandas as pd
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import confusion_matrix
	from sklearn.model_selection import train_test_split

	def load_data(path):
	    """Read the CSV file at ``path`` into a pandas DataFrame.

	    The first row of the file is used as the column header. A short
	    confirmation message is printed once the file has been read.
	    """
	    # The body is indented under the def; with flat indentation this
	    # function is a syntax error.
	    raw_data = pd.read_csv(path, header=0)
	    print('Loaded data from', path)
	    return raw_data

	def show_samples(data, num_samples=5, target_feature=None):
	    """Print a quick preview of ``data``.

	    Prints the first ``num_samples`` rows, followed by the number of rows
	    for each distinct value of the target column.

	    Args:
	        data: DataFrame to preview; it must contain the target column.
	        num_samples: How many leading rows to print.
	        target_feature: Name of the target column. When omitted, the
	            module-level ``TARGET_FEATURE`` is used.
	    """
	    if target_feature is None:
	        target_feature = TARGET_FEATURE

	    # Preview the frame that was passed in, not the global ``raw_data``,
	    # so the helper works for whichever DataFrame the caller supplies.
	    print(data.head(num_samples))
	    print('')
	    print('Count by target feature:')
	    print(data[target_feature].value_counts())
	    print('')

	def get_usable_data(data, add_intercept=True, categorical_features=None,
	                    target_feature=None):
	    """Split ``data`` into model-ready features and the target column.

	    Args:
	        data: DataFrame holding the target column and all feature columns.
	        add_intercept: When true, add a constant ``intercept`` column of 1.0.
	        categorical_features: Names of columns to replace with dummy
	            (one-hot) columns. ``None`` means there are none.
	        target_feature: Name of the target column. When omitted, the
	            module-level ``TARGET_FEATURE`` is used.

	    Returns:
	        A ``(x_data, y_data)`` tuple: the feature DataFrame and the target
	        column. ``data`` itself is left unmodified.
	    """
	    if target_feature is None:
	        target_feature = TARGET_FEATURE
	    if categorical_features is None:
	        categorical_features = []

	    # Separate the X and Y values, reading from the ``data`` argument
	    # rather than the global ``raw_data``.
	    y_data = data[target_feature]
	    x_data = data.drop(target_feature, axis=1)

	    # To include an intercept, add a new column with a constant.
	    if add_intercept:
	        x_data['intercept'] = 1.0

	    for feature in categorical_features:
	        dummy_data = pd.get_dummies(x_data[feature], prefix=feature)

	        # We need to remove at least one of the dummy features.
	        # It doesn't matter which one. To stay consistent we
	        # usually remove the most common value.
	        most_common_value = x_data[feature].value_counts().index[0]
	        dummy_to_exclude = feature + '_' + str(most_common_value)
	        dummy_data_to_use = dummy_data.drop(dummy_to_exclude, axis=1)

	        # Replace the original feature with its dummy columns, which are
	        # appended after the existing columns.
	        x_data = pd.concat(
	            [x_data.drop(feature, axis=1), dummy_data_to_use], axis=1)

	    return x_data, y_data

	def evaluate(model, x_test):
	    """Predict binary class labels for ``x_test`` with a fitted model.

	    Args:
	        model: Object exposing ``predict_proba(x)``; column 1 of its result
	            is taken as the probability that Y equals 1.
	        x_test: Feature rows to score.

	    Returns:
	        A list with one entry per row: 0 when the probability is below
	        0.5, otherwise 1.
	    """
	    # Get prediction probabilities for the test set.
	    y_predict_proba_raw = model.predict_proba(x_test)

	    # The .predict_proba() function returns the probabilities of Y
	    # equaling both 1 and 0, so we just want the probability that
	    # Y equals 1.
	    y_predict_proba = y_predict_proba_raw[:, 1]

	    # Scale the probabilities to predictions of 1 or 0, based on
	    # a threshold of 50%.
	    y_predict = [0 if y < 0.5 else 1 for y in y_predict_proba]

	    # Hand the predictions back; without this the caller receives None.
	    return y_predict

	def get_confusion_matrix_stats(y_test, y_predict):
	    """Print accuracy, precision and recall for a set of predictions.

	    The figures come from the confusion matrix of ``y_test`` (true labels)
	    against ``y_predict`` (predicted labels). Only the first two rows and
	    columns are read, as [[TN, FP], [FN, TP]].
	    """
	    # The body is indented under the def; with flat indentation this
	    # function is a syntax error.
	    matrix = confusion_matrix(y_test, y_predict)
	    n_samples = float(len(y_test))
	    accuracy = float(matrix[0][0] + matrix[1][1]) / n_samples
	    precision = matrix[1][1] / float(matrix[0][1] + matrix[1][1])
	    recall = matrix[1][1] / float(matrix[1][0] + matrix[1][1])
	    print('Accuracy: %.2f' % accuracy)
	    print('Precision: %.2f' % precision)
	    print('Recall: %.2f' % recall)


	# Usage:

	# The target feature is whether or not the employee left.
	TARGET_FEATURE = 'left'
	TEST_SET_SIZE = 0.2
	CATEGORICAL_FEATURES = ['sales', 'salary']

	raw_data = load_data('./HR_comma_sep.csv')
	# It's helpful to take a quick look at the data.
	show_samples(raw_data)

	# Preprocess
	x_data, y_data = get_usable_data(raw_data, categorical_features=CATEGORICAL_FEATURES)
	# Good to eye-ball the processed data too. The processed frame no longer
	# has the target column that show_samples() counts, so preview it directly.
	print(x_data.head())

	x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=TEST_SET_SIZE)
	model = LogisticRegression().fit(x_train, y_train)
	predictions = evaluate(model, x_test)
	# The reporting helper defined above is get_confusion_matrix_stats; no
	# function named get_confusion_matrix exists in this file.
	get_confusion_matrix_stats(y_test, predictions)