# Adapted from:
# https://www.tensorflow.org/beta/tutorials/estimators/boosted_trees
# TODO: TF 2.0 is out now; the pre-release setup below is no longer needed.
# !pip install tensorflow==2.0.0-rc1
# try:
#   %tensorflow_version 2.x
# except Exception:
#   pass
from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import tensorflow as tf
import platform
import pandas as pd
tf.random.set_seed(123)
print(platform.python_version())
print(tf.version.GIT_VERSION)
print(tf.version.VERSION)
print(tf.__version__)
# Load dataset.
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')
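# Number of training and evaluation examples.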
print(dftrain.shape[0])
print(dfeval.shape[0])
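# Optional, hedged sanity check: peek at the raw features and their summary
# statistics before building feature columns (standard pandas, nothing
# TensorFlow-specific here).
print(dftrain.head())
print(dftrain.describe())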
fc = tf.feature_column
CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck', 'embark_town', 'alone']
NUMERIC_COLUMNS = ['age', 'fare']
def one_hot_cat_column(feature_name, vocab):
    return tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocab))
feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
    # Need to one-hot encode categorical features.
    vocabulary = dftrain[feature_name].unique()
    feature_columns.append(one_hot_cat_column(feature_name, vocabulary))
for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name,
                                                            dtype=tf.float32))
example = dict(dftrain.head(1))
print(example)
class_fc = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'class',
        ('First', 'Second', 'Third')))
print(class_fc)
print('Feature value: "{}"'.format(example['class'].iloc[0]))
print('One-hot encoded: ', tf.keras.layers.DenseFeatures([class_fc])(example).numpy())
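# A minimal sketch of the same idea applied to every column at once: passing
# all feature columns to DenseFeatures yields the full dense vector the
# estimator will see for this example row.
print(tf.keras.layers.DenseFeatures(feature_columns)(example).numpy())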
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)
def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(NUM_EXAMPLES)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In-memory training doesn't use batching; one batch is the whole dataset.
        dataset = dataset.batch(NUM_EXAMPLES)
        return dataset
    return input_fn
# Training and evaluation input functions.
train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)
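# Hedged sanity check (assumes TF 2.x eager execution): pull one batch from a
# fresh single-epoch input_fn to confirm that "one batch" really is the whole
# training set.
sanity_ds = make_input_fn(dftrain, y_train, n_epochs=1)()
features, labels = next(iter(sanity_ds))
print(labels.shape)  # should equal NUM_EXAMPLES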
# Since the data fits in memory, use the entire dataset per layer; it's faster.
# Above, one batch is defined as the entire dataset.
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)
# The model will stop training once the specified number of trees is built, not
# based on the number of steps.
# NOTE: in a Jupyter notebook, this training call will crash the kernel.
est.train(train_input_fn, max_steps=100)
# Evaluate the trained model on the held-out data.
result = est.evaluate(eval_input_fn)
print(pd.Series(result))
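# A short follow-up sketch, mirroring the source tutorial: est.predict()
# returns a generator of dicts whose 'probabilities' entry holds per-class
# scores; index 1 is the probability of survival.
pred_dicts = list(est.predict(eval_input_fn))
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])
print(probs.head())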