Created
January 27, 2020 15:01
-
-
Save netskink/078c114a2e4d1a73c853822487b9ebc0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Source (TensorFlow boosted trees estimator tutorial):
# https://www.tensorflow.org/beta/tutorials/estimators/boosted_trees
# TODO: TensorFlow 2.0 is out now; the release-candidate pin below is outdated.
#!pip install tensorflow==2.0.0-rc1 | |
#try: | |
# %tensorflow_version 2.x | |
#except Exception: | |
# pass | |
from __future__ import absolute_import, division, print_function, unicode_literals | |
import numpy as np | |
import tensorflow as tf | |
import platform | |
import pandas as pd | |
# Seed TensorFlow's global random generator.
tf.random.set_seed(123)

# Report the Python and TensorFlow versions in use, one per line.
print(platform.python_version())
print(tf.version.GIT_VERSION, tf.version.VERSION, tf.__version__, sep='\n')

# Load dataset: the Titanic train/eval CSVs. The 'survived' column is popped
# out of each frame and kept separately as the label series.
dftrain = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tf-datasets/titanic/eval.csv')
y_train, y_eval = dftrain.pop('survived'), dfeval.pop('survived')

# Row counts of the training and evaluation frames.
print(len(dftrain), len(dfeval), sep='\n')

# Short alias for the feature-column module.
fc = tf.feature_column

# Feature names, split by how they are turned into feature columns.
CATEGORICAL_COLUMNS = [
    'sex',
    'n_siblings_spouses',
    'parch',
    'class',
    'deck',
    'embark_town',
    'alone',
]
NUMERIC_COLUMNS = ['age', 'fare']
def one_hot_cat_column(feature_name, vocab):
    """Build an indicator (one-hot) feature column for a categorical feature.

    Args:
        feature_name: Name of the feature the column reads.
        vocab: Vocabulary list handed to
            ``categorical_column_with_vocabulary_list``.

    Returns:
        The indicator column wrapping the vocabulary-list categorical column.
    """
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        feature_name, vocab)
    return tf.feature_column.indicator_column(categorical)
# Categorical features need to be one-hot encoded; the vocabulary of each one
# is the set of distinct values found in the training frame.
feature_columns = [
    one_hot_cat_column(name, dftrain[name].unique())
    for name in CATEGORICAL_COLUMNS
]

# Numeric features are used directly as float32 numeric columns.
feature_columns.extend(
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
)

# Demonstrate the one-hot transformation on the first training row.
example = dict(dftrain.head(1))
print(example)

class_vocab = ('First', 'Second', 'Third')
class_fc = tf.feature_column.indicator_column(
    tf.feature_column.categorical_column_with_vocabulary_list(
        'class', class_vocab))
print(class_fc)

first_class_value = example['class'].iloc[0]
print('Feature value: \"{}\"'.format(first_class_value))

dense_layer = tf.keras.layers.DenseFeatures([class_fc])
print('One-hot encoded: ', dense_layer(example).numpy())

# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)
def make_input_fn(X, y, n_epochs=None, shuffle=True):
    """Create a zero-argument input function that yields a ``tf.data.Dataset``.

    Args:
        X: Feature table; converted with ``dict(X)`` before slicing.
        y: Labels sliced alongside the features.
        n_epochs: Value passed to ``Dataset.repeat``. The training call site
            leaves it as ``None`` to cycle through the data as many times as
            needed.
        shuffle: When true, shuffle with a buffer of ``NUM_EXAMPLES``.

    Returns:
        A callable taking no arguments that builds and returns the dataset.
    """
    def input_fn():
        ds = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            ds = ds.shuffle(NUM_EXAMPLES)
        # In-memory training doesn't use real batching: each batch is sized
        # by NUM_EXAMPLES (the training-set length).
        return ds.repeat(n_epochs).batch(NUM_EXAMPLES)

    return input_fn
# Training and evaluation input functions. Evaluation makes a single,
# unshuffled pass over the eval set.
train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)

# Since data fits into memory, use entire dataset per layer. It will be faster.
# Above one batch is defined as the entire dataset.
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)

# The model will stop training once the specified number of trees is built, not
# based on the number of steps.
# NOTE: in Jupyter this call has been observed to crash the kernel.
est.train(train_input_fn, max_steps=100)

# Evaluate on the eval set and print the resulting metrics.
# (Fixed: the method is `evaluate`; the previous `evalute` spelling raised
# AttributeError after training had already completed.)
result = est.evaluate(eval_input_fn)
print(pd.Series(result))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment