Created
June 20, 2019 08:05
-
-
Save jpbarto/80807ca9c84057af561af19ce433a56b to your computer and use it in GitHub Desktop.
Simple SKLearn script to transform some inputs using the SKLearn Estimator on Amazon SageMaker
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import json
import os
import pickle
from io import StringIO

import numpy as np
import pandas as pd
from sagemaker_containers.beta.framework import (
    content_types, encoders, env, modules, transformer, worker)
from sklearn import preprocessing
# Input columns, in the order they appear in the CSV.
feature_columns_names = [
    'sex',             # M, F, and I (infant)
    'length',          # longest shell measurement
    'diameter',        # perpendicular to length
    'height',          # with meat in shell
    'whole_weight',    # whole abalone
    'shucked_weight',  # weight of meat
    'viscera_weight',  # gut weight (after bleeding)
    'shell_weight',    # after being dried
]

# Target column; rings + 1.5 gives the age in years.
label_column = 'rings'

# 'sex' is the only textual column; every other feature is read as float64.
feature_columns_dtype = {
    name: (str if name == 'sex' else np.float64)
    for name in feature_columns_names
}

label_column_dtype = {label_column: np.float64}
def merge_two_dicts(x, y):
    """Return a new dict holding x's entries overlaid with y's.

    Neither input is modified; where both define a key, y's value wins.
    """
    merged = x.copy()
    merged.update(y)  # in-place on the copy only
    return merged
# Categorical columns are label-encoded; everything else is min-max scaled.
cat_features = ['sex']

# Two independent lists with the same content: every feature except 'sex'.
num_features = [name for name in feature_columns_names if name != 'sex']
numeric_features = [name for name in feature_columns_names if name != 'sex']
####
# Batch transformation / inference code
####
def model_fn(model_dir):
    """Load the fitted scaler and encoder from ``model_dir``.

    Args:
        model_dir: Directory containing ``model.pkl``, a pickled
            two-element sequence ``[scaler, encoder]``.

    Returns:
        dict with the keys ``'encoder'`` and ``'scaler'``.

    Raises:
        OSError: if ``model.pkl`` cannot be opened.
    """
    # NOTE: pickle executes arbitrary code on load; only point this at model
    # artifacts produced by a trusted training job.
    # The context manager closes the file even when unpickling fails.
    with open(os.path.join(model_dir, "model.pkl"), 'rb') as f:
        scaler, encoder = pickle.load(f)
    return {'encoder': encoder, 'scaler': scaler}
def input_fn(input_data, content_type):
    """Parse a raw request payload into a feature DataFrame.

    Args:
        input_data: CSV text with one record per line. It is assumed to
            contain no header row, no label column and exactly the columns
            listed in ``feature_columns_names``, in that order.
        content_type: MIME type of the payload; only ``'text/csv'`` is
            accepted.

    Returns:
        pandas.DataFrame whose columns are ``feature_columns_names``.

    Raises:
        ValueError: if ``content_type`` is anything other than ``'text/csv'``.
    """
    if content_type != 'text/csv':
        raise ValueError("{} not supported by script!".format(content_type))

    # Read the raw input data as CSV.
    df = pd.read_csv(StringIO(input_data), header=None)
    df.columns = feature_columns_names

    print("Processing CSV input")
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() would only add a stray "None" line.
    df.info()
    return df
def predict_fn(input_data, model):
    """Apply the fitted scaler and encoder to a batch of records.

    Args:
        input_data: DataFrame holding the ``numeric_features`` columns and a
            ``'sex'`` column.
        model: dict with a ``'scaler'`` and an ``'encoder'`` entry, each
            exposing ``transform``.

    Returns:
        A new DataFrame with the scaled numeric columns followed by the
        encoded ``'sex'`` column.
    """
    scaler, encoder = model['scaler'], model['encoder']

    # NOTE(review): 'sex' ends up as the LAST column here, whereas the
    # training script reorders to feature_columns_names ('sex' first) --
    # confirm which order downstream consumers expect.
    transformed = pd.DataFrame(
        scaler.transform(input_data[numeric_features]),
        columns=numeric_features,
    )
    transformed['sex'] = encoder.transform(input_data['sex'])

    print("Processed records with shape {}".format(transformed.shape))
    return transformed
def output_fn(prediction, accept):
    """Serialize the transformed records for the response.

    Args:
        prediction: DataFrame of transformed features.
        accept: Requested response MIME type; ``"application/json"`` and
            ``'text/csv'`` are supported.

    Returns:
        ``worker.Response`` carrying the serialized payload with ``accept``
        as its mimetype. For JSON the payload has the shape
        ``{"instances": [{"features": [...]}, ...]}``, one entry per row.

    Raises:
        RuntimeError: if ``accept`` is not one of the supported types.
    """
    if accept == "application/json":
        # .tolist() turns each row into a list of plain Python values;
        # json.dumps cannot serialize numpy arrays directly.
        instances = [{"features": row} for row in prediction.values.tolist()]
        json_output = {"instances": instances}
        return worker.Response(json.dumps(json_output), mimetype=accept)

    if accept == 'text/csv':
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)

    # RuntimeError is the built-in; "RuntimeException" does not exist in
    # Python and would surface as a NameError instead of this message.
    raise RuntimeError("{} accept type is not supported by this script.".format(accept))
####
# Training job code
####
# Executed as __main__ when performing a training job.
if __name__ == '__main__':
    # Training entry point: fit a MinMaxScaler on the numeric columns and a
    # LabelEncoder on 'sex', then pickle both as model.pkl in the model dir.
    parser = argparse.ArgumentParser()

    # SageMaker-specific arguments; defaults come from environment variables.
    # os.environ.get() is used so that a missing variable does not raise
    # KeyError before an explicitly passed command-line value can be read.
    parser.add_argument('--output-data-dir', type=str,
                        default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str,
                        default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str,
                        default=os.environ.get('SM_CHANNEL_TRAIN'))
    args = parser.parse_args()

    if args.model_dir is None or args.train is None:
        parser.error("--model-dir and --train are required when "
                     "SM_MODEL_DIR / SM_CHANNEL_TRAIN are not set")

    print("Training with args: {}".format(args))

    raw_data = pd.read_csv(
        os.path.join(args.train, 'abalone.csv'),
        header=None,
        names=feature_columns_names + [label_column],
        dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype))
    print("Read abalone.csv")
    # DataFrame.info() prints its own summary and returns None.
    raw_data.info()

    num_scaler = preprocessing.MinMaxScaler()
    num_scaler.fit(raw_data[numeric_features])
    num_scaled = num_scaler.transform(raw_data[numeric_features])
    df_scaled = pd.DataFrame(num_scaled, columns=numeric_features)

    cat_encoder = preprocessing.LabelEncoder()
    cat_encoder.fit(raw_data['sex'])
    # Transform the same 1-D Series the encoder was fitted on, not a
    # one-column DataFrame.
    df_encoded = cat_encoder.transform(raw_data['sex'])

    # The transformed training frame is not persisted; only the fitted
    # transformers below are saved.
    df_scaled['sex'] = df_encoded
    df_scaled = df_scaled[feature_columns_names]

    # The context manager closes the file even if pickling fails.
    with open(os.path.join(args.model_dir, 'model.pkl'), 'wb') as f:
        pickle.dump([num_scaler, cat_encoder], f)

    print("Trained encoder and saved as model.pkl")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment