Created
May 5, 2020 09:35
-
-
Save csiebler/50a1192012a81d58f145ac65feef8858 to your computer and use it in GitHub Desktop.
A short example for train.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import argparse | |
import joblib | |
import pandas as pd | |
from azureml.core import Run | |
from azureml.core.run import Run | |
from sklearn.compose import ColumnTransformer | |
from sklearn.impute import SimpleImputer | |
from sklearn.linear_model import LogisticRegression | |
from sklearn.model_selection import train_test_split | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.preprocessing import StandardScaler | |
# All the imports go here... | |
def getRuntimeArgs():
    """Parse the command-line arguments of the training script.

    Returns:
        argparse.Namespace: the parsed arguments; ``data_path`` holds the
        value given to ``--data-path``.

    Raises:
        SystemExit: raised by argparse when ``--data-path`` is missing or
        the command line is otherwise invalid.
    """
    parser = argparse.ArgumentParser()
    # The data path is passed in dynamically. It is required: main() joins
    # it with the CSV file name, and a missing value would otherwise only
    # surface later as an opaque TypeError from os.path.join(None, ...).
    parser.add_argument(
        '--data-path',
        type=str,
        required=True,
        help='Directory that contains the training data',
    )
    # Add more arguments here
    args = parser.parse_args()
    return args
def main():
    """Load the credit data, train the model and save it under ``./outputs/``.

    Reads ``german_credit_data.csv`` from the directory given by
    ``--data-path``, trains the classifier via ``model_train`` and dumps the
    fitted model to ``./outputs/model.pkl`` with joblib.
    """
    # Get arguments from command line
    args = getRuntimeArgs()

    # Get the current experiment run context from Azure Machine Learning
    run = Run.get_context()

    # Use whatever function you require to load the data from the data directory
    data_file = os.path.join(args.data_path, 'german_credit_data.csv')
    credit_data_df = pd.read_csv(data_file)

    # Do some model training
    clf = model_train(credit_data_df, run)

    # Copying to the "outputs" directory automatically uploads it to Azure ML
    output_dir = './outputs/'
    os.makedirs(output_dir, exist_ok=True)
    joblib.dump(value=clf, filename=os.path.join(output_dir, 'model.pkl'))
# Do your training here
def model_train(ds_df, run):
    """Train a logistic-regression pipeline on the credit data and log its accuracy.

    Args:
        ds_df: pandas DataFrame holding a ``Risk`` label column plus the
            feature columns. An ``Sno`` column, if present, is discarded.
            The frame itself is not modified.
        run: object exposing ``log(name, value)`` (the Azure ML run obtained
            by the caller); it receives the train and test accuracy.

    Returns:
        The fitted sklearn ``Pipeline`` (preprocessing followed by the
        classifier).

    Raises:
        KeyError: if ``ds_df`` has no ``Risk`` column.
    """
    y_raw = ds_df['Risk']
    # drop() returns a new frame, so the caller's DataFrame is left untouched
    # and the function can be called again on the same frame. A missing
    # 'Sno' column is tolerated because it is discarded anyway.
    X_raw = ds_df.drop(['Sno', 'Risk'], axis=1, errors='ignore')

    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['int64', 'float']).columns

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
        ('onehotencoder', _make_one_hot_encoder()),
    ])
    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
    ])
    # Columns that are neither numeric nor categorical are dropped.
    feature_engineering_pipeline = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features),
        ],
        remainder="drop",
    )

    # Encode labels
    # NOTE(review): the fitted LabelEncoder is local and is neither returned
    # nor saved, so the class-name mapping is not available to callers.
    le = LabelEncoder()
    encoded_y = le.fit_transform(y_raw)

    # Train test split (stratified on the label, fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        X_raw,
        encoded_y,
        test_size=0.20,
        stratify=encoded_y,
        random_state=42,
    )

    # Create sklearn pipeline
    lr_clf = Pipeline(steps=[
        ('preprocessor', feature_engineering_pipeline),
        ('classifier', LogisticRegression(solver="lbfgs")),
    ])

    # Train the model
    lr_clf.fit(X_train, y_train)

    # Capture metrics
    train_acc = lr_clf.score(X_train, y_train)
    test_acc = lr_clf.score(X_test, y_test)
    print("Training accuracy: %.3f" % train_acc)
    print("Test data accuracy: %.3f" % test_acc)

    # Log metrics to Azure ML
    run.log('Train accuracy', train_acc)
    run.log('Test accuracy', test_acc)

    return lr_clf


def _make_one_hot_encoder():
    """Build the dense one-hot encoder used for the categorical columns.

    Categories not seen during fitting are encoded as all zeros instead of
    raising, so scoring does not fail on a value that only occurs outside
    the training split.
    """
    try:
        # Keyword used by scikit-learn >= 1.2 (``sparse`` was removed in 1.4).
        return OneHotEncoder(
            categories='auto', sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn releases only accept the ``sparse`` keyword.
        return OneHotEncoder(
            categories='auto', sparse=False, handle_unknown='ignore')
# Run the training only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment