Created
March 28, 2024 19:04
-
-
Save gregsheremeta/12ecf439063f85c318403e1a5c987352 to your computer and use it in GitHub Desktop.
iris-training-pipeline.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import List | |
from kfp import client | |
from kfp import compiler | |
from kfp import dsl | |
from kfp.dsl import Dataset | |
from kfp.dsl import Input | |
from kfp.dsl import Model | |
from kfp.dsl import Output | |
@dsl.component(base_image="docker.io/python:3.9.17",
               packages_to_install=['pandas==2.2.0'])
def create_dataset(iris_dataset: Output[Dataset]):
    """Download the UCI Iris data set and store it as a CSV artifact.

    Column names are supplied explicitly through ``names=`` when the remote
    file is parsed, and are written out as the CSV header row.

    Args:
        iris_dataset: Output artifact; the CSV is written to its ``path``.
    """
    # Imported inside the body, like every other component in this file.
    import pandas as pd

    csv_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    col_names = [
        'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Labels'
    ]
    df = pd.read_csv(csv_url, names=col_names)

    with open(iris_dataset.path, 'w') as f:
        # index=False: without it the row index is written as a header-less
        # first column, which a plain pd.read_csv() loads back as an extra
        # numeric column ("Unnamed: 0") that gets treated as a feature.
        df.to_csv(f, index=False)
@dsl.component(base_image="docker.io/python:3.9.17",
               packages_to_install=['pandas==2.2.0', 'scikit-learn==1.4.0'])
def normalize_dataset(
    input_iris_dataset: Input[Dataset],
    normalized_iris_dataset: Output[Dataset],
    standard_scaler: bool,
    min_max_scaler: bool,
):
    """Scale the feature columns of a labelled CSV and write the result.

    Every column except ``Labels`` is passed through the selected scaler;
    ``Labels`` is carried over unchanged. Feature column names are preserved
    in the output, and the output CSV contains no row-index column.

    Args:
        input_iris_dataset: Input artifact whose ``path`` is a CSV with a
            ``Labels`` column.
        normalized_iris_dataset: Output artifact; receives the scaled CSV and
            has ``metadata['state']`` set to ``"Normalized"``.
        standard_scaler: Use ``sklearn.preprocessing.StandardScaler``.
        min_max_scaler: Use ``sklearn.preprocessing.MinMaxScaler``.

    Raises:
        ValueError: If both flags are truthy or both are falsy.
    """
    # Compare truth values, not object identity: with `is`, a truthy
    # non-bool paired with True (e.g. 1 and True) would slip past this check
    # and silently select both scalers.
    if bool(standard_scaler) == bool(min_max_scaler):
        raise ValueError(
            'Exactly one of standard_scaler or min_max_scaler must be True.')

    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import StandardScaler

    with open(input_iris_dataset.path) as f:
        df = pd.read_csv(f)

    # A CSV that was written together with its pandas row index comes back
    # with that index as a column named "Unnamed: <n>". It is bookkeeping,
    # not a measurement, so it must not be scaled or forwarded as a feature.
    df = df.drop(
        columns=[name for name in df.columns if name.startswith('Unnamed:')])

    labels = df.pop('Labels')

    # The guard above guarantees exactly one flag is set.
    scaler = StandardScaler() if standard_scaler else MinMaxScaler()

    # fit_transform returns a bare array; re-attach the feature names so the
    # output stays self-describing.
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    scaled['Labels'] = labels

    normalized_iris_dataset.metadata['state'] = "Normalized"
    with open(normalized_iris_dataset.path, 'w') as f:
        # index=False keeps the row index out of the file (see note above).
        scaled.to_csv(f, index=False)
@dsl.component(base_image="docker.io/python:3.9.17",
               packages_to_install=['pandas==2.2.0', 'scikit-learn==1.4.0'])
def train_model(
    normalized_iris_dataset: Input[Dataset],
    model: Output[Model],
    n_neighbors: int,
):
    """Fit a k-nearest-neighbours classifier and pickle it to the model artifact.

    The CSV is split with ``train_test_split(random_state=0)`` and the
    classifier is fitted on the training portion only. No evaluation is
    performed in this component.

    Args:
        normalized_iris_dataset: Input artifact whose ``path`` is a CSV with
            a ``Labels`` column; all remaining columns are used as features.
        model: Output artifact; receives the pickled classifier and has
            ``metadata['framework']`` set to ``'scikit-learn'``.
        n_neighbors: Forwarded to ``KNeighborsClassifier(n_neighbors=...)``.
    """
    import pickle

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    with open(normalized_iris_dataset.path) as f:
        df = pd.read_csv(f)

    # A CSV written together with its pandas row index comes back with that
    # index as a column named "Unnamed: <n>". Left in place, this unscaled
    # 0..N-1 counter would dominate the neighbour distances, so drop it.
    df = df.drop(
        columns=[name for name in df.columns if name.startswith('Unnamed:')])

    y = df.pop('Labels')
    X = df

    # Keep the split (and its fixed seed) so the model is fitted on the same
    # subset of rows as before; the held-out part is not used here.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(X_train, y_train)

    model.metadata['framework'] = 'scikit-learn'
    with open(model.path, 'wb') as f:
        pickle.dump(clf, f)
@dsl.pipeline(name='iris-training-pipeline')
def my_pipeline(
    standard_scaler: bool,
    min_max_scaler: bool,
    neighbors: int,
):
    """Chain dataset creation, normalization and model training.

    Args:
        standard_scaler: Forwarded to ``normalize_dataset``.
        min_max_scaler: Forwarded to ``normalize_dataset``, which raises
            ``ValueError`` unless exactly one of the two flags is set.
        neighbors: Forwarded to ``train_model`` as ``n_neighbors``.
    """
    create_dataset_task = create_dataset()

    # Forward the pipeline parameters; previously the literals True/False
    # were passed here, so the caller's scaler choice was ignored.
    normalize_dataset_task = normalize_dataset(
        input_iris_dataset=create_dataset_task.outputs['iris_dataset'],
        standard_scaler=standard_scaler,
        min_max_scaler=min_max_scaler)

    train_model(
        normalized_iris_dataset=normalize_dataset_task
        .outputs['normalized_iris_dataset'],
        n_neighbors=neighbors)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment