Created
September 8, 2022 05:58
-
-
Save fernandojunior/4cd5055fe91a69d80b35108e1b04af53 to your computer and use it in GitHub Desktop.
Use OOP and functional programming to create data pipelines with sklearn, classes and pure functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Any, Callable | |
from dataclasses import dataclass | |
import numpy as np | |
import pandas as pd | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.preprocessing import FunctionTransformer | |
class DataframeTransformer:
    """Adapt a plain DataFrame function to the ``fit``/``transform`` protocol.

    The wrapped ``func`` is stateless: ``fit`` learns nothing and
    ``transform`` simply applies ``func`` to its input.

    Args:
        func: Callable taking a ``pandas.DataFrame`` and returning the
            transformed data.
    """

    def __init__(self, func: Callable):
        self.func = func

    def transform(self, input_df: pd.DataFrame, **transform_params) -> pd.DataFrame:
        """Apply ``func`` to ``input_df`` and return its result.

        A 2-D NumPy array is wrapped in a DataFrame before ``func`` is
        called, so ``func`` always sees the DataFrame semantics its signature
        promises (e.g. per-column ``min``/``max``). When ``func`` then returns
        a DataFrame, it is converted back to an array so the caller gets the
        same container type it passed in. Any other input is handed to
        ``func`` unchanged.

        ``transform_params`` is accepted for protocol compatibility and
        ignored.
        """
        # Earlier scikit-learn steps (e.g. SimpleImputer) emit NumPy arrays by
        # default; on an array, reductions like ``.min()`` collapse over the
        # whole array instead of per column.
        if isinstance(input_df, np.ndarray) and input_df.ndim == 2:
            result = self.func(pd.DataFrame(input_df))
            if isinstance(result, pd.DataFrame):
                return result.to_numpy()
            return result
        return self.func(input_df)

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer has no state to learn."""
        return self
def custom_scaling(df: pd.DataFrame) -> Any:
    """Min-max scale every column of ``df`` into the range [0, 1].

    Works on a ``pandas.DataFrame`` or a 2-D NumPy array; the result has the
    same container type as the input.

    Args:
        df: Numeric tabular data, one feature per column.

    Returns:
        ``(df - column_min) / (column_max - column_min)``, computed column by
        column. A column whose values are all equal is mapped to 0 rather
        than NaN.
    """
    # axis=0 makes the reduction per-column for ndarrays as well; without it
    # ndarray.min()/max() reduce over the entire array.
    col_min = df.min(axis=0)
    col_range = df.max(axis=0) - col_min
    # A constant column has zero range; dividing by 1 instead avoids 0/0.
    safe_range = np.where(col_range == 0, 1, col_range)
    return (df - col_min) / safe_range
@dataclass
class Analyze:
    """Read a CSV, run it through the preprocessing pipeline, write a CSV.

    Raises:
        Whatever ``pandas.read_csv``, the pipeline steps or
        ``DataFrame.to_csv`` raise; nothing is caught here.
    """

    input_path: str  # CSV file read by run()
    output_path: str  # CSV file written by run()

    def create_pipe(self):
        """Return a new, unfitted pipeline: impute -> scale -> log1p."""
        return Pipeline([
            # Fill missing values with each column's median.
            ("step1", SimpleImputer(strategy="median")),
            # Custom min-max scaling; MinMaxScaler() is the scikit-learn
            # alternative for this step.
            ("step2", DataframeTransformer(custom_scaling)),
            ("step3", FunctionTransformer(np.log1p)),
        ])

    def run(self):
        """Transform the input CSV and write the result to ``output_path``."""
        df = pd.read_csv(self.input_path)
        transformed = self.create_pipe().fit_transform(df)
        # Build a new frame instead of assigning through ``df.loc[:, :]``:
        # the in-place write has to upcast integer columns to hold the float
        # results, which pandas has deprecated for setitem operations.
        result = pd.DataFrame(
            np.asarray(transformed), index=df.index, columns=df.columns
        )
        # to_csv keeps its default of writing the index as the first column.
        result.to_csv(self.output_path)
# Script entry: build the job with placeholder paths and execute it.
# Replace both paths with real CSV locations before running.
analyze = Analyze(input_path="path/to/input/data", output_path="path/to/output/data")
analyze.run()
Author
fernandojunior
commented
Sep 8, 2022
- https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
- https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html
- https://queirozf.com/entries/scikit-learn-pipelines-custom-pipelines-and-pandas-integration
- https://stackoverflow.com/questions/26414913/normalize-columns-of-a-dataframe
- https://www.andrewvillazon.com/custom-scikit-learn-transformers/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment