Julio Cárdenas-Rodríguez (JCardenasRdz) · @DataTranslators · Tucson
# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#%%
from sklearn import metrics
import pandas as pd
# -*- coding: utf-8 -*-
"""
@author: Julio Cardenas-Rodriguez, Ph.D.
jdatascientist@gmail.com
Objective - Describe how to:
1) Use pipelines and feature union in Python
2) Include cleaning as part of the hyperparameter tuning
"""
from sklearn import metrics
# TESTING
print('---' * 50)
print('The r-squared is:', metrics.r2_score(y_test, model.predict(X_test)))
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# final pipeline: clean the data, then fit a linear regression
pipeline = Pipeline([("data_cleaner", data_cleaner), ("LR", LinearRegression())])
# TRAINING
# parameter grid: tune the cleaner's centering flag as part of the search
param_grid = dict(data_cleaner__cont_cleaner__center=[True, False])
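# A minimal sketch of how the pieces above would be wired together; the
# cv value and the calls below are assumptions, not part of the gist:
model = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
model.fit(X_train, y_train)
print('best parameters:', model.best_params_)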
#%% 4. create final data cleaner
from sklearn.pipeline import FeatureUnion
# allocate: FeatureUnion expects (name, transformer) pairs; the names must
# match the ones referenced in param_grid (e.g. 'cont_cleaner')
data_cleaner = FeatureUnion([("cont_cleaner", cont_data_cleaner),
                             ("cat_cleaner", cat_data_cleaner)])
# train
data_cleaner.fit(X_train)
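# Usage sketch (assumed, not shown in the gist): the fitted union stacks
# the continuous and categorical features column-wise into one matrix.
X_train_clean = data_cleaner.transform(X_train)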
def create_manufacturer(DataFrame, N_cases=3):
    """
    Return the top N_cases manufacturers, plus one OTHER_MAN level for the
    rest, in one-hot encoding format.
    """
    # manufacturer = first word of the vehicle name
    man = DataFrame['name'].apply(lambda x: x.split(' ')[0])
    cases = man.value_counts().index
    # collapse all manufacturers beyond the top N_cases into OTHER_MAN
    manufacturer_encoded = man.replace(cases[N_cases:],
                                       len(cases[N_cases:]) * ['OTHER_MAN'])
    return pd.get_dummies(manufacturer_encoded)
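# Hypothetical usage on the seaborn 'mpg' frame loaded further below, whose
# 'name' column starts with the manufacturer (e.g. 'chevrolet chevelle malibu'):
manufacturer_dummies = create_manufacturer(data, N_cases=3)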
#%% 2. Create pipeline to clean continuous data
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd

def center_and_scale_data(DataFrame):
    """
    This function does the actual centering and scaling.
    """
    D = DataFrame.copy()
    for C in D.columns:
        # zero-mean, unit-variance scaling, column by column
        D[C] = (D[C] - D[C].mean()) / D[C].std()
    return D
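# The custom transformer class itself is cut off in this excerpt. A minimal
# sketch of what it likely looked like, exposing the 'center' flag tuned in
# param_grid above (the class name and the 'scale' flag are assumptions):
class ContinuousCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, center=True, scale=True):
        self.center = center
        self.scale = scale

    def fit(self, X, y=None):
        # remember the training statistics so test data is scaled consistently
        self.mean_ = X.mean()
        self.std_ = X.std()
        return self

    def transform(self, X):
        D = X.copy()
        if self.center:
            D = D - self.mean_
        if self.scale:
            D = D / self.std_
        return D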
# load libraries and data
from sklearn.model_selection import train_test_split as tts
from seaborn import load_dataset
data = load_dataset('mpg').dropna()
# split data into features and target
Xdata = data.drop('mpg', axis=1)
Ydata = data.mpg.values
X_train, X_test, y_train, y_test = tts(Xdata, Ydata, test_size=0.33, random_state=42)
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
D = load_boston()
R1, ytest1, yhat1 = train_GradientBoostingRegressor(D.data, D.target, loss='quantile', alpha=0.50, n_steps=50)
R2, ytest2, yhat2 = train_GradientBoostingRegressor(D.data, D.target, loss='ls', n_steps=50)
JCardenasRdz / train_GradientBoostingRegressor.py
Last active July 26, 2020 20:46
2017-11-10-gradient_boosting_hyperopt
# modules
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from hyperopt import hp, fmin, tpe
from hyperopt.pyll import scope
import numpy as np
# hyperopt search space and optimization for the GradientBoostingRegressor
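# The gist preview is truncated here. A minimal sketch of the rest, assuming
# the signature used above (search-space bounds, cv=3, and mapping n_steps to
# max_evals are assumptions, not the author's exact code):
def train_GradientBoostingRegressor(X, y, loss='ls', alpha=0.9, n_steps=50):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    scorer = make_scorer(mae, greater_is_better=False)
    # search space: scope.int casts the quniform draws to integers
    space = {
        'n_estimators': scope.int(hp.quniform('n_estimators', 50, 500, 50)),
        'max_depth': scope.int(hp.quniform('max_depth', 2, 10, 1)),
        'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    }

    def objective(params):
        model = GradientBoostingRegressor(loss=loss, alpha=alpha, **params)
        # the scorer is negated by make_scorer, so flip the sign back:
        # fmin then minimizes the positive cross-validated MAE
        return -cross_val_score(model, X_train, y_train, scoring=scorer, cv=3).mean()

    best = fmin(objective, space, algo=tpe.suggest, max_evals=n_steps)
    # refit on the training split with the best hyperparameters
    R = GradientBoostingRegressor(loss=loss, alpha=alpha,
                                  n_estimators=int(best['n_estimators']),
                                  max_depth=int(best['max_depth']),
                                  learning_rate=best['learning_rate'])
    R.fit(X_train, y_train)
    return R, y_test, R.predict(X_test)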