Forked from sjtalkar/gist:6857c1efa51217a08bbb8d1808ebed30
Created
November 16, 2022 15:09
-
-
Save maria-aguilera/bf568347e53dd6703319d72542f05142 to your computer and use it in GitHub Desktop.
Altair functions for EDA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import altair as alt | |
# Simple Altair plot
import altair as alt | |
def plotGenericLineChart(x_values, x_type, y_values, y_type, chart_title, x_axis_title, y_axis_title):
    """
    Create a 600x400 line chart of y_values against x_values.

    Inputs: x_values, y_values - the paired values to plot on each axis
            x_type, y_type - Altair encoding type shorthand appended to the
                             field name in the encoding string (e.g. "Q")
            chart_title - title attached to the chart
            x_axis_title, y_axis_title - titles of the two axes
    Returns: the Altair chart object (it is not displayed here)
    """
    # Both series go into one frame under fixed column names so the encodings
    # below can refer to them regardless of what the caller passed in.
    df = pd.DataFrame({"x_values": x_values, "y_values": y_values})
    # chart_title was previously accepted but never applied to the chart.
    chart = alt.Chart(df, title=chart_title).mark_line().encode(
        x=alt.X(f"x_values:{x_type}", title=x_axis_title),
        y=alt.Y(f"y_values:{y_type}", title=y_axis_title),
    ).properties(width=600, height=400)
    return chart
def plotDistributionForColumn(df, colName):
    """
    Chart the distribution of one quantitative column of the df dataframe.

    The upper panel is a green histogram (at most 100 bins) overlaid with a
    red rule at the column mean and an orange rule at the column median.
    A box plot of the same column is stacked underneath it.
    """
    column_data = df[[colName]]
    titled = alt.Chart(column_data, title=colName)

    histogram = titled.mark_bar(color='green').encode(
        alt.X(f"{colName}:Q", bin=alt.Bin(maxbins=100)),
        y='count()',
    )
    # The two rules are built on an untitled chart of the same data.
    mean_marker = alt.Chart(column_data).mark_rule(color='red').encode(
        x=f'mean({colName}):Q'
    )
    median_marker = alt.Chart(column_data).mark_rule(color='orange').encode(
        x=f'median({colName}):Q'
    )
    box = titled.mark_boxplot().encode(
        x=f'{colName}:Q',
    )

    # "+" layers charts and "&" stacks them vertically; "+" binds tighter, so
    # the three layers are combined first and the box plot is placed below.
    upper_panel = histogram + mean_marker + median_marker
    return upper_panel & box
# Notebook cell: render the distribution chart of every numeric feature and of
# the 'rentals' column. IPython's display is used so that each chart produced
# inside the loop is shown, not just the last expression of the cell.
from IPython.display import display

for col in numeric_features + ['rentals']:
    chart = plotDistributionForColumn(bike_data, col)
    display(chart)
def plotCategoricalfeatures(df, colName):
    """
    Plot a bar graph for a categorical column of the df dataframe.

    The column is encoded as nominal: there is one bar per distinct value and
    the bar height is the number of rows holding that value. The chart is
    titled with the column name.
    """
    category_data = df[[colName]]
    bars = alt.Chart(category_data, title=colName).mark_bar(color='#20b2aa')
    return bars.encode(
        alt.X(f"{colName}:N"),
        y='count()',
    )
def createCorrelationScatters(df, colName, targetName):
    """
    Create a scatter chart of targetName against colName with a regression line.

    The correlation between the two columns (pandas Series.corr) is written
    in the top-left corner of the chart as "corr: <value>".

    Inputs: df - dataframe holding both columns
            colName - column plotted on the x axis
            targetName - column plotted on the y axis
    Returns: a layered Altair chart (orange points, label, blue fitted line)
    """
    corr = df[targetName].corr(df[colName])

    scatter = alt.Chart(df).mark_circle().encode(
        alt.X(f"{colName}:Q"),
        alt.Y(f"{targetName}:Q"),
        color=alt.value("orange"),
    ).properties(
        width=300,
        height=150,
    )

    # The label is a constant, so it is bound to a single empty record.
    # Binding it to df would emit one identical text mark per row, all stacked
    # on the same pixel position.
    label = alt.Chart({'values': [{}]}).mark_text(
        align="left", baseline="top"
    ).encode(
        x=alt.value(5),  # pixels from left
        y=alt.value(5),  # pixels from top
        text=alt.value(f"corr: {corr:.3f}"),
    )

    # The line is derived from the scatter layer, so it starts with the orange
    # colour encoding; the blue value encoding replaces it.
    regression_line = scatter.transform_regression(
        colName, targetName
    ).mark_line(color='darkblue').encode(color=alt.value('blue'))

    return scatter + label + regression_line
### Create box plots of the target - facet by values of the categorical values
def createBarsForTargetPerCategory(df, targetName, catColName):
    """
    Create one box plot of the target per value of a categorical column.

    Inputs: df - dataframe holding both columns
            targetName - quantitative column summarised on the y axis
            catColName - nominal column; each distinct value gets its own
                         facet column
    Returns: the faceted Altair chart
    """
    # Chart the dataframe that was passed in; the global bike_data was
    # previously used here, which silently ignored the df argument.
    box_plot = alt.Chart(df).mark_boxplot().encode(
        y=f'{targetName}:Q',
    ).facet(
        column=f'{catColName}:N'
    )
    return box_plot
# Notebook cell: show the faceted box plots of the 'rentals' target once for
# every categorical feature.
targetName = 'rentals'

for catColName in categorical_features:
    chart = createBarsForTargetPerCategory(bike_data, targetName, catColName)
    display(chart)
def compareTrueAndPredictedLabels(y_true, y_pred, degree = 1):
    """
    Plot predicted against true values of a continuous variable with a fitted trend line.

    The chart shows the (true, predicted) pairs as red points, a polynomial
    of the given degree fitted through them (a straight line for the default
    degree 1), and a text label with the MSE, RMSE and R² of the predictions.
    Note that -
    R² gives us a measure of how well the actual outcomes are replicated by the model or the regression line. This is based on the total variation of prediction explained by the model.

    NOTE: a later definition with the same name in this file takes an extra
    chart_title argument and replaces this one when both are executed.

    Inputs: y_true - the labelled (actual) values
            y_pred - the values predicted by the model
            degree - degree of the polynomial fitted through the points
    Returns: a layered Altair chart titled 'Daily Bike Share Predictions'
    """
    # All computations use the y_true argument; the global y_test was
    # previously read here, so the parameter was ignored.
    df = pd.DataFrame(data={'y_true': y_true, 'y_pred': y_pred})

    # Evaluate the predictions against the true values
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    FIG_WIDTH = 500
    FIG_HEIGHT = 400
    base = alt.Chart(
        df, title='Daily Bike Share Predictions'
    ).properties(
        height=FIG_HEIGHT,
        width=FIG_WIDTH
    )
    s = base.mark_circle().encode(
        x=alt.X('y_true:Q', title='Actual Labels'),
        y=alt.Y('y_pred:Q', title='Predicted Labels'),
        color=alt.value('red')
    )

    # Fit a polynomial of the requested degree between true and predicted
    # values (the degree was previously hard-coded to 1).
    z = np.polyfit(y_true, y_pred, degree)
    p = np.poly1d(z)
    reg_line = pd.DataFrame({'y_true': y_true, 'y_eval': p(y_true)})
    l = alt.Chart(reg_line).mark_line().encode(
        x='y_true:Q',
        y='y_eval:Q'
    )

    # The label is constant, so it is bound to a single empty record; binding
    # it to df would draw one identical text mark per row on the same spot.
    # The first value is the MSE and is labelled as such (it used to be
    # labelled "corr").
    text = alt.Chart({'values': [{}]}).mark_text(
        align="right", baseline="bottom"
    ).encode(
        x=alt.value(FIG_WIDTH - 100),  # pixels from left
        y=alt.value(FIG_HEIGHT - 10),  # pixels from top
        text=alt.value(f"mse: {mse:.3f} rmse: {rmse:0.3f} r2: {r2:0.3f} "),
    )
    return s + l + text
################################################################################################################################### | |
##################Assessing models for baseline#################################################################################### | |
def compareTrueAndPredictedLabels(y_true, y_pred, chart_title, degree = 1):
    """
    Plot predicted against true values of a continuous variable with a fitted trend line.

    The chart shows the (true, predicted) pairs as red points, a polynomial
    of the given degree fitted through them (a straight line for the default
    degree 1), and a multi-line text label with the MSE, RMSE and R² of the
    predictions.
    Note that -
    R² gives us a measure of how well the actual outcomes are replicated by the model or the regression line.
    This is based on the total variation of prediction explained by the model.

    Inputs: y_true - the labelled y test values
            y_pred - predicted values for (baseline?) model
            chart_title - a LIST of strings that is the (baseline?) model's parameters
            degree - degree of the polynomial fitted through the points
    Returns: a layered Altair chart titled with chart_title
    """
    # All computations use the y_true argument; the global y_test was
    # previously read here, so the parameter was ignored.
    df = pd.DataFrame(data={'y_true': y_true, 'y_pred': y_pred})

    # Evaluate the predictions against the true values
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    FIG_WIDTH = 300
    FIG_HEIGHT = 200
    base = alt.Chart(
        df, title=chart_title
    ).properties(
        height=FIG_HEIGHT,
        width=FIG_WIDTH
    )
    s = base.mark_circle().encode(
        x=alt.X('y_true:Q', title='Actual Labels'),
        y=alt.Y('y_pred:Q', title='Predicted Labels'),
        color=alt.value('red')
    )

    # Fit a polynomial of the requested degree between true and predicted
    # values (the degree was previously hard-coded to 1).
    z = np.polyfit(y_true, y_pred, degree)
    p = np.poly1d(z)
    reg_line = pd.DataFrame({'y_true': y_true, 'y_eval': p(y_true)})
    l = alt.Chart(reg_line).mark_line().encode(
        x='y_true:Q',
        y='y_eval:Q'
    )

    # The label is constant, so it is bound to a single empty record; binding
    # it to df would draw one identical text mark per row on the same spot.
    # The first value is the MSE and is labelled as such (it used to be
    # labelled "corr").
    text = alt.Chart({'values': [{}]}).mark_text(
        align="right", baseline="bottom", lineBreak='\n'
    ).encode(
        x=alt.value(FIG_WIDTH - 100),  # pixels from left
        y=alt.value(FIG_HEIGHT - 30),  # pixels from top
        text=alt.value(f"mse: {mse:.3f}\n rmse: {rmse:0.3f}\n r2: {r2:0.3f}\n "),
    )
    return s + l + text
# Find scores for a variety of models with default parameters
from sklearn.linear_model import LinearRegression | |
from sklearn.linear_model import Lasso | |
from sklearn.tree import DecisionTreeRegressor | |
from sklearn.ensemble import RandomForestRegressor | |
from sklearn.ensemble import GradientBoostingRegressor | |
from sklearn.dummy import DummyRegressor | |
def evaluateBaseModel(reg_algos, X_train, X_test, y_train, y_test):
    """
    Fit each regression algorithm, predict on the test set and display its chart.

    reg_algos - a list of regression algorithms sent in for prediction and
                evaluation of the model
    Every algorithm is fitted on (X_train, y_train) and used to predict
    X_test. The true-vs-predicted chart is titled with the algorithm's string
    representation split on commas and displayed with an 8pt title font.
    Nothing is returned.
    """
    for regressor in reg_algos:
        fitted = regressor.fit(X_train, y_train)
        predictions = fitted.predict(X_test)
        title_lines = str(regressor).split(',')
        comparison = compareTrueAndPredictedLabels(y_test, predictions, title_lines)
        display(comparison.configure_title(fontSize=8))
#### Create and compare models with default parameters
reg_algos = [
    DummyRegressor(),
    LinearRegression(),
    DecisionTreeRegressor(),
]
evaluateBaseModel(reg_algos, X_train, X_test, y_train, y_test)
################################################################################################################################ | |
##################USING PIPELINE TO CREATE COLUMN TRANSFORMATION AND THEN RUN A REGRESSION MODEL################################ | |
def evaluatePipelineModel(pipeline, X_train, X_test, y_train, y_test, algo):
    """
    Fit a pipeline, display its true-vs-predicted chart and return the fitted model.

    pipeline is a pipeline of transformations, imputations and the algorithm
    itself that can be fit on the training set.
    algo is the algorithm in the pipeline; only its string representation
    (split on commas) is used here, as the chart title.
    """
    fitted = pipeline.fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    title_lines = str(algo).split(',')
    comparison = compareTrueAndPredictedLabels(y_test, predictions, title_lines)
    display(comparison.configure_title(fontSize=8))
    return fitted
from sklearn.compose import make_column_transformer | |
from sklearn.pipeline import make_pipeline | |
def createPipelineAndEvaluate(df, not_features_cols, features_cols):
    """
    Build a preprocessing + GradientBoostingRegressor pipeline and evaluate it.

    This function identifies the numeric (float dtype) columns and the
    discrete value (int dtype) columns of df, leaving out not_features_cols.
    It transforms them using StandardScaler and OneHotEncoder respectively,
    then creates a pipeline with make_pipeline that sends the transformed
    dataset to the ML algorithm.

    Inputs: df - dataframe whose column dtypes decide the transformation
            not_features_cols - columns of df that are not features
            features_cols - the columns of the training array, in order;
                            features are addressed by their position in
                            this list
    Returns: the fitted pipeline returned by evaluatePipelineModel

    NOTE: X_train, X_test, y_train and y_test are not parameters; they are
    read from the enclosing (notebook/module) scope.
    """
    excluded = np.array(not_features_cols)
    # remove the columns that are not features from the dtype-based lists
    numeric_columns = np.setdiff1d(df.select_dtypes('float').columns.values, excluded)
    categorical_columns = np.setdiff1d(df.select_dtypes('int').columns.values, excluded)

    # Map each training column name to its position so the transformer can
    # select columns by index; columns missing from features_cols are skipped.
    position_of = {name: idx for idx, name in enumerate(features_cols)}
    num_col_indexes = [position_of[c] for c in numeric_columns if c in position_of]
    cat_col_indexes = [position_of[c] for c in categorical_columns if c in position_of]

    # Combine preprocessing steps. The indexes computed above are used here;
    # previously they were discarded and the globals numeric_features /
    # categorical_features were passed instead, so the arguments had no effect.
    preprocessor = make_column_transformer(
        (StandardScaler(), num_col_indexes),
        (OneHotEncoder(handle_unknown='ignore'), cat_col_indexes))

    algo = GradientBoostingRegressor()
    # Create preprocessing and training pipeline
    pipeline = make_pipeline(preprocessor, algo)
    # fit the pipeline to train an algorithm to create a model on the training set
    model = evaluatePipelineModel(pipeline, X_train, X_test, y_train, y_test, algo)
    return model
########################################################################################################################### | |
################### Call the above functions | |
########################################################################################################################### | |
# Columns of bike_data that are identifiers, dates or the target - not features
not_features_cols = ['instant', 'dteday', 'yr', 'day', 'rentals']
# Feature columns, in the order passed to the pipeline builder
features_cols = [
    'season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
    'temp', 'atemp', 'hum', 'windspeed',
]
model = createPipelineAndEvaluate(bike_data, not_features_cols, features_cols)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment