# Altair functions for EDA

import altair as alt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# Simple Altair line plot
def plotGenericLineChart(x_values, x_type, y_values, y_type, chart_title, x_axis_title, y_axis_title):
    """
    This function creates a line chart for two value arrays X and Y passed in along with their types and titles
    """
    df = pd.DataFrame({"x_values": x_values, "y_values": y_values})
    chart = alt.Chart(df, title=chart_title).mark_line(
    ).encode(
        x=alt.X(f"x_values:{x_type}", title=x_axis_title),
        y=alt.Y(f"y_values:{y_type}", title=y_axis_title),
    ).properties(width=600, height=400)
    return chart
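# A minimal usage sketch for plotGenericLineChart; the series below are made-up
# illustration data, not part of the original notebook.
sample_x = list(range(12))
sample_y = [v ** 2 for v in sample_x]
plotGenericLineChart(sample_x, "Q", sample_y, "Q", "Squares by index", "Index", "Square")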
def plotDistributionForColumn(df, colName):
    """
    This function plots a histogram for a column of the df dataframe.
    At the bottom of the histogram, it plots a boxplot.
    Red and orange rules mark the mean and median respectively.
    """
    source = df[[colName]]
    base = alt.Chart(source, title=colName)
    # Histogram of the column values
    B = base.mark_bar(color='green').encode(
        alt.X(f"{colName}:Q", bin=alt.Bin(maxbins=100)),
        y='count()',
    )
    # Vertical rules for the mean (red) and median (orange)
    mean_rule = alt.Chart(source).mark_rule(color='red').encode(
        x=f'mean({colName}):Q'
    )
    median_rule = alt.Chart(source).mark_rule(color='orange').encode(
        x=f'median({colName}):Q'
    )
    box_plot = base.mark_boxplot().encode(
        x=f'{colName}:Q',
    )
    # Layer the rules over the histogram, then stack the boxplot below it
    return (B + mean_rule + median_rule) & box_plot
# Call the function in a cell using IPython display to chart all the columns at the same time
from IPython.display import display

for col in numeric_features + ['rentals']:
    chart = plotDistributionForColumn(bike_data, col)
    display(chart)
def plotCategoricalfeatures(df, colName):
    """
    This function plots a bar graph of value counts for a column of the df dataframe.
    The type of the column is categorical.
    """
    source = df[[colName]]
    base = alt.Chart(source, title=colName)
    B = base.mark_bar(color='#20b2aa').encode(
        alt.X(f"{colName}:N"),
        y='count()',
    )
    return B
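# Usage sketch, mirroring the numeric loop above; assumes categorical_features and
# bike_data are already defined in the notebook.
for col in categorical_features:
    chart = plotCategoricalfeatures(bike_data, col)
    display(chart)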
def createCorrelationScatters(df, colName, targetName):
    """
    This function creates a scatter chart and a regression line between the two numerical columns passed to it.
    It prints the correlation value on the chart as well.
    """
    corr = df[targetName].corr(df[colName])
    source = df
    base = alt.Chart(source)
    chart = base.mark_circle().encode(
        alt.X(f"{colName}:Q"),
        alt.Y(f"{targetName}:Q"),
        color=alt.value("orange")
    ).properties(
        width=300,
        height=150
    )
    # Annotate the Pearson correlation in the top-left corner
    text = base.mark_text(
        align="left", baseline="top"
    ).encode(
        x=alt.value(5),  # pixels from left
        y=alt.value(5),  # pixels from top
        text=alt.value(f"corr: {corr:.3f}"),
    )
    reg_line = chart.transform_regression(colName, targetName).mark_line(color='darkblue').encode(color=alt.value('blue'))
    return chart + text + reg_line
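# Usage sketch: scatter every numeric feature against the target; assumes
# numeric_features and bike_data are already defined in the notebook.
for col in numeric_features:
    chart = createCorrelationScatters(bike_data, col, 'rentals')
    display(chart)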
### Create box plots of the target - facet by values of the categorical columns
def createBarsForTargetPerCategory(df, targetName, catColName):
    source = df  # use the dataframe passed in rather than the global bike_data
    base = alt.Chart(source)
    box_plot = base.mark_boxplot().encode(
        y=f'{targetName}:Q',
    ).facet(
        column=f'{catColName}:N'
    )
    return box_plot
targetName = 'rentals'
for catColName in categorical_features:
    chart = createBarsForTargetPerCategory(bike_data, targetName, catColName)
    display(chart)
def compareTrueAndPredictedLabels(y_true, y_pred, degree=1):
    """
    This function plots a graph (linear - degree 1) that shows the correlation between the true and predicted values of a continuous variable.
    Note that -
    R² gives us a measure of how well the actual outcomes are replicated by the model or the regression line. This is based on the total variation of prediction explained by the model.
    """
    data = {'y_true': y_true, 'y_pred': y_pred}
    df = pd.DataFrame(data=data)
    # Evaluate the model using the test data
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    FIG_WIDTH = 500
    FIG_HEIGHT = 400
    base = alt.Chart(
        df, title='Daily Bike Share Predictions'
    ).properties(
        height=FIG_HEIGHT,
        width=FIG_WIDTH
    )
    s = base.mark_circle(
    ).encode(
        x=alt.X('y_true:Q', title='Actual Labels'),
        y=alt.Y('y_pred:Q', title='Predicted Labels'),
        color=alt.value('red')
    )
    # Create a linear relation (polyfit with the given degree) between true and predicted values
    z = np.polyfit(y_true, y_pred, degree)
    p = np.poly1d(z)
    reg_line = pd.DataFrame({'y_true': y_true, 'y_eval': p(y_true)})
    l = alt.Chart(reg_line).mark_line(
    ).encode(
        x='y_true:Q',
        y='y_eval:Q'
    )
    # Annotate the evaluation metrics in the bottom-right corner
    text = base.mark_text(
        align="right", baseline="bottom"
    ).encode(
        x=alt.value(FIG_WIDTH - 100),  # pixels from left
        y=alt.value(FIG_HEIGHT - 10),  # pixels from top
        text=alt.value(f"mse: {mse:.3f} rmse: {rmse:0.3f} r2: {r2:0.3f}"),
    )
    return s + l + text
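# Usage sketch for compareTrueAndPredictedLabels; assumes X_train, X_test, y_train
# and y_test already exist in the notebook (e.g. from train_test_split).
from sklearn.linear_model import LinearRegression

lin_model = LinearRegression().fit(X_train, y_train)
display(compareTrueAndPredictedLabels(y_test, lin_model.predict(X_test)))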
###################################################################################################################################
##################Assessing models for baseline####################################################################################
def compareTrueAndPredictedLabels(y_true, y_pred, chart_title, degree=1):
    """
    This function plots a graph (linear - degree 1) that shows the correlation between the true and predicted values of a
    continuous variable.
    Note that -
    R² gives us a measure of how well the actual outcomes are replicated by the model or the regression line.
    This is based on the total variation of prediction explained by the model.
    Inputs: y_true - the labeled y test values
            y_pred - predicted values for the (baseline?) model
            chart_title - a LIST of strings that is the (baseline?) model's parameters
    """
    data = {'y_true': y_true, 'y_pred': y_pred}
    df = pd.DataFrame(data=data)
    # Evaluate the model using the test data
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    FIG_WIDTH = 300
    FIG_HEIGHT = 200
    base = alt.Chart(
        df, title=chart_title
    ).properties(
        height=FIG_HEIGHT,
        width=FIG_WIDTH
    )
    s = base.mark_circle(
    ).encode(
        x=alt.X('y_true:Q', title='Actual Labels'),
        y=alt.Y('y_pred:Q', title='Predicted Labels'),
        color=alt.value('red')
    )
    # Create a linear relation (polyfit with the given degree) between true and predicted values
    z = np.polyfit(y_true, y_pred, degree)
    p = np.poly1d(z)
    reg_line = pd.DataFrame({'y_true': y_true, 'y_eval': p(y_true)})
    l = alt.Chart(reg_line).mark_line(
    ).encode(
        x='y_true:Q',
        y='y_eval:Q'
    )
    # Annotate the evaluation metrics in the bottom-right corner
    text = base.mark_text(
        align="right", baseline="bottom", lineBreak='\n'
    ).encode(
        x=alt.value(FIG_WIDTH - 100),  # pixels from left
        y=alt.value(FIG_HEIGHT - 30),  # pixels from top
        text=alt.value(f"mse: {mse:.3f}\nrmse: {rmse:0.3f}\nr2: {r2:0.3f}"),
    )
    return s + l + text
# Find scores for a variety of models with default parameters
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor

def evaluateBaseModel(reg_algos, X_train, X_test, y_train, y_test):
    """
    reg_algos - a list of regression algorithms sent in for prediction and evaluation of the model
    """
    for algo in reg_algos:
        model = algo.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        chart = compareTrueAndPredictedLabels(y_test, y_pred, str(algo).split(','))
        display(chart.configure_title(fontSize=8))
#### Create and compare models with default parameters
reg_algos = []
reg_algos.append(DummyRegressor())
reg_algos.append(LinearRegression())
reg_algos.append(DecisionTreeRegressor())
evaluateBaseModel(reg_algos, X_train, X_test, y_train, y_test)
################################################################################################################################
##################USING PIPELINE TO CREATE COLUMN TRANSFORMATION AND THEN RUN A REGRESSION MODEL################################
def evaluatePipelineModel(pipeline, X_train, X_test, y_train, y_test, algo):
    """
    pipeline is a pipeline of transformations, imputations and the algorithm itself that can be fit on the training set.
    algo is the algorithm in the pipeline
    """
    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    chart = compareTrueAndPredictedLabels(y_test, y_pred, str(algo).split(','))
    display(chart.configure_title(fontSize=8))
    return model
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def createPipelineAndEvaluate(df, not_features_cols, features_cols):
    """
    This function identifies the numeric (continuous values) columns and discrete value columns.
    It then transforms the columns using StandardScaler and OneHotEncoder respectively.
    Then a pipeline is created using make_pipeline to send the transformed dataset to a ML algorithm.
    """
    numeric_columns = df.select_dtypes('float').columns.values
    categorical_columns = df.select_dtypes('int').columns.values
    # Remove the columns that are not features from the above lists
    categorical_columns = np.setdiff1d(categorical_columns, np.array(not_features_cols))
    numeric_columns = np.setdiff1d(numeric_columns, np.array(not_features_cols))
    # These are the columns of our training array
    X_cols = features_cols
    # Find the numerical indexes of the columns in the ndarray corresponding to the floating-point
    # columns that have to be scaled and the categorical columns that have to be one-hot encoded
    # (useful if X_train is a plain ndarray rather than a DataFrame)
    num_col_indexes = [i for num_col in numeric_columns for i, x in enumerate(X_cols) if x == num_col]
    cat_col_indexes = [i for cat_col in categorical_columns for i, x in enumerate(X_cols) if x == cat_col]
    # Define the scaling for the numeric columns and one-hot encoding for the categorical
    # columns, then combine the preprocessing steps
    preprocessor = make_column_transformer(
        (StandardScaler(), numeric_columns),
        (OneHotEncoder(handle_unknown='ignore'), categorical_columns))
    algo = GradientBoostingRegressor()
    # Create preprocessing and training pipeline
    pipeline = make_pipeline(preprocessor, algo)
    # Fit the pipeline to train an algorithm to create a model on the training set
    model = evaluatePipelineModel(pipeline, X_train, X_test, y_train, y_test, algo)
    return model
###########################################################################################################################
################### Call the above functions
###########################################################################################################################
not_features_cols = ['instant', 'dteday', 'yr', 'day', 'rentals']
features_cols = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
model = createPipelineAndEvaluate(bike_data, not_features_cols, features_cols)