# Altair functions for EDA

import altair as alt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# Simple Altair line plot
def plotGenericLineChart(x_values, x_type, y_values, y_type, chart_title, x_axis_title, y_axis_title):
    """
    This function creates a line chart for two value arrays X and Y passed in along with their types and titles
    """
    df = pd.DataFrame({"x_values": x_values, "y_values": y_values})
    chart = alt.Chart(df, title=chart_title).mark_line(
    ).encode(
        x=alt.X(f"x_values:{x_type}", title=x_axis_title),
        y=alt.Y(f"y_values:{y_type}", title=y_axis_title),
    ).properties(width=600, height=400)
    return chart
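# A minimal usage sketch for plotGenericLineChart; the series below are made-up
# illustration data, not part of the original notebook.
sample_x = list(range(12))
sample_y = [v ** 2 for v in sample_x]
plotGenericLineChart(sample_x, "Q", sample_y, "Q", "Squares by index", "Index", "Square")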
def plotDistributionForColumn(df, colName):
    """
    This function plots a histogram for a column of the df dataframe.
    At the bottom of the histogram, it plots a boxplot.
    Red and orange rules mark the mean and median respectively.
    """
    source = df[[colName]]
    base = alt.Chart(source, title=colName)
    # Histogram of the column values
    B = base.mark_bar(color='green').encode(
        alt.X(f"{colName}:Q", bin=alt.Bin(maxbins=100)),
        y='count()',
    )
    # Vertical rules for the mean (red) and median (orange)
    mean_rule = alt.Chart(source).mark_rule(color='red').encode(
        x=f'mean({colName}):Q'
    )
    median_rule = alt.Chart(source).mark_rule(color='orange').encode(
        x=f'median({colName}):Q'
    )
    box_plot = base.mark_boxplot().encode(
        x=f'{colName}:Q',
    )
    # Layer the rules over the histogram, then stack the boxplot below it
    return (B + mean_rule + median_rule) & box_plot
# Call the function in a cell using IPython display to chart all the columns at the same time
from IPython.display import display

for col in numeric_features + ['rentals']:
    chart = plotDistributionForColumn(bike_data, col)
    display(chart)
def plotCategoricalfeatures(df, colName):
    """
    This function plots a bar graph of value counts for a column of the df dataframe.
    The type of the column is categorical.
    """
    source = df[[colName]]
    base = alt.Chart(source, title=colName)
    B = base.mark_bar(color='#20b2aa').encode(
        alt.X(f"{colName}:N"),
        y='count()',
    )
    return B
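# Usage sketch, mirroring the numeric loop above; assumes categorical_features and
# bike_data are already defined in the notebook.
for col in categorical_features:
    chart = plotCategoricalfeatures(bike_data, col)
    display(chart)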
def createCorrelationScatters(df, colName, targetName):
    """
    This function creates a scatter chart and a regression line between the two numerical columns passed to it.
    It prints the correlation value on the chart as well.
    """
    corr = df[targetName].corr(df[colName])
    source = df
    base = alt.Chart(source)
    chart = base.mark_circle().encode(
        alt.X(f"{colName}:Q"),
        alt.Y(f"{targetName}:Q"),
        color=alt.value("orange")
    ).properties(
        width=300,
        height=150
    )
    # Annotate the Pearson correlation in the top-left corner
    text = base.mark_text(
        align="left", baseline="top"
    ).encode(
        x=alt.value(5),  # pixels from left
        y=alt.value(5),  # pixels from top
        text=alt.value(f"corr: {corr:.3f}"),
    )
    reg_line = chart.transform_regression(colName, targetName).mark_line(color='darkblue').encode(color=alt.value('blue'))
    return chart + text + reg_line
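# Usage sketch: scatter every numeric feature against the target; assumes
# numeric_features and bike_data are already defined in the notebook.
for col in numeric_features:
    chart = createCorrelationScatters(bike_data, col, 'rentals')
    display(chart)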
### Create box plots of the target - facet by values of the categorical columns
def createBarsForTargetPerCategory(df, targetName, catColName):
    source = df  # use the dataframe passed in rather than the global bike_data
    base = alt.Chart(source)
    box_plot = base.mark_boxplot().encode(
        y=f'{targetName}:Q',
    ).facet(
        column=f'{catColName}:N'
    )
    return box_plot
targetName = 'rentals'
for catColName in categorical_features:
    chart = createBarsForTargetPerCategory(bike_data, targetName, catColName)
    display(chart)
def compareTrueAndPredictedLabels(y_true, y_pred, degree=1):
    """
    This function plots a graph (linear - degree 1) that shows the correlation between the true and predicted values of a continuous variable.
    Note that -
    R² gives us a measure of how well the actual outcomes are replicated by the model or the regression line. This is based on the total variation of prediction explained by the model.
    """
    data = {'y_true': y_true, 'y_pred': y_pred}
    df = pd.DataFrame(data=data)
    # Evaluate the model using the test data
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    FIG_WIDTH = 500
    FIG_HEIGHT = 400
    base = alt.Chart(
        df, title='Daily Bike Share Predictions'
    ).properties(
        height=FIG_HEIGHT,
        width=FIG_WIDTH
    )
    s = base.mark_circle(
    ).encode(
        x=alt.X('y_true:Q', title='Actual Labels'),
        y=alt.Y('y_pred:Q', title='Predicted Labels'),
        color=alt.value('red')
    )
    # Create a linear relation (polyfit with the given degree) between true and predicted values
    z = np.polyfit(y_true, y_pred, degree)
    p = np.poly1d(z)
    reg_line = pd.DataFrame({'y_true': y_true, 'y_eval': p(y_true)})
    l = alt.Chart(reg_line).mark_line(
    ).encode(
        x='y_true:Q',
        y='y_eval:Q'
    )
    # Annotate the evaluation metrics in the bottom-right corner
    text = base.mark_text(
        align="right", baseline="bottom"
    ).encode(
        x=alt.value(FIG_WIDTH - 100),  # pixels from left
        y=alt.value(FIG_HEIGHT - 10),  # pixels from top
        text=alt.value(f"mse: {mse:.3f} rmse: {rmse:0.3f} r2: {r2:0.3f}"),
    )
    return s + l + text
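# Usage sketch for compareTrueAndPredictedLabels; assumes X_train, X_test, y_train
# and y_test already exist in the notebook (e.g. from train_test_split).
from sklearn.linear_model import LinearRegression

lin_model = LinearRegression().fit(X_train, y_train)
display(compareTrueAndPredictedLabels(y_test, lin_model.predict(X_test)))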
###################################################################################################################################
##################Assessing models for baseline####################################################################################
def compareTrueAndPredictedLabels(y_true, y_pred, chart_title, degree=1):
    """
    This function plots a graph (linear - degree 1) that shows the correlation between the true and predicted values of a
    continuous variable.
    Note that -
    R² gives us a measure of how well the actual outcomes are replicated by the model or the regression line.
    This is based on the total variation of prediction explained by the model.
    Inputs: y_true - the labeled y test values
            y_pred - predicted values for the (baseline?) model
            chart_title - a LIST of strings that is the (baseline?) model's parameters
    """
    data = {'y_true': y_true, 'y_pred': y_pred}
    df = pd.DataFrame(data=data)
    # Evaluate the model using the test data
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    FIG_WIDTH = 300
    FIG_HEIGHT = 200
    base = alt.Chart(
        df, title=chart_title
    ).properties(
        height=FIG_HEIGHT,
        width=FIG_WIDTH
    )
    s = base.mark_circle(
    ).encode(
        x=alt.X('y_true:Q', title='Actual Labels'),
        y=alt.Y('y_pred:Q', title='Predicted Labels'),
        color=alt.value('red')
    )
    # Create a linear relation (polyfit with the given degree) between true and predicted values
    z = np.polyfit(y_true, y_pred, degree)
    p = np.poly1d(z)
    reg_line = pd.DataFrame({'y_true': y_true, 'y_eval': p(y_true)})
    l = alt.Chart(reg_line).mark_line(
    ).encode(
        x='y_true:Q',
        y='y_eval:Q'
    )
    # Annotate the evaluation metrics in the bottom-right corner
    text = base.mark_text(
        align="right", baseline="bottom", lineBreak='\n'
    ).encode(
        x=alt.value(FIG_WIDTH - 100),  # pixels from left
        y=alt.value(FIG_HEIGHT - 30),  # pixels from top
        text=alt.value(f"mse: {mse:.3f}\nrmse: {rmse:0.3f}\nr2: {r2:0.3f}"),
    )
    return s + l + text
# Find scores for a variety of models with default parameters
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.dummy import DummyRegressor

def evaluateBaseModel(reg_algos, X_train, X_test, y_train, y_test):
    """
    reg_algos - a list of regression algorithms sent in for prediction and evaluation of the model
    """
    for algo in reg_algos:
        model = algo.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        chart = compareTrueAndPredictedLabels(y_test, y_pred, str(algo).split(','))
        display(chart.configure_title(fontSize=8))
#### Create and compare models with default parameters
reg_algos = []
reg_algos.append(DummyRegressor())
reg_algos.append(LinearRegression())
reg_algos.append(DecisionTreeRegressor())
evaluateBaseModel(reg_algos, X_train, X_test, y_train, y_test)
################################################################################################################################
##################USING PIPELINE TO CREATE COLUMN TRANSFORMATION AND THEN RUN A REGRESSION MODEL################################
def evaluatePipelineModel(pipeline, X_train, X_test, y_train, y_test, algo):
    """
    pipeline is a pipeline of transformations, imputations and the algorithm itself that can be fit on the training set.
    algo is the algorithm in the pipeline
    """
    model = pipeline.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    chart = compareTrueAndPredictedLabels(y_test, y_pred, str(algo).split(','))
    display(chart.configure_title(fontSize=8))
    return model
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def createPipelineAndEvaluate(df, not_features_cols, features_cols):
    """
    This function identifies the numeric (continuous values) columns and discrete value columns.
    It then transforms the columns using StandardScaler and OneHotEncoder respectively.
    Then a pipeline is created using make_pipeline to send the transformed dataset to a ML algorithm.
    """
    numeric_columns = df.select_dtypes('float').columns.values
    categorical_columns = df.select_dtypes('int').columns.values
    # Remove the columns that are not features from the above lists
    categorical_columns = np.setdiff1d(categorical_columns, np.array(not_features_cols))
    numeric_columns = np.setdiff1d(numeric_columns, np.array(not_features_cols))
    # These are the columns of our training array
    X_cols = features_cols
    # Find the numerical indexes of the columns in the ndarray corresponding to the floating-point
    # columns that have to be scaled and the categorical columns that have to be one-hot encoded
    # (useful if X_train is a plain ndarray rather than a DataFrame)
    num_col_indexes = [i for num_col in numeric_columns for i, x in enumerate(X_cols) if x == num_col]
    cat_col_indexes = [i for cat_col in categorical_columns for i, x in enumerate(X_cols) if x == cat_col]
    # Define the scaling for the numeric columns and one-hot encoding for the categorical
    # columns, then combine the preprocessing steps
    preprocessor = make_column_transformer(
        (StandardScaler(), numeric_columns),
        (OneHotEncoder(handle_unknown='ignore'), categorical_columns))
    algo = GradientBoostingRegressor()
    # Create preprocessing and training pipeline
    pipeline = make_pipeline(preprocessor, algo)
    # Fit the pipeline to train an algorithm to create a model on the training set
    model = evaluatePipelineModel(pipeline, X_train, X_test, y_train, y_test, algo)
    return model
###########################################################################################################################
################### Call the above functions
###########################################################################################################################
not_features_cols = ['instant', 'dteday', 'yr', 'day', 'rentals']
features_cols = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
model = createPipelineAndEvaluate(bike_data, not_features_cols, features_cols)