devanshuDesai · November 9, 2019 04:19
diff --git a/pipeline_dodgers.py b/pipeline_dodgers.py
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import FunctionTransformer
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split
 from sklearn.model_selection import GridSearchCV
 from sklearn.model_selection import StratifiedKFold
 from sklearn.pipeline import Pipeline
 from sklearn.decomposition import PCA
 from sklearn.linear_model import LogisticRegression

 import os
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
 import warnings

 # Suppress all warnings for the rest of the run and switch matplotlib to the
 # 'ggplot' style.
 warnings.filterwarnings("ignore")
 plt.style.use('ggplot')

 # Load the raw data; the relative path is resolved against the working directory.
 bike = pd.read_csv('bikeshare.csv')

 # Hold out 20% of the rows as a test set.
 # NOTE(review): X and y are not defined anywhere in the visible source --
 # presumably they are derived from `bike`; confirm before running.
 # NOTE(review): no random_state is passed, so the split differs between runs.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

 def transform_time(X):
     """Map every datetime-like cell of ``X`` to a POSIX timestamp in seconds.

     ``X`` is wrapped in a DataFrame, each column is parsed with
     ``pd.to_datetime`` and every parsed value is replaced by the float
     returned from its ``timestamp()`` method. The result is a DataFrame
     with the same rows and columns as the wrapped input.
     """
     def _column_to_seconds(column):
         parsed = pd.to_datetime(column)
         return parsed.apply(lambda moment: moment.timestamp())

     frame = pd.DataFrame(X)
     return frame.apply(_column_to_seconds)

 def as_is(df):
     """Identity transform: hand back the very object that was passed in (no copy)."""
     return df
    
 """
 Handling Categorical Data - Cardinal / Ordinal
 """
 # Baseline preprocessing: 'starttime' goes through transform_time, 'usertype'
 # and 'gender' are one-hot encoded, the four station coordinates plus
 # 'birth year' and 'bikeid' pass through as_is, and every other column is
 # dropped (remainder='drop').
 col_base = ColumnTransformer([('convert_time', FunctionTransformer(transform_time, 
                                                                    validate=False), 
                               ['starttime']),
                              ('ohe', OneHotEncoder(), ['usertype', 'gender']), 
                              ('as_is', FunctionTransformer(as_is, validate=True), 
                               ['start station latitude', 'start station longitude', 
                               'end station latitude', 'end station longitude', 
                                'birth year', 'bikeid'])], 
                             remainder='drop')
                             
 # Baseline model: the transformer above followed by a LogisticRegression with
 # default settings, fitted on the training split.
 base_pipe = Pipeline([('transform', col_base), ('model', LogisticRegression())])
 base_pipe.fit(X_train, y_train);

 """
 Feature Engineering
 """

 def get_time_features(vals):
     """Derive calendar features from the first column of ``vals``.

     Parameters
     ----------
     vals : pandas.DataFrame
         Only the first column (``vals.iloc[:, 0]``) is read; it is parsed
         with ``pd.to_datetime``.

     Returns
     -------
     pandas.DataFrame
         Four columns, in order: day of week (Monday=0), month, hour of day,
         and a 0/1 weekday flag (1 when day of week < 5).
     """
     stamps = pd.to_datetime(vals.iloc[:, 0])
     dow = stamps.dt.dayofweek
     month = stamps.dt.month
     hour = stamps.dt.hour
     # Vectorised comparison instead of a per-element lambda.
     weekday = (dow < 5).astype(int)
     # No "workhour" column is returned: the output stays at four columns so
     # anything already fitted on it keeps working. Append it to the list
     # below if that feature is wanted.
     return pd.concat([dow, month, hour, weekday], axis=1)

 def get_hav_distance(coords):
     """Return the Haversine (great-circle) distance in km for each row.

     Parameters
     ----------
     coords : array-like with four columns
         Read positionally as start latitude, start longitude, end latitude
         and end longitude, all in degrees.

     Returns
     -------
     pandas.DataFrame
         A single column with one distance per input row, computed with an
         Earth radius of 6373 km.
     """
     coords = pd.DataFrame(coords)
     # approximate radius of earth in km
     R = 6373.0
     s_lat = np.deg2rad(coords.iloc[:, 0])
     s_lng = np.deg2rad(coords.iloc[:, 1])
     e_lat = np.deg2rad(coords.iloc[:, 2])
     e_lng = np.deg2rad(coords.iloc[:, 3])

     # The parentheses keep both terms in ONE expression. A continuation line
     # that merely starts with "+" is a separate statement whose value is
     # thrown away, which would drop the longitude term entirely.
     d = (np.sin((e_lat - s_lat) / 2) ** 2
          + np.cos(s_lat) * np.cos(e_lat) * np.sin((e_lng - s_lng) / 2) ** 2)
     # Rounding can push d a hair outside [0, 1]; clip so arcsin never yields NaN.
     d = d.clip(lower=0.0, upper=1.0)
     out = 2 * R * np.arcsin(np.sqrt(d))
     return pd.DataFrame(out)

 def get_age(vals, reference_year=2019):
     """Convert birth years to ages relative to ``reference_year``.

     Parameters
     ----------
     vals : array-like
         Birth years; wrapped in a DataFrame before the subtraction.
     reference_year : int, optional
         Year the age is measured against. Defaults to 2019. When the
         function is used through ``FunctionTransformer`` a different year
         can be supplied via ``kw_args``.

     Returns
     -------
     pandas.DataFrame
         ``reference_year - vals`` with the same shape as the wrapped input.
     """
     return reference_year - pd.DataFrame(vals)

 # Distance feature: get_hav_distance is applied to the selected coordinate
 # columns and its output is then standardised with StandardScaler.
 dist = Pipeline([('calc_dist', FunctionTransformer(get_hav_distance)),
                 ('standardize', StandardScaler())])

 # Engineered feature set, one transformer per column group:
 #   'starttime'               -> get_time_features
 #   'usertype', 'gender'      -> OneHotEncoder
 #   four station coordinates  -> the `dist` pipeline above
 #   'birth year'              -> get_age
 # No `remainder` argument is passed, so columns not listed here (e.g.
 # 'bikeid') get ColumnTransformer's default handling.
 col = ColumnTransformer([('convert_date', 
        FunctionTransformer(get_time_features, validate=False), ['starttime']),
        ('ohe', OneHotEncoder(), 
         ['usertype', 'gender']),
        ('distance', dist, 
         ['start station latitude', 'start station longitude',
        'end station latitude', 'end station longitude']),
        ('convert_age', FunctionTransformer(get_age), 
         ['birth year'])])

 """
 Using PCA to reduce dimensionality and produce more generalizable model
 """
 # Engineered features followed by PCA. With n_components='mle' and
 # svd_solver='full', PCA picks the number of components itself (see the
 # scikit-learn PCA documentation).
 reduce = Pipeline([('transform', col), 
            ('pca', PCA(n_components = 'mle', svd_solver = 'full'))])
        
        
 """
 Using 3-fold cross validation to find best parameters for logistic regression
 """
 # Search space: L1 and L2 penalties, C in {1e-4, 1, 1e4} (np.logspace(-4, 4, 3)),
 # always with the liblinear solver -- six candidates in total.
 lg_param_grid = {'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 3),
              'solver' : ['liblinear']}

 # Three stratified folds; no shuffling is requested.
 kfold = StratifiedKFold(n_splits=3)

 # Accuracy-scored grid search over a bare LogisticRegression, using 4 jobs.
 lr = GridSearchCV(LogisticRegression(),
                     param_grid = lg_param_grid, 
                     scoring="accuracy", cv=kfold,
                     n_jobs=4, verbose=1)

 # NOTE(review): `reduce` is fitted on all of X_train before the search, so its
 # steps have already seen every fold's validation rows. Searching over a
 # Pipeline that contains `reduce` would keep preprocessing inside the folds.
 lr.fit(reduce.fit_transform(X_train), y_train)

 # Estimator with the best cross-validated accuracy, as exposed by GridSearchCV.
 clf_best = lr.best_estimator_
 print(clf_best)
           
 # Final wrapper for all transformations + best model.
 # Both steps were fitted above; final_pipe.fit is not called in the visible source.
 final_pipe = Pipeline([('reduce', reduce), ('model', clf_best)])
	from sklearn.compose import ColumnTransformer
	from sklearn.preprocessing import FunctionTransformer
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.preprocessing import StandardScaler
	from sklearn.model_selection import train_test_split
	from sklearn.model_selection import GridSearchCV
	from sklearn.model_selection import StratifiedKFold
	from sklearn.pipeline import Pipeline
	from sklearn.decomposition import PCA
	from sklearn.linear_model import LogisticRegression

	import os
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import warnings

	# Suppress all warnings for the rest of the run and switch matplotlib to the
	# 'ggplot' style.
	warnings.filterwarnings("ignore")
	plt.style.use('ggplot')

	# Load the raw data; the relative path is resolved against the working directory.
	bike = pd.read_csv('bikeshare.csv')

	# Hold out 20% of the rows as a test set.
	# NOTE(review): X and y are not defined anywhere in the visible source --
	# presumably they are derived from `bike`; confirm before running.
	# NOTE(review): no random_state is passed, so the split differs between runs.
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

	def transform_time(X):
	    """Map every datetime-like cell of ``X`` to a POSIX timestamp in seconds.

	    ``X`` is wrapped in a DataFrame, each column is parsed with
	    ``pd.to_datetime`` and every parsed value is replaced by the float
	    returned from its ``timestamp()`` method. The result is a DataFrame
	    with the same rows and columns as the wrapped input.
	    """
	    df = pd.DataFrame(X)
	    return df.apply(lambda col: pd.to_datetime(col).apply(lambda ts: ts.timestamp()))

	def as_is(df):
	    """Identity transform: hand back the very object that was passed in (no copy)."""
	    return df

	"""
	Handling Categorical Data - Cardinal / Ordinal
	"""
	# Baseline preprocessing: 'starttime' goes through transform_time, 'usertype'
	# and 'gender' are one-hot encoded, the four station coordinates plus
	# 'birth year' and 'bikeid' pass through as_is, and every other column is
	# dropped (remainder='drop').
	col_base = ColumnTransformer([('convert_time', FunctionTransformer(transform_time,
	validate=False),
	['starttime']),
	('ohe', OneHotEncoder(), ['usertype', 'gender']),
	('as_is', FunctionTransformer(as_is, validate=True),
	['start station latitude', 'start station longitude',
	'end station latitude', 'end station longitude',
	'birth year', 'bikeid'])],
	remainder='drop')

	# Baseline model: the transformer above followed by a LogisticRegression with
	# default settings, fitted on the training split.
	base_pipe = Pipeline([('transform', col_base), ('model', LogisticRegression())])
	base_pipe.fit(X_train, y_train);

	"""
	Feature Engineering
	"""

	def get_time_features(vals):
	    """Derive calendar features from the first column of ``vals``.

	    Parameters
	    ----------
	    vals : pandas.DataFrame
	        Only the first column (``vals.iloc[:, 0]``) is read; it is parsed
	        with ``pd.to_datetime``.

	    Returns
	    -------
	    pandas.DataFrame
	        Four columns, in order: day of week (Monday=0), month, hour of day,
	        and a 0/1 weekday flag (1 when day of week < 5).
	    """
	    stamps = pd.to_datetime(vals.iloc[:, 0])
	    dow = stamps.dt.dayofweek
	    month = stamps.dt.month
	    hour = stamps.dt.hour
	    # Vectorised comparison instead of a per-element lambda.
	    weekday = (dow < 5).astype(int)
	    # No "workhour" column is returned: the output stays at four columns so
	    # anything already fitted on it keeps working. Append it to the list
	    # below if that feature is wanted.
	    return pd.concat([dow, month, hour, weekday], axis=1)

	def get_hav_distance(coords):
	    """Return the Haversine (great-circle) distance in km for each row.

	    Parameters
	    ----------
	    coords : array-like with four columns
	        Read positionally as start latitude, start longitude, end latitude
	        and end longitude, all in degrees.

	    Returns
	    -------
	    pandas.DataFrame
	        A single column with one distance per input row, computed with an
	        Earth radius of 6373 km.
	    """
	    coords = pd.DataFrame(coords)
	    # approximate radius of earth in km
	    R = 6373.0
	    s_lat = np.deg2rad(coords.iloc[:, 0])
	    s_lng = np.deg2rad(coords.iloc[:, 1])
	    e_lat = np.deg2rad(coords.iloc[:, 2])
	    e_lng = np.deg2rad(coords.iloc[:, 3])

	    # The parentheses keep both terms in ONE expression. A continuation line
	    # that merely starts with "+" is a separate statement whose value is
	    # thrown away, which would drop the longitude term entirely.
	    d = (np.sin((e_lat - s_lat) / 2) ** 2
	         + np.cos(s_lat) * np.cos(e_lat) * np.sin((e_lng - s_lng) / 2) ** 2)
	    # Rounding can push d a hair outside [0, 1]; clip so arcsin never yields NaN.
	    d = d.clip(lower=0.0, upper=1.0)
	    out = 2 * R * np.arcsin(np.sqrt(d))
	    return pd.DataFrame(out)

	def get_age(vals, reference_year=2019):
	    """Convert birth years to ages relative to ``reference_year``.

	    Parameters
	    ----------
	    vals : array-like
	        Birth years; wrapped in a DataFrame before the subtraction.
	    reference_year : int, optional
	        Year the age is measured against. Defaults to 2019. When the
	        function is used through ``FunctionTransformer`` a different year
	        can be supplied via ``kw_args``.

	    Returns
	    -------
	    pandas.DataFrame
	        ``reference_year - vals`` with the same shape as the wrapped input.
	    """
	    return reference_year - pd.DataFrame(vals)

	# Distance feature: get_hav_distance is applied to the selected coordinate
	# columns and its output is then standardised with StandardScaler.
	dist = Pipeline([('calc_dist', FunctionTransformer(get_hav_distance)),
	('standardize', StandardScaler())])

	# Engineered feature set, one transformer per column group:
	#   'starttime'               -> get_time_features
	#   'usertype', 'gender'      -> OneHotEncoder
	#   four station coordinates  -> the `dist` pipeline above
	#   'birth year'              -> get_age
	# No `remainder` argument is passed, so columns not listed here (e.g.
	# 'bikeid') get ColumnTransformer's default handling.
	col = ColumnTransformer([('convert_date',
	FunctionTransformer(get_time_features, validate=False), ['starttime']),
	('ohe', OneHotEncoder(),
	['usertype', 'gender']),
	('distance', dist,
	['start station latitude', 'start station longitude',
	'end station latitude', 'end station longitude']),
	('convert_age', FunctionTransformer(get_age),
	['birth year'])])

	"""
	Using PCA to reduce dimensionality and produce more generalizable model
	"""
	# Engineered features followed by PCA. With n_components='mle' and
	# svd_solver='full', PCA picks the number of components itself (see the
	# scikit-learn PCA documentation).
	reduce = Pipeline([('transform', col),
	('pca', PCA(n_components = 'mle', svd_solver = 'full'))])


	"""
	Using 3-fold cross validation to find best parameters for logistic regression
	"""
	# Search space: L1 and L2 penalties, C in {1e-4, 1, 1e4} (np.logspace(-4, 4, 3)),
	# always with the liblinear solver -- six candidates in total.
	lg_param_grid = {'penalty' : ['l1', 'l2'],
	'C' : np.logspace(-4, 4, 3),
	'solver' : ['liblinear']}

	# Three stratified folds; no shuffling is requested.
	kfold = StratifiedKFold(n_splits=3)

	# Accuracy-scored grid search over a bare LogisticRegression, using 4 jobs.
	lr = GridSearchCV(LogisticRegression(),
	param_grid = lg_param_grid,
	scoring="accuracy", cv=kfold,
	n_jobs=4, verbose=1)

	# NOTE(review): `reduce` is fitted on all of X_train before the search, so its
	# steps have already seen every fold's validation rows. Searching over a
	# Pipeline that contains `reduce` would keep preprocessing inside the folds.
	lr.fit(reduce.fit_transform(X_train), y_train)

	# Estimator with the best cross-validated accuracy, as exposed by GridSearchCV.
	clf_best = lr.best_estimator_
	print(clf_best)

	# Final wrapper for all transformations + best model.
	# Both steps were fitted above; final_pipe.fit is not called in the visible source.
	final_pipe = Pipeline([('reduce', reduce), ('model', clf_best)])