George Vyshnya (@gvyshnya)

#!/usr/bin/python
import pandas as pd
import numpy as np
import fbprophet as fbpro
import sklearn.metrics as skm
import math
import datetime as dt

class ProphetModeller(object):
    """Wrapper around fbprophet for fitting and forecasting (body truncated in the gist preview)."""
    pass
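The class body isn't shown in the preview. As a rough, hedged sketch of the fbprophet workflow such a modeller presumably wraps (Prophet, fit, make_future_dataframe, and predict are the library's public API; the toy frame is an assumption):

# Hypothetical sketch of the fbprophet calls the class presumably wraps
import pandas as pd
import fbprophet as fbpro

history = pd.DataFrame({
    'ds': pd.date_range('2017-01-01', periods=90, freq='D'),  # Prophet requires a 'ds' date column
    'y': range(90),                                           # and a 'y' target column
})
model = fbpro.Prophet()
model.fit(history)
future = model.make_future_dataframe(periods=30)  # extend 30 days past the training window
forecast = model.predict(future)                  # yields yhat, yhat_lower, yhat_upper per row
print(forecast[['ds', 'yhat']].tail())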
@gvyshnya
gvyshnya / preprocessing.py
Created December 8, 2017 11:26
Basic pre-processing and feature engineering script for Recruit Restaurant Visitor Forecasting contest
# Project/Contest: Recruit Restaurant Visitor Forecasting (https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting)
#
# Summary: this is a basic pre-processing and feature engineering script to transform the original input data
# from the customer into ready-for-modelling training and testing sets
#
# Inspirations:
# - https://www.kaggle.com/the1owl/surprise-me/
import numpy as np
import pandas as pd
@gvyshnya
gvyshnya / sparsity.py
Created December 17, 2017 09:28
The code to test the sparsity of your data input
import numpy as np

def sparsity_ratio(X):
    # Fraction of zero-valued cells in a 2-D array
    return 1.0 - np.count_nonzero(X) / float(X.shape[0] * X.shape[1])

print("input sparsity ratio:", sparsity_ratio(X))
@gvyshnya
gvyshnya / gbm_model_sparse.py
Created December 17, 2017 09:33
GBM prediction model, sparse data input
# GBM prediction
import numpy as np
import pandas as pd
from sklearn import *  # star-import exposes the sklearn submodules, including metrics
import datetime as dt

def RMSLE(y, pred):
    # RMSE on the inputs; equals RMSLE when y and pred are already log1p-transformed
    return metrics.mean_squared_error(y, pred) ** 0.5
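Neither preview shows the model itself. A minimal sketch of what the gist title describes, assuming scikit-learn's GradientBoostingRegressor (which accepts scipy.sparse CSR input for fit and predict) and a log1p-scaled target; both the data and the hyperparameters are assumptions:

# Hypothetical sketch: GBM on sparse input, scored with the RMSLE helper above
import numpy as np
import scipy.sparse as sp
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

X = sp.random(500, 40, density=0.05, format='csr', random_state=0)  # toy sparse feature matrix
y = np.log1p(np.abs(X.sum(axis=1)).A1)                              # stand-in log1p target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=0)
gbm.fit(X_train, y_train)
print('RMSLE:', RMSLE(y_test, gbm.predict(X_test)))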
@gvyshnya
gvyshnya / gbm_model_dense.py
Created December 17, 2017 09:36
GBM prediction model, dense input
# GBM prediction
import numpy as np
import pandas as pd
from sklearn import *  # star-import exposes the sklearn submodules, including metrics
import datetime as dt

def RMSLE(y, pred):
    # Identical helper to the sparse variant: RMSE, i.e. RMSLE for log1p-scaled targets
    return metrics.mean_squared_error(y, pred) ** 0.5
import numpy as np
import pandas as pd
import pdpipe as pdp

# ... data reading code goes here

# set up a transformation pipeline
pipeline_1 = pdp.ApplyByCols(
    ['lat', 'lon', 'lat_inspection_location', 'lon_inspection_location'],
    lambda col: pd.to_numeric(col)
)
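A pdpipe stage can then be applied to a frame via its apply method. Hypothetical usage with a toy frame matching the column names above:

# Hypothetical usage of the stage defined above
import pandas as pd

df = pd.DataFrame({
    'lat': ['41.88', '41.79'],
    'lon': ['-87.63', '-87.60'],
    'lat_inspection_location': ['41.88', '41.79'],
    'lon_inspection_location': ['-87.63', '-87.60'],
})
df = pipeline_1.apply(df)
print(df.dtypes)  # the four coordinate columns should now be numeric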
@gvyshnya
gvyshnya / Datetime-related feature engineering
Created July 7, 2020 08:25
Demo of datetime-related and trend/lag feature engineering (with COVID-19 pandemic data as a case study); see the sketch below
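The code preview that follows focuses on feature ranking; as a hedged sketch of the datetime and lag/trend features the description refers to (the column names, lags, and window sizes are illustrative assumptions):

# Hypothetical sketch of datetime + lag/trend features on a daily time series
import pandas as pd

df = pd.DataFrame({
    'date': pd.date_range('2020-03-01', periods=60, freq='D'),
    'new_cases': range(60),
})

# Datetime-derived features
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['is_weekend'] = (df['date'].dt.dayofweek >= 5).astype(int)

# Lag and rolling-trend features
for lag in (1, 7, 14):
    df[f'new_cases_lag_{lag}'] = df['new_cases'].shift(lag)
df['new_cases_7d_mean'] = df['new_cases'].rolling(7).mean()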
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Define dictionary to store our rankings
ranks = {}

# Create our function which scales the raw feature rankings to [0, 1]
def ranking(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order * np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks))
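A hedged example of how the helper is typically paired with RFE (the estimator and synthetic data are assumptions):

# Hypothetical usage: rank features with RFE, then normalize with ranking()
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression

X_arr, y = make_regression(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X_arr, columns=[f'f{i}' for i in range(10)])

rfe = RFE(LinearRegression(), n_features_to_select=1)
rfe.fit(X, y)
# RFE ranks the best feature 1, so invert the order for the [0, 1] scale
ranks['RFE'] = ranking(list(map(float, rfe.ranking_)), X.columns, order=-1)
print(ranks['RFE'])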
from sklearn.feature_selection import SelectFromModel

# 'modeller' is an importance-exposing estimator, defined earlier
embeded_rf_selector = SelectFromModel(modeller, max_features=200)
embeded_rf_selector.fit(X, y)
embeded_rf_support = embeded_rf_selector.get_support()
embeded_rf_feature = X.loc[:, embeded_rf_support].columns.tolist()
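modeller is not defined in the preview; given the _rf_ naming it is presumably a random forest. A minimal hypothetical setup:

# Hypothetical definition of the estimator handed to SelectFromModel above
from sklearn.ensemble import RandomForestClassifier

modeller = RandomForestClassifier(n_estimators=100, random_state=0)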
from sklearn.inspection import permutation_importance
import pandas as pd

# Here's how you use permutation importance
def get_permutation_importance(X, y, model) -> pd.DataFrame:
    result = permutation_importance(model, X, y, n_repeats=1,
                                    random_state=0)
    # collect the permutation importance results, one row per feature
    result_df = pd.DataFrame(X.columns, columns=['Feature'])
    result_df['permutation_importance'] = result.importances_mean
    return result_df
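A hedged end-to-end usage of the helper; permutation_importance shuffles columns against an already-fitted estimator, so the model is trained first (the data and model below are assumptions):

# Hypothetical usage of get_permutation_importance()
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X_arr, y = make_classification(n_samples=300, n_features=8, random_state=0)
X = pd.DataFrame(X_arr, columns=[f'f{i}' for i in range(8)])

model = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
importance_df = get_permutation_importance(X, y, model)
print(importance_df.sort_values('permutation_importance', ascending=False))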