# Impute zero for missing values in the target feature:
train_df['totals.transactionRevenue'].fillna(0, inplace=True)
# Cast the mobile-device flag to a proper boolean dtype in both frames.
train_df['device.isMobile'] = train_df['device.isMobile'].astype(bool)
test_df['device.isMobile'] = test_df['device.isMobile'].astype(bool)
numeric_feat = ['visitNumber',
                'visitStartTime',
                'totals.hits',
                'totals.pageviews',
                'totals.timeOnSite',
                'totals.transactions',
                'totals.transactionRevenue']

# Impute zero for missing values in all numeric features.
for col in numeric_feat:
    train_df[col].fillna(0, inplace=True)
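# Note (assumption, not in the original gist): the totals.* columns in this
# dataset are parsed from JSON fields and often arrive as strings, so a dtype
# coercion is usually needed before modelling. A minimal sketch:
import pandas as pd

for col in numeric_feat:
    # Coerce to numeric; unparseable values become NaN and should then be
    # zero-filled as in the loop above.
    train_df[col] = pd.to_numeric(train_df[col], errors='coerce')
    test_df[col] = pd.to_numeric(test_df[col], errors='coerce')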
# Source: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
categorical_feat = ['channelGrouping',
                    'device.browser',
                    'device.operatingSystem',
                    'device.deviceCategory',
                    'geoNetwork.continent',
                    'geoNetwork.subContinent',
                    'geoNetwork.country',
                    'geoNetwork.region']
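# A minimal sketch of label-encoding these columns (assumption: the gist does
# not show its encoding loop). Fitting each encoder on the union of train and
# test values avoids errors on categories that appear only in the test set.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

for col in categorical_feat:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))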
from datetime import timedelta

def get_time_series_features(data, k):
    # Training window k: the 168-day span starting 168*(k-1) days after the earliest
    # date. For k=1 with min(data['date']) = Aug 1st 2016, this keeps Aug 1st 2016
    # through Jan 15th 2017 (dates strictly before min(data['date']) + 168 days).
    train_frame_k = data.loc[(data['date'] >= min(data['date']) + timedelta(days=168 * (k - 1)))
                             & (data['date'] < min(data['date']) + timedelta(days=168 * k))]
    # Test window k: 46 to 108 days after the last training date.
    test_frame_k = data.loc[(data['date'] >= max(train_frame_k['date']) + timedelta(days=46))
                            & (data['date'] <= max(train_frame_k['date']) + timedelta(days=108))]
    # ... (feature engineering elided in the gist; assumed to return the engineered frame) ...
    return train_frame_k
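# Quick check of the k=1 window arithmetic described in the comments above
# (dates illustrative):
import pandas as pd
from datetime import timedelta

start = pd.Timestamp('2016-08-01')
print(start + timedelta(days=168))  # 2017-01-16, the first excluded date
print(start + timedelta(days=167))  # 2017-01-15, the last date kept in train_frame_1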
# Stack train and test so the time-series windows can span both periods.
train_test_data = pd.concat([train_df, test_df], axis=0).reset_index()
# Build and cache each of the three 168-day training windows.
%time train_frame_1 = get_time_series_features(train_test_data, 1)
train_frame_1.to_pickle('train_frame_1')
%time train_frame_2 = get_time_series_features(train_test_data, 2)
train_frame_2.to_pickle('train_frame_2')
%time train_frame_3 = get_time_series_features(train_test_data, 3)
train_frame_3.to_pickle('train_frame_3')
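# Usage note: the cached windows can be reloaded later without recomputing.
import pandas as pd

train_frame_1 = pd.read_pickle('train_frame_1')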
gridParams = {
    'learning_rate': [0.005, 0.01, 0.015],  # learning rate
    'n_estimators': [40, 100, 200],         # number of boosting iterations
    'num_leaves': [6, 8, 12, 15, 16],       # number of leaves in the full tree
    'boosting_type': ['gbdt'],
    'objective': ['binary'],                # binary classifier: will the customer return during the test window?
    'metric': ['binary_logloss'],           # performance metric
    'colsample_bytree': [0.6, 0.8, 1],      # fraction of features LightGBM samples before training each tree
    'subsample': [0.7, 0.9, 1],             # randomly select part of the data without resampling
    'reg_alpha': [0, 1],                    # L1 regularization
}
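# A minimal sketch of searching this grid (assumption: the gist does not show
# the search call; lgb.LGBMClassifier with 3-fold GridSearchCV is assumed, and
# X_train / y_train stand for the features and the binary "returns during the
# test window" label built from the frames above).
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(lgb.LGBMClassifier(), gridParams,
                    scoring='neg_log_loss', cv=3, verbose=1)
grid.fit(X_train, y_train)
print(grid.best_params_)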
# Run the LightGBM model for 10 iterations and average the predictions.
# Source: https://www.kaggle.com/kostoglot/winning-solution
pr_lgb_sum = 0  # accumulator for the summed predictions
print('Training and predictions')
for i in range(10):  # the average of the 10 runs is taken as the final value
    print('Iteration number ', i)
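# The loop body is elided in the gist; a sketch of what each iteration
# plausibly does (params, X_train, y_train, X_test are assumed names). The
# seed is varied per run so the averaged predictions differ.
import lightgbm as lgb

for i in range(10):
    print('Iteration number ', i)
    model = lgb.LGBMRegressor(**params, random_state=i)
    model.fit(X_train, y_train)
    pr_lgb_sum += model.predict(X_test)
pr_lgb = pr_lgb_sum / 10  # average of the 10 runs as the final prediction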
gridParams = {
    "n_estimators": [200, 400, 600, 800, 1000],
    "max_depth": [2, 5, 7, 9, 10],
    "min_samples_split": [2, 3, 5, 7],
    "min_samples_leaf": [1, 2, 4],
}