Skip to content

Instantly share code, notes, and snippets.

@PranjalDureja0002
Created March 15, 2021 10:04
Show Gist options
  • Save PranjalDureja0002/3ca91ba0c335064b5e25e8cbcb01aee0 to your computer and use it in GitHub Desktop.
Save PranjalDureja0002/3ca91ba0c335064b5e25e8cbcb01aee0 to your computer and use it in GitHub Desktop.
model
# Split the 80% train portion into two equal halves:
#   d1/y1 -> used to draw bootstrap samples for the base models
#   d2/y2 -> held out to generate meta-features for the stacking model
# stratify=y_train keeps the class balance identical in both halves.
#In the 80% train set, split the train set into d1 and d2.(50-50).
d1,d2,y1,y2 = train_test_split(X_train,y_train,stratify=y_train,test_size=0.5,random_state=15)
# Reset to positional 0..n-1 indices so later .iloc / label lookups agree.
d1 = d1.reset_index(drop=True)
d2 = d2.reset_index(drop=True)
y1 = y1.reset_index(drop=True)
y2 = y2.reset_index(drop=True)
def generating_samples(d1, y1, frac=0.6, n_dup=89314):
    """Build one bootstrap-style sample from (d1, y1).

    First draws ``frac`` of the rows *without* replacement, then draws
    ``n_dup`` extra rows *with* replacement from that subset and stacks
    the two, so the final sample contains duplicated rows.

    Parameters
    ----------
    d1 : pandas.DataFrame -- feature rows, positional index 0..n-1
    y1 : pandas.Series    -- targets aligned with ``d1``
    frac : float          -- fraction of rows drawn without replacement
                             (default 0.6, as in the original script)
    n_dup : int           -- number of rows drawn with replacement
                             (default 89314, the original hard-coded size)

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Stacked feature matrix and (n, 1) target column vector.
    """
    n_base = int(frac * d1.shape[0])
    # First draw: a subset of distinct rows (no replacement).
    index = np.random.choice(d1.shape[0], n_base, replace=False)
    x_random = d1.iloc[index]
    y_random = y1.iloc[index]
    # Second draw: oversample that subset with replacement -> duplicates.
    index_dup = np.random.choice(x_random.shape[0], n_dup, replace=True)
    x_random_dup = x_random.iloc[index_dup]
    y_random_dup = y_random.iloc[index_dup]
    # Reshape targets to column vectors so vstack yields an (n, 1) array.
    y_random = np.resize(y_random, (y_random.shape[0], 1))
    y_random_dup = np.resize(y_random_dup, (y_random_dup.shape[0], 1))
    final_sample_data = np.vstack((x_random, x_random_dup))
    final_target_data = np.vstack((y_random, y_random_dup))
    return final_sample_data, final_target_data
def sample_gen(x, y, n):
    """Generate n bootstrap samples d1, d2, ..., dn from (x, y).

    Returns two parallel lists: the i-th entries hold the features and
    targets of the i-th sample produced by ``generating_samples``.
    """
    pairs = [generating_samples(x, y) for _ in range(n)]
    list_input_data = [features for features, _ in pairs]
    list_output_data = [targets for _, targets in pairs]
    return list_input_data, list_output_data
def model_gen(list_input_data, list_output_data, n):
    """Fit one DecisionTreeClassifier per bootstrap sample.

    The i-th tree is trained on (list_input_data[i], list_output_data[i]);
    the n fitted models are returned as a list.
    """
    fitted_models = []
    for i in range(n):
        tree = DecisionTreeClassifier()
        tree.fit(list_input_data[i], list_output_data[i])
        fitted_models.append(tree)
    return fitted_models
def pred_fun(x, aggr):
    """Collect each base model's predictions on x.

    Passes x to every model in ``aggr`` and returns a list containing
    one prediction result per model, in the same order.
    """
    return [base_model.predict(x) for base_model in aggr]
def datafr(aggr, pred_list):
    """Assemble base-model predictions into a meta-feature DataFrame.

    Column 'model<i>' holds the i-th model's predictions; one column is
    created per model in ``aggr``.
    """
    columns = {'model' + str(i): pred_list[i] for i in range(len(aggr))}
    return pd.DataFrame(columns)
def custom_fun(x, y, n):
    """Full stacking pipeline: bag n trees, then fit an XGBoost meta-model.

    Parameters
    ----------
    x, y : training features / targets (the 80% train split)
    n    : number of base models

    Returns
    -------
    Predictions of the meta-model for the global ``X_test``.

    NOTE(review): the original body ignored ``x`` and ``y`` entirely and
    read the module-level globals ``d1``/``d2``/``y1``/``y2`` instead.
    The identical split (same random_state=15) is now done here from the
    arguments, so results are unchanged for custom_fun(X_train, y_train, n)
    while the signature is honest.  ``X_test`` is still a global.
    """
    # 50/50 split: one half feeds the base models, the other produces
    # the meta-features the meta-model is trained on.
    d1_, d2_, y1_, y2_ = train_test_split(x, y, stratify=y,
                                          test_size=0.5, random_state=15)
    d1_ = d1_.reset_index(drop=True)
    d2_ = d2_.reset_index(drop=True)
    y1_ = y1_.reset_index(drop=True)
    y2_ = y2_.reset_index(drop=True)

    # Train n base models on bootstrap samples drawn from d1.
    list_input_data, list_output_data = sample_gen(d1_, y1_, n)
    aggr = model_gen(list_input_data, list_output_data, n)

    # Base-model predictions on d2 become the meta-features.
    pred_list = pred_fun(d2_, aggr)
    meta_df = datafr(aggr, pred_list)

    # d2's true targets are known, so the meta-model trains on them.
    xg_model = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=10,
                             objective='binary:logistic', silent=True, nthread=4)
    xg_model.fit(meta_df, y2_)

    # Same transformation for the test set, then the final prediction:
    # pass X_test through every base model and feed the resulting
    # meta-features to the trained meta-model.
    test_list = pred_fun(X_test, aggr)
    meta_test = datafr(aggr, test_list)
    return xg_model.predict(meta_test)
# Parameter tuning: sweep the number of base models and report the
# macro-averaged F1 score on the held-out test set for each setting.
estimators = [50, 100, 150]
for n_models in estimators:
    y_pred = custom_fun(X_train, y_train, n_models)
    print('Testing score for ', n_models, "base models is:", f1_score(y_test, y_pred, average='macro'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment