Created March 15, 2021 10:04
-
-
Save PranjalDureja0002/3ca91ba0c335064b5e25e8cbcb01aee0 to your computer and use it in GitHub Desktop.
model
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Split the 80% train set into two equal, label-stratified halves:
# d1/y1 will train the base models, d2/y2 will train the meta model.
d1, d2, y1, y2 = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.5, random_state=15
)
# Reset every part to a clean 0..n-1 RangeIndex so that positional and
# label-based indexing agree downstream.
d1, d2, y1, y2 = (part.reset_index(drop=True) for part in (d1, d2, y1, y2))
def generating_samples(d1, y1, frac=0.6, n_dup=89314):
    """Build one bootstrap-style sample from (d1, y1).

    First draws ``frac`` of the rows WITHOUT replacement, then draws
    ``n_dup`` additional rows WITH replacement from that subset, and
    stacks the two parts row-wise.  (The original docstring claimed the
    whole draw was with replacement; only the second stage is.)

    Parameters
    ----------
    d1 : pandas.DataFrame
        Feature rows; assumed to carry a 0..n-1 RangeIndex.
    y1 : pandas.Series
        Targets aligned with ``d1``.
    frac : float, default 0.6
        Fraction of rows taken in the initial without-replacement draw.
    n_dup : int, default 89314
        Number of duplicated rows appended (the value that was
        hard-coded in the original script; now a parameter).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Stacked feature matrix and column-vector targets.
    """
    n_base = int(frac * d1.shape[0])
    index = np.random.choice(d1.shape[0], n_base, replace=False)
    x_random = d1.iloc[index]
    # .iloc keeps this purely positional; the original ``y1[index]`` only
    # worked because the index had been reset to 0..n-1 beforehand.
    y_random = y1.iloc[index]
    index_dup = np.random.choice(x_random.shape[0], n_dup, replace=True)
    x_random_dup = x_random.iloc[index_dup]
    y_random_dup = y_random.iloc[index_dup]
    # Reshape targets to column vectors so vstack concatenates by rows.
    y_random = np.resize(y_random, (y_random.shape[0], 1))
    y_random_dup = np.resize(y_random_dup, (y_random_dup.shape[0], 1))
    final_sample_data = np.vstack((x_random, x_random_dup))
    final_target_data = np.vstack((y_random, y_random_dup))
    return final_sample_data, final_target_data
def sample_gen(x, y, n):
    """Draw ``n`` bootstrap samples d1..dn from (x, y).

    Returns two parallel lists: one with the sampled feature arrays and
    one with the matching target arrays, a pair per sample.
    """
    drawn = [generating_samples(x, y) for _ in range(n)]
    list_input_data = [features for features, _ in drawn]
    list_output_data = [targets for _, targets in drawn]
    return list_input_data, list_output_data
def model_gen(list_input_data, list_output_data, n):
    """Train one DecisionTreeClassifier per bootstrap sample.

    Parameters
    ----------
    list_input_data, list_output_data : list
        Parallel sample/target lists as produced by ``sample_gen``.
    n : int
        Number of models to fit; the first ``n`` pairs are used.

    Returns
    -------
    list
        The fitted base models, in sample order.
    """
    aggr = []
    # Iterate the first n sample/target pairs directly; the original
    # re-bound the whole lists on every loop iteration, which was redundant.
    for sample, target in zip(list_input_data[:n], list_output_data[:n]):
        model = DecisionTreeClassifier()
        model.fit(sample, target)
        aggr.append(model)
    return aggr
def pred_fun(x, aggr):
    """Run every fitted base model in ``aggr`` on the input ``x``.

    Returns a list containing one prediction array per model, in the
    same order as ``aggr`` — one column of the future meta dataset each.
    """
    return [base_model.predict(x) for base_model in aggr]
def datafr(aggr, pred_list):
    """Assemble per-model predictions into a meta-level DataFrame.

    Column ``model<i>`` holds the predictions of the i-th model in
    ``aggr``; the resulting frame is the input of the meta model.
    """
    columns = {'model' + str(i): pred_list[i] for i in range(len(aggr))}
    return pd.DataFrame(columns)
def custom_fun(x, y, n):
    """End-to-end custom stacking run with ``n`` base models.

    NOTE(review): the ``x`` and ``y`` parameters are never used — the body
    reads the module-level splits d1/y1/d2/y2 and X_test instead; confirm
    this is intentional before reusing the function elsewhere.

    Returns the meta-model's predictions for the module-level test set.
    """
    # Fit n base models, each on its own bootstrap sample drawn from d1/y1.
    list_input_data,list_output_data = sample_gen(d1,y1,n)
    aggr = model_gen(list_input_data,list_output_data,n)
    # Every base model predicts on d2; those n prediction vectors become
    # the feature columns of the meta dataset.
    pred_list = pred_fun(d2,aggr)
    meta_df = datafr(aggr,pred_list)
    #For D2,we already know it's corresponding target values, so now we train a meta model with these n predictions.
    xg_model= XGBClassifier(learning_rate=0.1,n_estimators=1000,max_depth=10,objective='binary:logistic',silent=True, nthread=4)
    xg_model.fit(meta_df,y2)
    test_list = pred_fun(X_test,aggr) #Passing the test set to each of the base models and we will get 'n' predictions
    meta_test = datafr(aggr,test_list)#Create a new dataset with these 'n' predictions and passing it to the metamodel to get the final prediction
    y_pred = xg_model.predict(meta_test)
    return y_pred
# Parameter tuning: try several counts of base models and report macro-F1
# on the held-out test set for each.
estimators = [50, 100, 150]
for n_models in estimators:
    y_pred = custom_fun(X_train, y_train, n_models)
    print('Testing score for ', n_models, "base models is:",
          f1_score(y_test, y_pred, average='macro'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.