Skip to content

Instantly share code, notes, and snippets.

@PranjalDureja0002
Created March 15, 2021 10:04
Show Gist options
  • Save PranjalDureja0002/3ca91ba0c335064b5e25e8cbcb01aee0 to your computer and use it in GitHub Desktop.
Save PranjalDureja0002/3ca91ba0c335064b5e25e8cbcb01aee0 to your computer and use it in GitHub Desktop.
model
# Split the 80% train portion into two equal halves:
#   d1/y1 -> used to draw bootstrap samples for the base models
#   d2/y2 -> held out to generate meta-features for the stacking model
# stratify=y_train keeps the class balance identical in both halves.
#In the 80% train set, split the train set into d1 and d2.(50-50).
d1,d2,y1,y2 = train_test_split(X_train,y_train,stratify=y_train,test_size=0.5,random_state=15)
# Reset to positional 0..n-1 indices so later .iloc / label lookups agree.
d1 = d1.reset_index(drop=True)
d2 = d2.reset_index(drop=True)
y1 = y1.reset_index(drop=True)
y2 = y2.reset_index(drop=True)
def generating_samples(d1, y1, frac=0.6, n_dup=89314):
    """Build one bootstrap-style sample from (d1, y1).

    First draws ``frac`` of the rows *without* replacement, then draws
    ``n_dup`` extra rows *with* replacement from that subset and stacks
    the two, so the final sample contains duplicated rows.

    Parameters
    ----------
    d1 : pandas.DataFrame -- feature rows, positional index 0..n-1
    y1 : pandas.Series    -- targets aligned with ``d1``
    frac : float          -- fraction of rows drawn without replacement
                             (default 0.6, as in the original script)
    n_dup : int           -- number of rows drawn with replacement
                             (default 89314, the original hard-coded size)

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Stacked feature matrix and (n, 1) target column vector.
    """
    n_base = int(frac * d1.shape[0])
    # First draw: a subset of distinct rows (no replacement).
    index = np.random.choice(d1.shape[0], n_base, replace=False)
    x_random = d1.iloc[index]
    y_random = y1.iloc[index]
    # Second draw: oversample that subset with replacement -> duplicates.
    index_dup = np.random.choice(x_random.shape[0], n_dup, replace=True)
    x_random_dup = x_random.iloc[index_dup]
    y_random_dup = y_random.iloc[index_dup]
    # Reshape targets to column vectors so vstack yields an (n, 1) array.
    y_random = np.resize(y_random, (y_random.shape[0], 1))
    y_random_dup = np.resize(y_random_dup, (y_random_dup.shape[0], 1))
    final_sample_data = np.vstack((x_random, x_random_dup))
    final_target_data = np.vstack((y_random, y_random_dup))
    return final_sample_data, final_target_data
def sample_gen(x, y, n):
    """Generate n bootstrap samples d1, d2, ..., dn from (x, y).

    Returns two parallel lists: the i-th entries hold the features and
    targets of the i-th sample produced by ``generating_samples``.
    """
    pairs = [generating_samples(x, y) for _ in range(n)]
    list_input_data = [features for features, _ in pairs]
    list_output_data = [targets for _, targets in pairs]
    return list_input_data, list_output_data
def model_gen(list_input_data, list_output_data, n):
    """Fit one DecisionTreeClassifier per bootstrap sample.

    The i-th tree is trained on (list_input_data[i], list_output_data[i]);
    the n fitted models are returned as a list.
    """
    fitted_models = []
    for i in range(n):
        tree = DecisionTreeClassifier()
        tree.fit(list_input_data[i], list_output_data[i])
        fitted_models.append(tree)
    return fitted_models
def pred_fun(x, aggr):
    """Collect each base model's predictions on x.

    Passes x to every model in ``aggr`` and returns a list containing
    one prediction result per model, in the same order.
    """
    return [base_model.predict(x) for base_model in aggr]
def datafr(aggr, pred_list):
    """Assemble base-model predictions into a meta-feature DataFrame.

    Column 'model<i>' holds the i-th model's predictions; one column is
    created per model in ``aggr``.
    """
    columns = {'model' + str(i): pred_list[i] for i in range(len(aggr))}
    return pd.DataFrame(columns)
def custom_fun(x, y, n):
    """Full stacking pipeline: bag n trees, then fit an XGBoost meta-model.

    Parameters
    ----------
    x, y : training features / targets (the 80% train split)
    n    : number of base models

    Returns
    -------
    Predictions of the meta-model for the global ``X_test``.

    NOTE(review): the original body ignored ``x`` and ``y`` entirely and
    read the module-level globals ``d1``/``d2``/``y1``/``y2`` instead.
    The identical split (same random_state=15) is now done here from the
    arguments, so results are unchanged for custom_fun(X_train, y_train, n)
    while the signature is honest.  ``X_test`` is still a global.
    """
    # 50/50 split: one half feeds the base models, the other produces
    # the meta-features the meta-model is trained on.
    d1_, d2_, y1_, y2_ = train_test_split(x, y, stratify=y,
                                          test_size=0.5, random_state=15)
    d1_ = d1_.reset_index(drop=True)
    d2_ = d2_.reset_index(drop=True)
    y1_ = y1_.reset_index(drop=True)
    y2_ = y2_.reset_index(drop=True)

    # Train n base models on bootstrap samples drawn from d1.
    list_input_data, list_output_data = sample_gen(d1_, y1_, n)
    aggr = model_gen(list_input_data, list_output_data, n)

    # Base-model predictions on d2 become the meta-features.
    pred_list = pred_fun(d2_, aggr)
    meta_df = datafr(aggr, pred_list)

    # d2's true targets are known, so the meta-model trains on them.
    xg_model = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=10,
                             objective='binary:logistic', silent=True, nthread=4)
    xg_model.fit(meta_df, y2_)

    # Same transformation for the test set, then the final prediction:
    # pass X_test through every base model and feed the resulting
    # meta-features to the trained meta-model.
    test_list = pred_fun(X_test, aggr)
    meta_test = datafr(aggr, test_list)
    return xg_model.predict(meta_test)
# Parameter tuning: sweep the number of base models and report the
# macro-averaged F1 score on the held-out test set for each setting.
estimators = [50, 100, 150]
for n_models in estimators:
    y_pred = custom_fun(X_train, y_train, n_models)
    print('Testing score for ', n_models, "base models is:", f1_score(y_test, y_pred, average='macro'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment