Skip to content

Instantly share code, notes, and snippets.

# Fit a single XGBoost classifier on the training data and collect its
# positive-class probabilities for the test set in an (n_samples, 5) array.
y_test_probas = np.empty((X_test.shape[0], 5))
model = xgb.XGBClassifier(
    learning_rate=0.5,
    max_depth=4,
    n_estimators=10,
    reg_alpha=0.01,
    reg_lambda=1.0,
    random_state=42,
)
model = model.fit(X_train, y_train)

# The fitted model does not change between iterations, so predict once
# instead of re-running predict_proba for every column.
positive_proba = model.predict_proba(X_test)[:, 1]
for i in range(5):
    y_test_probas[:, i] = positive_proba

# Average the five columns into one probability per test sample.
# NOTE(review): every column holds the same prediction, so this mean equals a
# single prediction (up to rounding). Presumably each column was meant to come
# from a different model/fold - confirm against the full notebook.
y_test_proba = np.mean(y_test_probas, axis=1)
# Hyperparameter search space for the XGBoost model.
params = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.0001, 0.005, 0.001, 0.05, 0.1],
    'reg_alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'reg_lambda': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
}

# Scorer wrapping the Matthews correlation coefficient (higher is better),
# intended as the evaluation metric during hyperparameter tuning.
mcc = make_scorer(matthews_corrcoef, greater_is_better=True)
# Hyperparameter search space for the AdaBoost model.
params = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'learning_rate': [0.0001, 0.005, 0.001, 0.05, 0.1],
}

# Scorer wrapping the Matthews correlation coefficient (higher is better),
# intended as the evaluation metric during hyperparameter tuning.
mcc = make_scorer(matthews_corrcoef, greater_is_better=True)

# AdaBoost classifier with a fixed seed.
ada_clf = AdaBoostClassifier(random_state=10)
# Hyperparameter search space for the random forest model.
# The min_samples_split values are floats (fractions), not absolute counts.
params = {
    'n_estimators': [10, 50, 100, 500, 1000],
    'max_depth': [2, 3, 4, 5, 6],
    'min_samples_split': [0.02, 0.04, 0.08, 0.16, 0.32, 0.50],
}

# Scorer wrapping the Matthews correlation coefficient (higher is better),
# intended as the evaluation metric during hyperparameter tuning.
mcc = make_scorer(matthews_corrcoef, greater_is_better=True)
# Create an XGBoost classifier object with log-loss as the loss function to minimize
# Hyperparameter search space for the decision tree model.
params = {
    'max_depth': [1, 5, 10, 50],
    'min_samples_split': [5, 10, 100, 500],
}

# Scorer wrapping the Matthews correlation coefficient (higher is better),
# intended as the evaluation metric during hyperparameter tuning.
mcc = make_scorer(matthews_corrcoef, greater_is_better=True)

# Decision tree classifier with a fixed seed and balanced class weights.
dt_clf = tree.DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
)
# Hyperparameter search space for the RBF-kernel SVM: C and gamma each take
# the powers of ten from 10**-5 up to 10**2.
params = {
    'C': [10 ** exponent for exponent in range(-5, 3)],
    'gamma': [10 ** exponent for exponent in range(-5, 3)],
}

# Scorer wrapping the Matthews correlation coefficient (higher is better),
# intended as the evaluation metric during hyperparameter tuning.
mcc = make_scorer(matthews_corrcoef, greater_is_better=True)

# Support vector classifier with an RBF kernel, a fixed seed and balanced
# class weights.
svm_clf = svm.SVC(
    random_state=42,
    kernel='rbf',
    class_weight='balanced',
)
# Hyperparameter search space for the SVM: C takes the powers of ten from
# 10**-5 up to 10**2.
params = {
    'C': [10 ** exponent for exponent in range(-5, 3)],
}

# Scorer wrapping the Matthews correlation coefficient (higher is better),
# intended as the evaluation metric during hyperparameter tuning.
mcc = make_scorer(matthews_corrcoef, greater_is_better=True)

# Support vector classifier (kernel left at the library default) with a fixed
# seed and balanced class weights.
svm_clf = svm.SVC(
    random_state=42,
    class_weight='balanced',
)
# Hyperparameter search space for logistic regression: C takes the powers of
# ten from 10**-5 up to 10**5.
params = {
    'C': [10 ** exponent for exponent in range(-5, 6)],
}

# Scorer wrapping the Matthews correlation coefficient (higher is better),
# intended as the evaluation metric during hyperparameter tuning.
mcc = make_scorer(matthews_corrcoef, greater_is_better=True)

# Logistic regression classifier with a fixed seed and balanced class weights.
log_clf = LogisticRegression(
    random_state=42,
    class_weight='balanced',
)
def data_preparation(start, end,praq_train):
# Prepare data for the signal columns numbered start .. end-1.
# NOTE(review): indentation was lost in this snippet and the loop body is cut
# off after its first statement, so what is accumulated in X / y and what is
# returned cannot be seen here.
# Load only the requested columns (named str(start) .. str(end - 1)) from the
# parquet file.
# NOTE(review): this assignment overwrites the `praq_train` argument, so the
# value passed in by the caller is never used - confirm that is intended.
praq_train = pq.read_pandas('/content/train.parquet', columns=[str(i) for i in range(start, end)]).to_pandas()
X = []
y = []
# The iterable is wrapped in tqdm (presumably to show progress / timing).
# Iterate over the unique first-level index values of df_metadata_train,
# sliced to positions int(start/3) .. int(end/3) - 1.
# The bounds are divided by 3 because each id_measurement has 3 id_signal
# values, and the start/end parameters are expressed in id_signal units.
for id_measurement in tqdm(df_metadata_train.index.levels[0].unique()[int(start/3):int(end/3)]):
X_signal = []
def transform_signal(signal, n_dim=160, min_max=(-1,1)):
# convert data into -1 to 1
signal_std = standardize_data(signal, min_data=min_num, max_data=max_num)
# bucket or chunk size, 5000 in this case (800000 / 160)
bucket_size = int(800000 / n_dim)
# new_ts will be the container of the new data
new_signal = []
# this for iteract any chunk/bucket until reach the whole sample_size (800000)
for i in range(0, 800000 , bucket_size):
# cut each bucket to ts_range