Created
October 21, 2020 11:25
-
-
Save Steboss/74b3ca86aec16aa34347347cc3f88c69 to your computer and use it in GitHub Desktop.
First Classification
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Word-level TF-IDF features: 1- to 4-grams, dropping terms seen in fewer
# than 3 documents or in more than 90% of them, capped at 50,000 features.
# The token pattern keeps single-character tokens, and sublinear_tf dampens
# raw term counts.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=3,
    max_df=0.9,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=50000,
)
# NOTE(review): the vocabulary and IDF weights are fitted on `all_text`,
# which presumably contains both train and test text -- confirm this is
# intended, since it lets test-set statistics influence the features.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Make a cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=43)

# Running SUM of the per-fold test-set probabilities. It is not averaged
# here; divide by the number of folds downstream if a mean is wanted.
test_pred_word = 0
# Take note of the scores: out-of-fold probability for every training row.
oof_pred_word = np.zeros(train_target.shape[0])

# `total` lets tqdm show progress against the known number of folds.
for train_index, val_index in tqdm(kf.split(train_target),
                                   total=kf.get_n_splits()):
    x_train = train_word_features[train_index]
    x_val = train_word_features[val_index]
    y_train, y_val = train_target[train_index], train_target[val_index]

    classifier = LogisticRegression(C=1, solver='sag')
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class in
    # classifier.classes_ (the positive class for a binary 0/1 target).
    val_preds = classifier.predict_proba(x_val)[:, 1]
    preds = classifier.predict_proba(test_word_features)[:, 1]

    test_pred_word += preds
    oof_pred_word[val_index] = val_preds

    # Per-fold validation metrics: F1 at a 0.5 threshold, then ROC AUC.
    print(f1_score(y_val, val_preds > 0.5))
    print(roc_auc_score(y_val, val_preds, average=None))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment