Skip to content

Instantly share code, notes, and snippets.

@Steboss
Created October 21, 2020 11:25
Show Gist options
  • Save Steboss/74b3ca86aec16aa34347347cc3f88c69 to your computer and use it in GitHub Desktop.
Save Steboss/74b3ca86aec16aa34347347cc3f88c69 to your computer and use it in GitHub Desktop.
First Classification
# Word-level TF-IDF features over 1- to 4-grams, with the vocabulary capped
# at 50,000 terms.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=3,
    max_df=0.9,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    analyzer='word',
    token_pattern=r'\w{1,}',
    max_features=50000,
)
# NOTE(review): the vocabulary and IDF weights are fitted on `all_text`
# rather than on `train_text` alone -- presumably train + test combined;
# confirm that this is intended.
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

# Make a cross validation: 5 shuffled folds with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=43)

# Running SUM of the per-fold test-set probabilities.
# NOTE(review): nothing in this snippet divides by the number of folds, so
# this is not an average -- confirm the caller normalises it if needed.
test_pred_word = 0

# Take note of the scores: out-of-fold probability for every training row,
# filled in one validation fold at a time.
oof_pred_word = np.zeros([train_target.shape[0],])

# tqdm wraps the split generator directly and is told the fold count, so the
# progress bar can show a total; `i` is the fold number.
folds = tqdm(kf.split(train_target), total=kf.get_n_splits())
for i, (train_index, val_index) in enumerate(folds):
    # The index arrays from KFold.split are used directly for row selection.
    x_train = train_word_features[train_index]
    x_val = train_word_features[val_index]
    # NOTE(review): assumes `train_target` supports positional indexing with
    # an integer array (e.g. a NumPy array) -- confirm against its definition.
    y_train, y_val = train_target[train_index], train_target[val_index]

    # NOTE(review): no `random_state` is passed, so the 'sag' solver may give
    # slightly different results between runs.
    classifier = LogisticRegression(C=1, solver='sag')
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the second class.
    val_preds = classifier.predict_proba(x_val)[:, 1]
    preds = classifier.predict_proba(test_word_features)[:, 1]

    test_pred_word += preds
    oof_pred_word[val_index] = val_preds

    # Per-fold validation metrics: F1 at a 0.5 threshold, then ROC AUC.
    print(f1_score(y_val, val_preds > 0.5))
    print(roc_auc_score(y_val, val_preds, average=None))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment