Skip to content

Instantly share code, notes, and snippets.

@Erlemar
Last active August 13, 2019 13:17
Show Gist options
  • Save Erlemar/8c015423903b5ae78b5619ba1b5cb6e0 to your computer and use it in GitHub Desktop.
import datetime
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import lightgbm as lgb
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.util import ngrams
from scipy import stats
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from wordcloud import WordCloud
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sub = pd.read_csv('data/sample_submission.csv')
# process data
def get_text(row):
splitted_text = row.text.lower().split('.')
indices = [splitted_text.index(j) for j in [i for i in splitted_text if row.drug in i]]
full_indices = []
for i in indices:
full_indices.append(i)
if i < len(splitted_text) -1:
full_indices.append(i + 1)
full_indices = list(set(full_indices))
full_text = []
for i in full_indices:
full_text.append(splitted_text[i])
return ' '.join(full_text[-10:])
train['new_text'] = train.apply(lambda row: get_text(row), axis=1)
test['new_text'] = test.apply(lambda row: get_text(row), axis=1)
# optimize hyperparameters
combined_features = FeatureUnion([('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
('tfidf_char', TfidfVectorizer(ngram_range=(1, 3), analyzer='char'))])
pipeline = Pipeline([("features", combined_features),
('clf', OneVsRestClassifier(LogisticRegression(class_weight='balanced')))])
parameters = {
'features__tfidf__max_df': (0.3, 0.75),
'features__tfidf_char__max_df': (0.3, 0.75),
'clf__estimator__C': (1.0, 10.0)
}
grid_search = GridSearchCV(pipeline, parameters, cv=folds,
n_jobs=-1, verbose=1, scoring='f1_macro')
grid_search.fit(train['new_text'], train['sentiment'])
# data sampling
X_tr, X_valid, y_tr, y_valid = train_test_split(train, y, test_size=0.2, stratify=y)
X_tr1 = X_tr.loc[X_tr['sentiment'] == 1, 'text'].append(X_tr.loc[X_tr['sentiment'] == 0, 'text']).append(X_tr.loc[X_tr['sentiment'] == 2, 'text'][:2000])
y_tr1 = X_tr.loc[X_tr['sentiment'] == 1, 'sentiment'].append(X_tr.loc[X_tr['sentiment'] == 0, 'sentiment']).append(X_tr.loc[X_tr['sentiment'] == 2, 'sentiment'][:2000])
ovr.fit(hstack((vectorizer.transform(X_tr1), vectorizer1.transform(X_tr1))), y_tr1)
# test predictions for semi-supervised
a = cross_val_predict(ovr, X, y, n_jobs=-1, cv=folds, method='predict_proba')
train['top_prediction'] = a.argmax(1)
train['top_prediction_prob'] = a.max(1)
n = 1000
new_train = train['new_text'].append(train.sort_values('top_prediction_prob', ascending=False)[:n]['new_text'])
new_train_vectorized = hstack((vectorizer.transform(new_train), vectorizer1.transform(new_train)))
y_new = train['sentiment'].append(train.sort_values('top_prediction_prob', ascending=False)[:n]['sentiment'])
# final model
logreg = LogisticRegression(class_weight='balanced')
ovr = OneVsRestClassifier(logreg)
scores = cross_val_score(ovr, new_train_vectorized, y_new, scoring='f1_macro', n_jobs=-1, cv=folds)
print('Cross-validation mean f1_score {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))
#submit
ovr.fit(new_train_vectorized, y_new)
sub['sentiment'] = ovr.predict(X_test)
sub.to_csv('sub.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment