# Simple text classifier
import numpy as np
from pprint import pprint
from time import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
## Input data
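# `df` is assumed to be a pandas DataFrame with a text column 'titletext' and a
# target column 'label'; it is not defined in this gist. A minimal loading
# sketch (the CSV file name below is a hypothetical placeholder):
import pandas as pd
df = pd.read_csv("dataset.csv")  # hypothetical file; must provide 'titletext' and 'label'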
X = df['titletext']
y = df['label']
cv = CountVectorizer()
X = cv.fit_transform(X)  # learn the vocabulary and transform the texts to count vectors
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
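# Note: the vectorizer above is fitted on the full corpus before splitting, so
# test-set vocabulary leaks into the features. The Pipeline + GridSearchCV
# version below avoids this by fitting the vectorizer on training data only.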
# Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)  # mean accuracy on the held-out test set
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
# Check results
# With labels=[1, 0], the first row/column of the matrix corresponds to label 1
# and the second to label 0 (assumed here to map to FAKE and REAL).
cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.xaxis.set_ticklabels(['FAKE', 'REAL'])
ax.yaxis.set_ticklabels(['FAKE', 'REAL'])
# Use a pipeline with grid search to find the best model
with open("stopwords-es.txt") as f:
    stopwords_sp = [line.rstrip('\n') for line in f]

pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords_sp)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}
# multiprocessing (n_jobs=-1) requires the fork to happen in a
# __main__-protected block when this code is run as a script
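# A minimal sketch of that guard, for when this file is run as a script rather
# than in a notebook (the names refer to the objects defined below):
#
#     if __name__ == "__main__":
#         grid_search.fit(X_train, y_train)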
X = df['titletext']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
# Predict on the held-out test set
predicted = grid_search.predict(X_test)
np.mean(predicted == y_test)  # accuracy
# Result
cm = confusion_matrix(y_test, predicted, labels=[1,0])
print(classification_report(y_test, predicted))
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.xaxis.set_ticklabels(['FAKE', 'REAL'])
ax.yaxis.set_ticklabels(['FAKE', 'REAL'])
# Save/Load model
import joblib
# save the model to disk
filename = 'finalized_model.sav'
joblib.dump(grid_search, filename)
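# The dump persists the entire fitted GridSearchCV, including the vectorizer
# and tf-idf steps of the best pipeline, so the loaded object can be applied
# directly to raw text.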
# some time later...
# load the model from disk
grid_search = joblib.load(filename)
predicted = grid_search.predict(X_test)
np.mean(predicted == y_test)
# Predict the label for a new, raw Spanish sentence ("estoy enojado" = "I am angry")
grid_search.predict(["estoy enojado"])[0]