Last active
September 16, 2020 00:44
-
-
Save stanlee321/74c9798ae882e72bf05275fb2e0675be to your computer and use it in GitHub Desktop.
Simple text classifier
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.feature_extraction.text import TfidfTransformer | |
| from sklearn.linear_model import SGDClassifier | |
| from sklearn.model_selection import GridSearchCV | |
| from sklearn.pipeline import Pipeline | |
| from pprint import pprint | |
| from time import time | |
| from sklearn.metrics import classification_report, confusion_matrix | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| %matplotlib inline | |
## Input data
# Raw documents and their labels, read from a DataFrame `df` that must
# already be in scope (it is not created anywhere in this snippet).
X = df['titletext']
y = df['label']

# Split the raw text *before* vectorizing so that the vocabulary is learned
# from the training documents only. Fitting the vectorizer on the full corpus
# would leak information about the held-out documents into the features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

cv = CountVectorizer()
X_train = cv.fit_transform(X_train)  # learn the vocabulary on training text
X_test = cv.transform(X_test)  # encode test text with that same vocabulary

# Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)
# Print the held-out accuracy explicitly: a bare expression is only echoed
# when it happens to be the last statement of a notebook cell.
print("Naive Bayes accuracy: %0.3f" % clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
# Check Results
# Confusion matrix for the Naive Bayes predictions, with label 1 in the
# first row/column and label 0 in the second.
cm = confusion_matrix(y_test, y_pred, labels=[1, 0])

# Draw the matrix as an annotated heatmap with integer cell values.
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', ax=ax)
ax.set(
    title='Confusion Matrix',
    xlabel='Predicted Labels',
    ylabel='True Labels',
)
# NOTE(review): the tick names assume label 1 means FAKE and label 0 means
# REAL -- confirm against how `df['label']` was encoded.
for axis in (ax.xaxis, ax.yaxis):
    axis.set_ticklabels(['FAKE', 'REAL'])
# Using a pipeline for grid search for best model
# Load the Spanish stop-word list, one word per line. The file is read
# inside a `with` block so the handle is always closed, and decoded as UTF-8
# so accented words do not depend on the platform's default encoding
# (NOTE(review): assumes the file is UTF-8 encoded -- confirm). Surrounding
# whitespace (including a stray '\r') is stripped and blank lines are skipped
# so that no empty string ends up in the stop-word list.
with open("stopwords-es.txt", encoding="utf-8") as stopwords_file:
    stopwords_sp = [
        word
        for word in (line.strip() for line in stopwords_file)
        if word
    ]

from sklearn.pipeline import Pipeline

# Text -> token counts (minus stop words) -> TF-IDF weights -> SGD classifier.
pipeline = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords_sp)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
# Hyper-parameter grid for the search. Each key is written as
# "<step>__<parameter>" and each value is the tuple of candidates to try.
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__ngram_range=((1, 1), (1, 2)),  # unigrams or bigrams
    clf__max_iter=(20,),
    clf__alpha=(1e-05, 1e-06),
    clf__penalty=('l2', 'elasticnet'),
)
# Other candidates that are currently switched off:
#   vect__max_features: (None, 5000, 10000, 50000)
#   tfidf__use_idf:     (True, False)
#   tfidf__norm:        ('l1', 'l2')
#   clf__max_iter:      (10, 50, 80)
# Start again from the raw text column: the pipeline contains its own
# vectorizer step, so it is given the original documents and labels.
X, y = df['titletext'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33)

# Search `parameters` for the best settings of both the feature extraction
# and the classifier.
grid_search = GridSearchCV(pipeline, parameters, verbose=1, n_jobs=-1)

step_names = [step[0] for step in pipeline.steps]
print("Performing grid search...")
print("pipeline:", step_names)
print("parameters:")
pprint(parameters)

# Time the whole search.
t0 = time()
grid_search.fit(X_train, y_train)
elapsed = time() - t0
print("done in %0.3fs" % elapsed)
print()

# Report the best score and, for every tuned key, the value that won.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
# predict
# Score the tuned pipeline on the held-out documents. The accuracy is
# printed explicitly because a bare expression is discarded unless it is
# the last statement of a notebook cell.
predicted = grid_search.predict(X_test)
accuracy = np.mean(predicted == y_test)
print("Accuracy: %0.3f" % accuracy)

# Result
# Confusion matrix with label 1 in the first row/column and label 0 in the
# second, followed by the per-class report.
cm = confusion_matrix(y_test, predicted, labels=[1, 0])
print(classification_report(y_test, predicted))

# Open a fresh figure for this heatmap. `plt.subplot()` can hand back the
# axes of the current figure, which would draw this matrix on top of an
# earlier plot when both run in the same figure context.
_, ax = plt.subplots()
sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
# NOTE(review): the tick names assume label 1 means FAKE and label 0 means
# REAL -- confirm against how `df['label']` was encoded.
ax.xaxis.set_ticklabels(['FAKE', 'REAL'])
ax.yaxis.set_ticklabels(['FAKE', 'REAL'])
# Save/Load model
import joblib

# save the model to disk
filename = 'finalized_model.sav'
joblib.dump(grid_search, filename)

# some time later...
# load the model from disk
# NOTE: joblib files are pickle-based, so only load files from a trusted
# source; here the file is the one written just above.
grid_search = joblib.load(filename)

# Sanity-check the reloaded model on the held-out documents. The values are
# printed because bare expressions are discarded unless they are the last
# statement of a notebook cell.
predicted = grid_search.predict(X_test)
print("Accuracy after reload: %0.3f" % np.mean(predicted == y_test))

# Result
# Predicted label for a single hand-written sample sentence.
print(grid_search.predict(["estoy enojado"])[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment