Created
August 7, 2023 14:22
-
-
Save matthewlenz/32a978fce11850cb027f35003c04df39 to your computer and use it in GitHub Desktop.
Richard Gruss - Text Classification With Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Note: This will probably require some tweaking. Dr. Gruss sent this to me but said he hasn't used it in a very long time.
# Video link: https://www.youtube.com/watch?v=EfEW3_RLnGA
import os | |
import random | |
import string | |
from nltk import word_tokenize | |
from collections import defaultdict | |
from nltk import FreqDist | |
from nltk.corpus import stopwords | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn import metrics | |
import pickle | |
# Tokens filtered out by get_tokens(): NLTK's English stop-word list plus two
# hand-added entries.
stop_words = set(stopwords.words('english')) | {'said', 'mr'}

# Root directory of the raw corpus. create_data_set() reads one sub-directory
# per entry of LABELS from here; adjust the path for your machine.
BASE_DIR = '/Users/rgruss/workspace/AI/nlp/bbc'

# Category names. Each one is used both as a sub-directory name under BASE_DIR
# and as the class label written to data.txt.
LABELS = ['business', 'entertainment', 'politics', 'sport', 'tech']
def create_data_set():
    """Flatten the labelled corpus under BASE_DIR into a single TSV file.

    For every label in LABELS, reads each file in ``BASE_DIR/<label>`` and
    writes one line to ``data.txt`` (in the current working directory) in the
    form ``label<TAB>filename<TAB>text``.

    Every run of whitespace inside a document (newlines, carriage returns,
    tabs, ...) is collapsed to a single space. This keeps words on adjacent
    lines from being glued together and guarantees the text never contains
    the tab/newline characters used as field and record separators.

    Returns:
        None. The result is the ``data.txt`` file.
    """
    with open('data.txt', 'w', encoding='utf8') as outfile:
        for label in LABELS:
            label_dir = '%s/%s' % (BASE_DIR, label)
            # os.listdir() order is arbitrary; sort so the output file is
            # reproducible from run to run.
            for filename in sorted(os.listdir(label_dir)):
                fullfilename = '%s/%s' % (label_dir, filename)
                print(fullfilename)
                with open(fullfilename, 'rb') as infile:
                    # Undecodable bytes become U+FFFD instead of raising.
                    raw = infile.read().decode(errors='replace')
                text = ' '.join(raw.split())
                outfile.write('%s\t%s\t%s\n' % (label, filename, text))
def setup_docs():
    """Load the documents stored in ``data.txt``.

    Each line of the file is expected to look like
    ``label<TAB>filename<TAB>text``. The filename column is ignored.

    Lines with fewer than three tab-separated fields (for example blank
    lines) are skipped instead of raising ``IndexError``. Tabs that occur
    inside the text column are kept as part of the text rather than
    truncating it.

    Returns:
        list[tuple[str, str]]: ``(label, text)`` pairs in file order, with
        surrounding whitespace stripped from the text.
    """
    docs = []  # (label, text)
    with open('data.txt', 'r', encoding='utf8') as datafile:
        for row in datafile:
            # maxsplit=2: everything after the second tab belongs to the text.
            parts = row.rstrip('\n').split('\t', 2)
            if len(parts) < 3:
                # Blank or malformed record; nothing usable on this line.
                continue
            label, _filename, text = parts
            docs.append((label, text.strip()))
    return docs
def clean_text(text):
    """Return *text* with ASCII punctuation removed and lower-cased.

    The characters removed are exactly those in ``string.punctuation``;
    whitespace and all other characters are left in place.
    """
    # Drop punctuation first, then normalise case (same order as before).
    without_punctuation = ''.join(
        ch for ch in text if ch not in string.punctuation
    )
    return without_punctuation.lower()
def get_tokens(text):
    """Split *text* into word tokens and discard stop words.

    Tokenization is done by NLTK's ``word_tokenize``; any token present in
    the module-level ``stop_words`` set is dropped.

    Returns:
        list: the remaining tokens, in their original order.
    """
    return [token for token in word_tokenize(text) if token not in stop_words]
def print_frequency_dist(docs):
    """Print the 20 most common tokens for every label found in *docs*.

    Args:
        docs: iterable of ``(label, text)`` pairs.

    Each text is passed through clean_text() and get_tokens(), and the
    resulting tokens are pooled per label. For each label the label itself
    is printed, followed by the ``FreqDist.most_common(20)`` list.
    """
    tokens_by_label = defaultdict(list)

    # Pool the tokens of all documents that share a label.
    for doc in docs:
        label, raw_text = doc[0], doc[1]
        doc_tokens = get_tokens(clean_text(raw_text))
        tokens_by_label[label].extend(doc_tokens)

    for label, label_tokens in tokens_by_label.items():
        print(label)
        print(FreqDist(label_tokens).most_common(20))
def get_splits(docs):
    """Randomly split *docs* into an 80% training set and a 20% test set.

    Args:
        docs: sequence of ``(label, text)`` pairs. The sequence itself is
            NOT modified; a shuffled copy is used internally.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` where the ``X_*`` lists
        hold document texts and the ``y_*`` lists hold the matching labels.
        The first ``int(0.80 * len(docs))`` shuffled documents go to the
        training set, the rest to the test set.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(docs)
    random.shuffle(shuffled)

    pivot = int(.80 * len(shuffled))
    train_docs, test_docs = shuffled[:pivot], shuffled[pivot:]

    X_train = [doc[1] for doc in train_docs]  # training documents
    y_train = [doc[0] for doc in train_docs]  # corresponding training labels
    X_test = [doc[1] for doc in test_docs]    # test documents
    y_test = [doc[0] for doc in test_docs]    # corresponding test labels

    return X_train, X_test, y_train, y_test
def evaluate_classifier(title, classifier, vectorizer, X_test, y_test,
                        average='weighted'):
    """Print precision, recall and F1 of *classifier* on a labelled set.

    Args:
        title: text printed at the start of the result line.
        classifier: fitted estimator exposing ``predict``.
        vectorizer: fitted vectorizer exposing ``transform``; used to turn
            the raw documents into the feature matrix the classifier expects.
        X_test: list of raw document strings.
        y_test: list of true labels, aligned with ``X_test``.
        average: averaging strategy forwarded to the scikit-learn metric
            functions. scikit-learn's own default (``'binary'``) raises
            ``ValueError`` for multiclass targets such as the labels used in
            this file, so a multiclass-capable default is used here.

    Prints one tab-separated line: title, precision, recall, F1.
    """
    features = vectorizer.transform(X_test)
    y_pred = classifier.predict(features)

    precision = metrics.precision_score(y_test, y_pred, average=average)
    recall = metrics.recall_score(y_test, y_pred, average=average)
    f1 = metrics.f1_score(y_test, y_pred, average=average)

    print("%s\t%f\t%f\t%f\n" % (title, precision, recall, f1))
def train_classifier(docs):
    """Train a Naive Bayes text classifier on *docs* and persist it.

    Args:
        docs: sequence of ``(label, text)`` pairs.

    Steps:
        1. Split the documents into train/test sets with get_splits().
        2. Fit a ``CountVectorizer`` (word 1- to 3-grams, ``min_df=3``,
           scikit-learn's built-in English stop words) on the training texts.
        3. Fit a ``MultinomialNB`` classifier on the resulting matrix.
        4. Print metrics for the training and the test set via
           evaluate_classifier().
        5. Pickle the classifier to ``naive_bayes_classifier.pkl`` and the
           vectorizer to ``count_vectorizer.pkl`` in the current working
           directory, so classify() can reuse them.
    """
    X_train, X_test, y_train, y_test = get_splits(docs)

    # the object that turns text into vectors
    vectorizer = CountVectorizer(stop_words='english',
                                 ngram_range=(1, 3),
                                 min_df=3, analyzer='word')

    # create doc-term matrix
    dtm = vectorizer.fit_transform(X_train)

    # train Naive Bayes classifier
    naive_bayes_classifier = MultinomialNB().fit(dtm, y_train)

    evaluate_classifier("Naive Bayes\tTRAIN\t", naive_bayes_classifier, vectorizer, X_train, y_train)
    evaluate_classifier("Naive Bayes\tTEST\t", naive_bayes_classifier, vectorizer, X_test, y_test)

    # store the classifier; the with-block guarantees the file is flushed
    # and closed even if pickling fails part-way
    clf_filename = 'naive_bayes_classifier.pkl'
    with open(clf_filename, 'wb') as clf_file:
        pickle.dump(naive_bayes_classifier, clf_file)

    # also store the vectorizer so we can transform new data
    vec_filename = 'count_vectorizer.pkl'
    with open(vec_filename, 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)
def classify(text):
    """Predict and print the label of a single document.

    Loads the classifier from ``naive_bayes_classifier.pkl`` and the
    vectorizer from ``count_vectorizer.pkl`` (both in the current working
    directory, as written by train_classifier()), vectorizes *text* and
    prints the predicted label.

    Args:
        text: the raw document string to classify.

    NOTE: unpickling executes arbitrary code contained in the file. Only
    load pickle files that you created yourself or otherwise trust.
    """
    # load classifier
    clf_filename = 'naive_bayes_classifier.pkl'
    with open(clf_filename, 'rb') as clf_file:
        nb_clf = pickle.load(clf_file)

    # load the vectorizer that was fitted together with the classifier
    vec_filename = 'count_vectorizer.pkl'
    with open(vec_filename, 'rb') as vec_file:
        vectorizer = pickle.load(vec_file)

    # vectorize the new text; transform() expects an iterable of documents
    pred = nb_clf.predict(vectorizer.transform([text]))

    print(pred[0])
def main():
    """Script entry point: load data.txt and train the classifier."""
    # One-time preparation step: build data.txt from the raw corpus.
    # create_data_set()
    docs = setup_docs()

    # word frequencies
    # print_frequency_dist(docs)

    train_classifier(docs)

    # Examples of classifying new text with the stored model:
    # new_tech_doc = "Google showed off some new camera features on the Pixel 4 today at its annual hardware event, focusing on improvements to its Live HDR and Night Sight mode. The back of the Pixel 4 houses dual cameras in a new subtle square camera bump. There’s a 12.2MP main camera and a 16MP telephoto lens, which is a hybrid of optical and digital zoom. New Pixel 4 features include Live HDR+, with dual exposure controls in the viewfinder, which shows how photos will look in real time. There are HDR sliders to adjust brightness and shadows when you compose. A learning-based white balance feature is applied to all photo modes, so shots come out with true-to-life colors."
    # classify(new_tech_doc)
    # new_entertainment_doc = "Scarlett Johnasson is working much longer than nine to five! With two films coming out this year, another film currently in production, a wedding to plan on the horizon, and a five-year-old daughter to raise, she has a full plate! The 34-year-old actress still managed to hit the red carpet at this year's Elle Women In Hollywood Awards on Monday, and talk with ET's Nischelle Turner about how she manages to find any semblance of a work-life balance as well as reacting to Dolly Parton requesting that she play her in a biopic."
    # classify(new_entertainment_doc)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment