import nltk
import csv
import pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from myclassify import Analise  # Analise can be found here: https://gist.github.com/ssisaias/fc49e7983a244b8c29b8f069f263216a
from pandas import DataFrame
import numpy
# These are the steps I took to create a MultinomialNB classifier for my graduation thesis.

# Read the labeled examples: each row is (text, class).
dados = []
with open('classification_group.txt', 'r', encoding='utf8') as file:
    reader = csv.reader(file)
    for row in reader:
        dados.append((row[0], row[1]))

# Remove Portuguese stop words, then stem every token.
preprocessor = Analise()
preprocessor.stop_words = set(stopwords.words('portuguese'))
preprocessor.stopwordsnltk = nltk.corpus.stopwords.words('portuguese')
treino = preprocessor.removerStopWords(dados)
treino = preprocessor.aplicastemmer(treino)
""" x = []
for item in treino:
x.append(item[0]) """
x = []
for item in treino:
frase = ""
for witem in item[0]:
frase += witem + " "
x.append(frase)
y = []
for item in treino:
y.append(item[1])
# Load the examples into a pandas DataFrame.
rows = []
for text, label in zip(x, y):
    rows.append({'text': text, 'class': label})
data_frame = DataFrame(rows)

# Shuffle the data so the training order carries no signal.
data = data_frame.reindex(numpy.random.permutation(data_frame.index))
# Build the classifier: bag-of-words counts fed into multinomial Naive Bayes.
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['text'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)
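
# A minimal sketch (not in the original gist): 5-fold cross-validation gives a more
# honest accuracy estimate than the four hand-written test phrases below.
# Assumes scikit-learn's model_selection module is available (sklearn >= 0.18).
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(MultinomialNB(), counts, targets, cv=5)
print('cross-validated accuracy: %.3f' % cv_scores.mean())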
## QUICK VALIDATION
# Four hand-written phrases; from the labels below, '1' appears to mark offensive
# text and '0' non-offensive text.
teste = ["Vagabunda", "Filho da puta", "Ele me ama", "lindo"]
teste_counts = count_vectorizer.transform(teste)
predictions = classifier.predict(teste_counts)
print(predictions)

markups = ['1', '1', '0', '0']  # expected labels
diferencas = predictions == markups
acertos = [a for a in diferencas if a]
rate = 100.0 * len(acertos) / len(predictions)
print(rate)
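
# Equivalent check (not in the original gist): sklearn's accuracy_score computes
# the same hit rate as the manual loop above.
from sklearn.metrics import accuracy_score
print(100.0 * accuracy_score(markups, predictions))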
## Precision, recall, f-score, support
# Recall under the three sklearn averaging schemes.
from sklearn.metrics import recall_score
print(recall_score(markups, predictions, average='macro'))
print(recall_score(markups, predictions, average='micro'))
print(recall_score(markups, predictions, average='weighted'))

# Precision under the same averaging schemes.
from sklearn.metrics import precision_score
print(precision_score(markups, predictions, average='macro'))
print(precision_score(markups, predictions, average='micro'))
print(precision_score(markups, predictions, average='weighted'))
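
# Completing the heading above (not in the original gist): f-score and support in
# one call. classification_report prints per-class precision, recall, f1 and support;
# precision_recall_fscore_support returns the same numbers as arrays.
from sklearn.metrics import classification_report, precision_recall_fscore_support
print(classification_report(markups, predictions))
print(precision_recall_fscore_support(markups, predictions, average=None))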
# Save the model and the vectorizer.
with open('classifier.pickle', 'wb') as handle:
    pickle.dump(classifier, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('count_vectorizer.pickle', 'wb') as handle:
    pickle.dump(count_vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load
# with open('classifier.pickle', 'rb') as handle:
#     classifier = pickle.load(handle)
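# A minimal usage sketch (not in the original gist): reload both pickles and classify
# a new sentence. The vectorizer must be the same one fitted above, otherwise the
# feature indices will not match the trained model.
# with open('count_vectorizer.pickle', 'rb') as handle:
#     count_vectorizer = pickle.load(handle)
# novo = count_vectorizer.transform(['Ele me ama'])
# print(classifier.predict(novo))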