Skip to content

Instantly share code, notes, and snippets.

@Awuor87
Created April 4, 2017 15:35
Show Gist options
  • Save Awuor87/fcdc5b44b793791714ddf4beff4cc9f8 to your computer and use it in GitHub Desktop.
Save Awuor87/fcdc5b44b793791714ddf4beff4cc9f8 to your computer and use it in GitHub Desktop.
Data Science in Python
import codecs
import glob
import os

import numpy
from pandas import DataFrame
from pandas import concat
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Health data set: (glob pattern, class label) pairs, one per corpus folder.
# The patterns are built with os.path.join so that they use the path
# separator of the platform the script runs on instead of a hard-coded
# Windows backslash (which matches nothing elsewhere).
SOURCES = [
    (os.path.join('HealthProNonPro', 'NonPro', '*.txt'), 'BAD'),
    (os.path.join('HealthProNonPro', 'Pro', '*.txt'), 'GOOD'),
]
# read documents from corpus
def read_files(path):
    """Yield ``(file_name, text)`` for every file matching the glob *path*.

    Each file is decoded as UTF-8, with undecodable bytes dropped, and every
    newline in its contents is replaced by a single space.

    Files are visited in sorted name order because ``glob.glob`` makes no
    promise about the order of its results.
    """
    for file_name in sorted(glob.glob(path)):
        # Ignore decoding errors so that one badly encoded document does not
        # abort the whole corpus load.
        with codecs.open(file_name, "r", encoding='utf-8', errors='ignore') as handle:
            text = handle.read()
        # The file is already closed here, so no handle stays open while the
        # consumer works on the yielded document.
        yield file_name, text.replace('\n', ' ')
# put corpus in data frame format for easy manipulation
def build_data_frame(path, classification):
    """Load every document matching the glob *path* into a labelled DataFrame.

    The returned frame has one row per file, indexed by file name, with the
    document contents in the ``text`` column and *classification* repeated
    in the ``class`` column.
    """
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)
    # Name the columns explicitly so that a pattern matching no files still
    # yields a frame with the expected ``text`` / ``class`` columns.
    return DataFrame(rows, index=index, columns=['text', 'class'])
# read the corpus data: one labelled frame per source
frames = [build_data_frame(path, classification)
          for path, classification in SOURCES]
# Stack the frames with a single concat rather than growing the table with
# DataFrame.append per source: append copied the accumulated rows each time
# and was removed in pandas 2.0.
data = concat(frames) if frames else DataFrame({'text': [], 'class': []})
# randomize corpus data; the rows would otherwise stay grouped by source
data = data.reindex(numpy.random.permutation(data.index))
# Data transformation and classification pipeline:
# raw text -> token counts ('vect') -> TF-IDF weights ('tfidf')
# -> multinomial naive Bayes classifier ('clf').
# Estimator references:
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
# http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html
pipeline = Pipeline(steps=[
    (
        'vect',
        CountVectorizer(lowercase=True, stop_words=None),
    ),
    (
        'tfidf',
        TfidfTransformer(use_idf=True),
    ),
    (
        'clf',
        MultinomialNB(alpha=1),
    ),
])
# do k-fold cross-validation
# https://en.wikipedia.org/wiki/Cross-validation_(statistics)
# http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html
# NOTE: this is the legacy sklearn.cross_validation API, which was removed in
# scikit-learn 0.20; sklearn.model_selection.KFold(n_splits=6).split(data)
# is its replacement.
k_fold = KFold(n=len(data), n_folds=6)
scores = []
# Pin the label order so that every fold yields a confusion matrix of the
# same shape. Without it, a fold containing a single class produces a smaller
# matrix that "+=" would silently broadcast into every cell of the total.
class_labels = sorted(data['class'].astype(str).unique())
confusion = numpy.zeros((len(class_labels), len(class_labels)), dtype=int)
for train_indices, test_indices in k_fold:
    train_rows = data.iloc[train_indices]
    test_rows = data.iloc[test_indices]
    train_text = train_rows['text'].values
    train_y = train_rows['class'].values.astype(str)
    test_text = test_rows['text'].values
    test_y = test_rows['class'].values.astype(str)
    # Refit from scratch on this fold's training rows, then evaluate on the
    # held-out rows.
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)
    # Accumulate the per-fold confusion matrix and record the per-fold F1
    # score for the 'GOOD' class.
    confusion += confusion_matrix(test_y, predictions, labels=class_labels)
    score = f1_score(test_y, predictions, pos_label='GOOD')
    scores.append(score)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment