Last active
May 30, 2017 18:27
-
-
Save abarmat/8ac1f90a5c81927110966269d3502f52 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint | |
import re | |
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn import metrics, cross_validation | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.feature_extraction import DictVectorizer | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer | |
from sklearn.naive_bayes import MultinomialNB | |
from sklearn.pipeline import Pipeline, FeatureUnion | |
from sklearn.svm import LinearSVC, SVC | |
from nltk.stem.snowball import SpanishStemmer | |
from nltk.corpus import stopwords | |
import xgboost | |
class MultiItemSelector(BaseEstimator, TransformerMixin):
    """Select one or more columns and emit one feature dict per row.

    ``transform`` returns a list of ``{column: value}`` dicts, one per
    row of its input.

    Parameters
    ----------
    key : label or list/tuple of labels
        Column label (or labels) looked up on the object passed to
        ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Do nothing and return ``self``; no state is learned."""
        return self

    def transform(self, data_dict):
        """Return a list with one ``{column: value}`` dict per row.

        Parameters
        ----------
        data_dict : indexable by ``self.key``
            Assumed to be a pandas DataFrame when ``key`` is a list/tuple
            (``to_dict`` is called on the selection); for a single label
            the selected column only needs to be iterable.
        """
        if isinstance(self.key, (list, tuple)):
            # Several columns: a list cannot be used as a dict key, so
            # build one dict per row holding every selected column.
            return data_dict[list(self.key)].to_dict(orient='records')
        # Single column: iterate the values directly instead of relying on
        # ``Series.iteritems()``, which newer pandas releases removed.
        return [{self.key: value} for value in data_dict[self.key]]
class ItemSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that forwards a single entry of its input.

    Parameters
    ----------
    key : label
        Key looked up on the object passed to ``transform``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """No fitting is performed; the selector itself is returned."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` unchanged."""
        selected = data_dict[self.key]
        return selected
def read_file(filename):
    """Load a ``;``-separated CSV and split it into features and labels.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` holds every column
        except the last one and ``labels`` is the ``'Clase'`` column.

    Raises
    ------
    KeyError
        If the file has no ``'Clase'`` column.
    """
    frame = pd.read_csv(filename, sep=';')
    # ``.iloc`` is the positional indexer; the ``.ix`` indexer used
    # previously is no longer available in current pandas releases.
    return frame.iloc[:, :-1], frame['Clase']
def save_prediction(filename, data):
    """Write predictions to ``filename`` as ``;``-separated lines.

    Each output line is the predicted label followed by the class
    probabilities rounded to four decimals.

    Parameters
    ----------
    filename : str
        Destination path; the file is overwritten.
    data : iterable of (label, probabilities)
        ``probabilities`` must itself be iterable over numbers.
    """
    with open(filename, 'w') as f:
        for label, probabilities in data:
            # Build a real list: on Python 3 ``map`` returns an iterator,
            # which cannot be concatenated to a list.
            fields = [str(label)]
            fields.extend(str(round(p, 4)) for p in probabilities)
            f.write(';'.join(fields) + '\n')
def parse_doc(title, body):
    """Join a title and a body into a single text document.

    Missing values (``None`` or any float NaN) count as empty strings and
    byte strings are decoded as Latin-1, so the result is always text.

    Parameters
    ----------
    title, body : str, bytes, None or float NaN
        The two text fields of a record.

    Returns
    -------
    str
        ``title`` and ``body`` separated by one space, with surrounding
        whitespace stripped.
    """
    parts = []
    for value in (title, body):
        if value is None or (isinstance(value, float) and np.isnan(value)):
            # An identity test against ``np.nan`` misses NaN objects that
            # are not that exact singleton, so test the value instead.
            value = u''
        elif isinstance(value, bytes):
            # Only raw bytes need decoding; text is already decoded and
            # has no ``decode`` method on Python 3.
            value = value.decode('latin-1')
        parts.append(value)
    return u' '.join(parts).strip()
def process(X, y):
    """Add a combined ``'body'`` text column to ``X`` and convert ``y``.

    The ``'tit'`` and ``'des'`` columns of each row are merged with
    ``parse_doc``. ``X`` is modified in place (the new column is assigned
    on the object passed in) and also returned.

    Returns
    -------
    tuple
        ``(X, y)`` with ``y`` converted to a NumPy array.
    """
    text_columns = ['tit', 'des']
    rows = X[text_columns].values.tolist()
    documents = [parse_doc(title, description) for title, description in rows]
    # Reuse X's index so the new column lines up row by row.
    X['body'] = pd.Series(documents, index=X.index)
    return (X, np.array(y))
def show_feature_importance(clf):
    """Print the ten highest-weighted features for each of five classes.

    ``clf`` must expose ``named_steps`` with a ``'vec'`` entry providing
    ``get_feature_names()`` and a ``'clf'`` entry providing ``coef_``.
    Rows 0-4 of ``coef_`` are reported under the headers ``[1]``..``[5]``.
    """
    names = np.asarray(clf.named_steps['vec'].get_feature_names())
    for row in range(5):
        # argsort is ascending, so the last ten positions hold the
        # largest coefficients of this row.
        best = np.argsort(clf.named_steps['clf'].coef_[row])[-10:]
        print('[{}]'.format(row + 1))
        # NOTE(review): the number printed next to each feature is its
        # column index, not its coefficient - confirm this is intended.
        for name, column in zip(names[best], best):
            print('+ {} : {}'.format(name, column))
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw ``cm`` as a colour-mapped image on the current pyplot figure.

    Parameters
    ----------
    cm : array-like
        Matrix passed to ``plt.imshow``.
    classes : sequence
        Tick labels used on both axes; its length sets the tick count.
    title : str
        Title of the plot.
    cmap : colormap
        Colour map passed to ``plt.imshow``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes.
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
class CustomTokenizer(object):
    """Callable tokenizer returning lower-cased alphabetic words.

    A token is a run of at least two ASCII letters bounded by word
    boundaries. A ``SpanishStemmer`` is created on construction but is
    not applied to the tokens.
    """

    # Two or more ASCII letters between word boundaries.
    _WORD = re.compile(r"(?u)\b[a-zA-Z][a-zA-Z]+\b")

    def __init__(self):
        self.stemmer = SpanishStemmer()

    def __call__(self, doc):
        """Return the list of lower-cased tokens found in ``doc``."""
        return [word.lower() for word in self._WORD.findall(doc)]
def main():
    """Train the linear SVM text classifier, evaluate it and save output.

    Reads ``tp2-work.train-1.csv`` and ``tp2-work.test-1.csv``, builds a
    pipeline that unions the ``'anio'`` field with n-gram counts of the
    combined ``'body'`` text, fits it on the training set and predicts
    the test set. Depending on the option flags below it also
    cross-validates, prints an evaluation report, prints feature weights
    and writes the predictions to ``svm-pred-test.csv``.
    """
    # Options
    opt_cross_validate = False
    opt_show_report = True
    opt_save_prediction = True
    opt_show_feature_importance = False
    # The two switches below replace literal ``if False:`` guards and
    # keep those branches disabled.
    opt_plot_confusion_matrix = False
    opt_plot_xgb_importance = False

    rng = np.random.RandomState(100)

    # Read and preprocess both data sets.
    X_train, y_train = process(*read_file('tp2-work.train-1.csv'))
    X_test, y_test = process(*read_file('tp2-work.test-1.csv'))

    txt_fields = 'body'
    num_fields = ['anio']

    # Classification pipeline
    extra_stop_words = ['argentina', 'id', 'mls']
    vectorizer = CountVectorizer(
        ngram_range=(2, 4),
        strip_accents='ascii',
        lowercase=False,
        stop_words=stopwords.words('spanish') + extra_stop_words,
        tokenizer=CustomTokenizer(),
        max_df=0.10,
    )
    numeric_features = Pipeline([
        ('sel', MultiItemSelector(num_fields)),
        ('dic', DictVectorizer(sparse=False)),
    ])
    text_features = Pipeline([
        ('sel', ItemSelector(txt_fields)),
        ('vec', vectorizer),
    ])
    clf = Pipeline([
        ('union', FeatureUnion(transformer_list=[
            ('num', numeric_features),
            ('txt', text_features),
        ])),
        ('clf', SVC(probability=True, kernel='linear')),
    ])

    # Cross-validate
    if opt_cross_validate:
        folds = cross_validation.KFold(n=len(X_train), n_folds=10, random_state=rng)
        scores = cross_validation.cross_val_score(
            clf, X_train, y_train, cv=folds, n_jobs=-1
        )
        print(scores)
        print(sum(scores)/len(scores))

    # Fit on the training set, then predict the test set.
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    if opt_show_report:
        # Model performance
        print(metrics.accuracy_score(y_test, y_pred))

        # Confusion matrix
        cm = metrics.confusion_matrix(y_test, y_pred)
        print(cm)
        if opt_plot_confusion_matrix:
            plt.figure()
            plot_confusion_matrix(cm, range(1, 6))
            plt.show()

        print(metrics.classification_report(y_test, y_pred))

    if opt_show_feature_importance:
        show_feature_importance(clf)

    if opt_save_prediction:
        probabilities = clf.predict_proba(X_test)
        save_prediction('svm-pred-test.csv', zip(y_pred, probabilities))

    # Feature importance
    # NOTE(review): this branch calls ``booster().get_fscore()`` on the
    # 'clf' step and looks up a top-level 'vec' step, neither of which
    # matches the SVC pipeline built above - it looks like a leftover
    # from an XGBoost experiment; confirm before enabling.
    if opt_plot_xgb_importance:
        fscores = clf.named_steps['clf'].booster().get_fscore()
        feature_names = clf.named_steps['vec'].get_feature_names()
        importance = {
            name: fscores.get('f{}'.format(position), 0)
            for position, name in enumerate(feature_names)
        }
        total = np.array(importance.values()).sum()
        relative = {k: v/float(total) for k, v in importance.items()}
        top_twenty = sorted(relative.items(), key=lambda e: e[1], reverse=True)[:20]
        df = pd.DataFrame(
            sorted(top_twenty, key=lambda e: e[1]),
            columns=['feature', 'fscore']
        )
        plt.figure()
        df.plot()
        df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(20, 20))
        plt.title('XGBoost Feature Importance')
        plt.xlabel('relative importance')
        plt.gcf().savefig('feature_importance_xgb.png')
# Run the experiment only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment