Last active
October 13, 2015 16:38
-
-
Save lesteve/66cb8ed3b9cb4cfb40db to your computer and use it in GitHub Desktop.
Reproducing scikit-learn issue #5136
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
from sklearn.naive_bayes import MultinomialNB | |
import unicodedata | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cross_validation import train_test_split | |
# Directory holding the CSV files used to reproduce the issue.
path = './BUG-SKLEARN-MNB/'

#%% load and normalize the data
# Optional accent/case normalisation, currently disabled:
# def norm(x):
#     return unicodedata.normalize('NFKD', str(x).lower()).encode('ASCII', 'ignore').decode('ASCII')

# Load the light training set; a missing brand (Marque) is replaced by the
# 'AUCUNE' placeholder so it can be concatenated with the other text columns.
tr_df = pd.read_csv(path + 'training_norm_light_th100_2.csv', sep=";", encoding='utf-8')
tr_df.Marque = tr_df.Marque.fillna('AUCUNE')

# Encode every distinct Categorie3 value as an integer id (0..n-1, following
# the order returned by unique()); y holds the encoded target for each row.
un_cat3 = tr_df.Categorie3.unique()
rep = pd.Series(range(len(un_cat3)), index=un_cat3)
y = tr_df.Categorie3.map(rep)

# Load the test set; only Libelle and Marque get the missing-value placeholder.
te_df = pd.read_csv(path + 'test.csv', sep=";", encoding='utf-8')
# te_df['Description']=te_df.Description.map(norm)
# te_df['Libelle']=te_df.Libelle.map(norm)
te_df.Libelle = te_df.Libelle.fillna('AUCUNE')
te_df.Marque = te_df.Marque.fillna('AUCUNE')
# generic iterator
from sklearn.pipeline import Pipeline | |
import time | |
import sys | |
import scipy | |
# from nltk.classify.maxent import MaxentClassifier | |
from sklearn.base import clone | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Seeded generator shared by every random draw in this script, so that
# successive runs pick the same samples.
r = np.random.RandomState(42)


def draw(a, size):
    """Return ``size`` items read cyclically from a random permutation of ``a``.

    ``a`` is shuffled once with the module-level generator ``r``; the shuffled
    sequence is then read from its start, wrapping around when ``size`` is
    larger than ``len(a)``.

    Parameters
    ----------
    a : array-like or int
        Values to sample from (anything ``RandomState.permutation`` accepts).
    size : int
        Number of items to return.

    Returns
    -------
    list
        ``size`` elements of the shuffled sequence; empty when ``size`` is 0.

    Raises
    ------
    ZeroDivisionError
        If ``a`` is empty and ``size`` is greater than 0.
    """
    shuffled = r.permutation(a)
    return [shuffled[i % len(shuffled)] for i in range(size)]
# All encoded class ids present in the training target.
classes = y.unique()

# Hold-out set: one randomly chosen row index per category.
# NOTE: this must run before the draw() calls below, because both consume the
# shared generator ``r`` and the order of consumption determines the samples.
one_te = list(
    pd.DataFrame(y).groupby("Categorie3").apply(lambda x: r.choice(x.index, 1)[0])
)

batch_size = 5  # rows taken per category in each batch
batch_nb = 1    # number of batches

# Candidate training rows: every row that is not in the hold-out set.
cur_set = set(tr_df.index)
cur_set = cur_set.difference(one_te)

mnb_iter2 = MultinomialNB(fit_prior=False, alpha=0.25)

# For each category, a list of batch_size * batch_nb row indices produced by
# draw() from the remaining rows of that category.
bycat3 = (
    pd.DataFrame(y.loc[list(cur_set)])
    .groupby("Categorie3")
    .apply(lambda x: draw(x.index, batch_size * batch_nb))
)

# The TF-IDF vocabulary is fitted on the *test* documents, each document being
# "Description Marque Libelle". ``te_df`` is used directly: the former
# ``te_df.ix[:]`` selected every row and ``.ix`` no longer exists in pandas.
HV = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 1), max_features=None, stop_words=None)
HV.fit(te_df.apply(lambda x: x.Description + ' ' + x.Marque + ' ' + x.Libelle, axis=1))

# Flat list of every training row index drawn above, across all categories.
ind_all = [xx for x in bycat3 for xx in x]
# Fit each (name, estimator) pair on the drawn batches and compare three ways
# of computing accuracy on the hold-out rows.
for mod in [("mnb_iter4", mnb_iter2)]:
    cl = mod[1]
    #cl2=clone(cl)
    for e in range(0, 0 + batch_nb):
        t0 = time.time()

        # Collect the row indices of batch ``e``: for every category take the
        # entries at positions e*batch_size .. e*batch_size + batch_size - 1
        # of its drawn list, then drop duplicates.
        ind = []
        for b in range(batch_size):
            ind.extend(list(bycat3.apply(lambda x: x[(e * batch_size + b)])))
        ind = list(set(ind))

        print('fitting tfidf on hash vector')
        sys.stdout.flush()
        # ``ind`` and ``one_te`` hold index labels, hence label-based ``.loc``
        # (replaces the removed ``.ix`` indexer).
        X = HV.transform(
            tr_df.loc[ind].apply(lambda x: x.Description + ' ' + x.Marque + ' ' + x.Libelle, axis=1)
        )
        X_te = HV.transform(
            tr_df.loc[one_te].apply(lambda x: x.Description + ' ' + x.Marque + ' ' + x.Libelle, axis=1)
        )
        print('tfidf on hash vector fitted')
        print('X size is ', X.shape)
        t1 = time.time()
        print(t1 - t0, 'sec')
        sys.stdout.flush()

        #tr=range(len(ind))
        print('fitting model')
        sys.stdout.flush()
        # Per-sample weights inversely proportional to the class frequency in
        # this batch, scaled so the most frequent class gets weight 1.
        y_tr = y.loc[ind]
        a = y_tr.value_counts()
        w = 1 / a * max(a)
        weights = [w.loc[label] for label in y_tr]
        cl.fit(X, y_tr.values, sample_weight=weights)
        #cl.partial_fit(X,y[ind].values,classes=classes,sample_weight=weights)
        print('model fitted')
        t2 = time.time()
        print(t2 - t1, 'sec')
        sys.stdout.flush()

        # Predictions obtained two ways: argmax over predict_proba, and predict().
        prob_te = cl.predict_proba(X_te)
        pred_via_prob = cl.classes_[prob_te.argmax(axis=1)]
        pred = cl.predict(X_te)

        y_te = y.loc[one_te]
        # score method 1: the estimator's own score()
        print("score methode 1", cl.score(X_te, y_te))
        # score method 2: accuracy of predict()
        print("score methode 2", np.mean(pred == y_te.values))
        # score method 3: accuracy of argmax(predict_proba)
        print("score methode 3", np.mean(pred_via_prob == y_te.values))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment