Skip to content

Instantly share code, notes, and snippets.

@lesteve
Last active October 13, 2015 16:38
Show Gist options
  • Save lesteve/66cb8ed3b9cb4cfb40db to your computer and use it in GitHub Desktop.
Save lesteve/66cb8ed3b9cb4cfb40db to your computer and use it in GitHub Desktop.
Reproducing scikit-learn issue #5136
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
# Directory holding the CSV files used by this reproduction script.
path = './BUG-SKLEARN-MNB/'

#%% load and normalise the data
# The normalisation helper is disabled; it is kept here for reference only.
# def norm(x):
#     return unicodedata.normalize('NFKD', str(x).lower()).encode('ASCII', 'ignore').decode('ASCII')

# Load the light training set; a missing brand becomes the placeholder 'AUCUNE'.
tr_df = pd.read_csv(path + 'training_norm_light_th100_2.csv', sep=";", encoding='utf-8')
tr_df['Marque'] = tr_df['Marque'].fillna('AUCUNE')

# Give every distinct Categorie3 value an integer code (numbered in the order
# returned by unique()) and use those codes as the target vector.
un_cat3 = tr_df['Categorie3'].unique()
rep = pd.Series(range(un_cat3.shape[0]), index=un_cat3)
y = tr_df['Categorie3'].map(rep)

# Load the test set; the text normalisation step is disabled here as well.
te_df = pd.read_csv(path + 'test.csv', sep=";", encoding='utf-8')
# te_df['Description'] = te_df.Description.map(norm)
# te_df['Libelle'] = te_df.Libelle.map(norm)
for text_column in ('Libelle', 'Marque'):
    te_df[text_column] = te_df[text_column].fillna('AUCUNE')
#iterateur generique
from sklearn.pipeline import Pipeline
import time
import sys
import scipy
# from nltk.classify.maxent import MaxentClassifier
from sklearn.base import clone
from sklearn.metrics.pairwise import cosine_similarity
# Shared random generator with a fixed seed, used for every random draw below.
r = np.random.RandomState(42)


def draw(a, size):
    """Return ``size`` items taken cyclically from a random permutation of ``a``.

    ``a`` is shuffled once with the module-level generator ``r``. The result
    walks that shuffled order and wraps around when ``size`` exceeds the
    number of items.

    Args:
        a: Array-like of items (or any other input ``r.permutation`` accepts).
        size: Number of items to return; zero or less yields an empty list.

    Returns:
        A list of ``size`` items.

    Raises:
        ValueError: If ``a`` is empty while ``size`` is positive.
    """
    shuffled = r.permutation(a)
    count = len(shuffled)
    if count == 0:
        if size > 0:
            # Without this guard the modulo below fails with an opaque
            # ZeroDivisionError.
            raise ValueError("cannot draw %d item(s) from an empty collection" % size)
        return []
    return [shuffled[i % count] for i in range(size)]
# Distinct integer class codes present in the training labels.
classes = y.unique()

# Hold out one randomly chosen training row per class; these rows are scored later.
one_te = list(pd.DataFrame(y[:]).groupby("Categorie3").apply(lambda x: r.choice(x.index, 1)[0]))

batch_size = 5
batch_nb = 1

# Candidate training rows: every row label that was not held out.
cur_set = set(tr_df.index)
cur_set = cur_set.difference(one_te)

mnb_iter2 = MultinomialNB(fit_prior=False, alpha=0.25)

# For every class, draw batch_size * batch_nb row labels; draw() cycles through
# the class's rows when it has fewer rows than requested.
bycat3 = pd.DataFrame(y.loc[list(cur_set)]).groupby("Categorie3").apply(
    lambda x: draw(x.index, batch_size * batch_nb))

HV = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 1), max_features=None, stop_words=None)
# The vectoriser is fitted on the test file's text (description + brand + label).
# `.ix` was removed from pandas; applying on the whole frame is equivalent to `.ix[:]`.
HV.fit(te_df.apply(lambda x: x.Description + ' ' + x.Marque + ' ' + x.Libelle, axis=1))

# Flat list of every drawn row label, across all classes.
ind_all = [xx for x in bycat3 for xx in x]
# Fit each (name, estimator) pair on the drawn batches and score it on the
# held-out rows with three different accuracy computations.
for mod in [("mnb_iter4", mnb_iter2)]:
    cl = mod[1]
    # cl2 = clone(cl)
    for e in range(0, 0 + batch_nb):
        t0 = time.time()

        # Collect, for every class, the row labels belonging to batch `e`.
        ind = []
        for b in range(batch_size):
            pos = e * batch_size + b
            ind.extend(list(bycat3.apply(lambda x: x[pos])))
        # draw() may repeat a row for small classes; keep each row only once.
        ind = list(set(ind))

        print('fitting tfidf on hash vector')
        sys.stdout.flush()
        # `.loc` replaces the removed `.ix` indexer; `ind` and `one_te` hold row labels.
        X = HV.transform(tr_df.loc[ind].apply(
            lambda x: x.Description + ' ' + x.Marque + ' ' + x.Libelle, axis=1))
        X_te = HV.transform(tr_df.loc[one_te].apply(
            lambda x: x.Description + ' ' + x.Marque + ' ' + x.Libelle, axis=1))
        print('tfidf on hash vector fitted')
        print('X size is ', X.shape)
        t1 = time.time()
        print(t1 - t0, 'sec')
        sys.stdout.flush()

        # tr = range(len(ind))
        print('fitting model')
        sys.stdout.flush()
        # Weight every sample by (largest class count / its own class count).
        a = y.loc[ind].value_counts()
        w = 1 / a * max(a)
        weights = [w[x] for x in y.loc[ind]]
        cl.fit(X, y.loc[ind].values, sample_weight=weights)
        # Alternative kept for reference:
        # cl.partial_fit(X, y.loc[ind].values, classes=classes, sample_weight=weights)
        print('model fitted')
        t2 = time.time()
        print(t2 - t1, 'sec')
        sys.stdout.flush()

        prob_te = cl.predict_proba(X_te)
        pred_via_prob = cl.classes_[prob_te.argmax(axis=1)]
        pred = cl.predict(X_te)

        # Method 1: the estimator's own score().
        print("score methode 1", cl.score(X_te, y.loc[one_te]))
        # Method 2: accuracy of predict().
        print("score methode 2", np.mean(pred == y.loc[one_te]))
        # Method 3: accuracy of the argmax over predict_proba().
        print("score methode 3", np.mean(pred_via_prob == y.loc[one_te]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment