Last active
August 29, 2015 14:21
-
-
Save tokoroten/629bf0a70d1a53d585cc to your computer and use it in GitHub Desktop.
scikit-learn randomforest serialize problem
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import os.path | |
import shutil | |
import random | |
import math | |
import cPickle | |
import pickle | |
import sklearn.ensemble | |
import sklearn.externals.joblib | |
import sklearn.tree | |
def dump_pickle(model, base_path):
    """Serialize *model* with pickle to <base_path>/model.pkl.

    Returns the size of the written file in bytes.
    """
    path = base_path + "/model.pkl"
    # Use a context manager so the handle is closed (and flushed) before
    # stat-ing the file; the original leaked the open file object, so the
    # reported size could miss buffered, unflushed bytes.
    with open(path, "wb") as fp:
        pickle.dump(model, fp)
    return os.path.getsize(path)
def dump_cPickle(model, base_path):
    """Serialize *model* with cPickle to <base_path>/model.pkl.

    Returns the size of the written file in bytes.
    """
    path = base_path + "/model.pkl"
    # Close the handle before measuring: the original never closed the
    # file, so os.path.getsize could see an incompletely flushed file.
    with open(path, "wb") as fp:
        cPickle.dump(model, fp)
    return os.path.getsize(path)
def dump_joblib(model, base_path):
    """Serialize *model* into *base_path* via joblib.

    joblib may write several companion files (e.g. .npy arrays) next to
    model.pkl, so the returned size is the byte total of every file found
    in *base_path* after the dump.
    """
    sklearn.externals.joblib.dump(model, base_path + "/model.pkl")
    return sum(
        os.path.getsize(base_path + '/' + name)
        for name in os.listdir(base_path)
    )
def dump_dot(model, base_path):
    """Export each tree of the forest as GraphViz .dot under *base_path*.

    One file per estimator (tree_0.dot, tree_1.dot, ...); the single
    feature is labelled 'x'. Returns the combined size of all exported
    files in bytes.
    """
    written = []
    for idx, tree in enumerate(model.estimators_):
        out_path = base_path + "/tree_%d.dot" % idx
        sklearn.tree.export_graphviz(
            tree,
            out_path,
            ['x'])
        written.append(out_path)
    return sum(os.path.getsize(p) for p in written)
def generate_data(sample_num = 10000, random_seed = None):
    """Generate a deterministic sin() regression dataset.

    Each sample is x drawn uniformly from [0, 2*pi) with target sin(x).

    :param sample_num: number of samples to generate.
    :param random_seed: seed for the private Random instance; pass a
        fixed value for reproducible data.
    :return: (label, value) where value[i] == [x_i] and
        label[i] == sin(x_i).
    """
    rnd = random.Random(random_seed)
    label = []
    value = []
    # range instead of xrange: identical iteration behavior on Python 2,
    # and it keeps this pure-stdlib helper portable to Python 3.
    for _ in range(sample_num):
        x = rnd.random() * math.pi * 2
        y = math.sin(x)
        value.append([x])
        label.append(y)
    return label, value
def get_trained_model():
    """Fit a 1000-tree random forest on the seeded sin() dataset.

    bootstrap=True is deliberate: it makes each tree carry the bootstrap
    bookkeeping this script measures. Returns the fitted regressor.
    """
    forest_params = dict(
        n_estimators=1000,
        n_jobs=-1,
        max_depth=3,
        bootstrap=True,
    )
    regressor = sklearn.ensemble.RandomForestRegressor(**forest_params)
    targets, features = generate_data(100000, 1)
    regressor.fit(features, targets)
    return regressor
def model_test(model): | |
label, value = generate_data(100, 2) | |
predicted_labels = model.predict(value) | |
score = sum(abs(label - predicted_labels)) | |
print "score =", score | |
def reset_folder():
    """Delete and recreate the per-serializer output directories.

    Each dump_* helper writes into its own folder so the sizes of the
    two passes (before/after deleting indices_) can be compared without
    stale files inflating the totals.
    """
    folder_list = [
        'pickle_1',
        'pickle_2',
        'cpickle_1',
        'cpickle_2',
        'joblib_1',
        'joblib_2',
        'dot_1',
        'dot_2',
    ]
    for folder in folder_list:
        # Only ignore "folder does not exist yet". The original bare
        # except: also swallowed KeyboardInterrupt/SystemExit and real
        # permission errors, silently leaving stale dumps behind.
        try:
            shutil.rmtree(folder)
        except OSError:
            pass
        os.mkdir(folder)
def main():
    """Benchmark serialized RandomForest sizes before and after deleting
    each tree's indices_ attribute, printing sizes and compression rates.

    Pass 1 dumps the freshly trained model with cPickle, pickle, joblib
    and GraphViz .dot export; pass 2 repeats the dumps after stripping
    indices_ and prints the size ratio of pass 2 over pass 1.
    Returns the trained model.
    """
    reset_folder()
    model = get_trained_model()
    model_test(model)
    print "normal dump size"
    cpickle_size_1 = dump_cPickle(model, 'cpickle_1')
    print "dump_cPickle_1=", cpickle_size_1, "%.2fMB" % (cpickle_size_1 / 2.0**20)
    pickle_size_1 = dump_pickle(model, 'pickle_1')
    print "dump_pickle_1=", pickle_size_1, "%.2fMB" % (pickle_size_1 / 2.0**20)
    dump_joblib_1 = dump_joblib(model, 'joblib_1')
    print "dump_joblib_1=", dump_joblib_1, "%.2fMB" % (dump_joblib_1 / 2.0**20)
    dump_dot_1 = dump_dot(model, 'dot_1')
    print "dump_dot_1=", dump_dot_1, "%.2fMB" % (dump_dot_1 / 2.0**20)
    # Strip indices_ from every tree: it is the per-tree bootstrap
    # bookkeeping the author identifies as dominating the dump size.
    for e in model.estimators_:
        del e.indices_
    model_test(model)  # predictions are unaffected by deleting indices_
    print "delete indices_ dump size"
    cpickle_size_2 = dump_cPickle(model, 'cpickle_2')
    print "dump_cPickle_2=", cpickle_size_2, "%.2fMB" % (cpickle_size_2 / 2.0**20)
    pickle_size_2 = dump_pickle(model, 'pickle_2')
    print "dump_pickle_2=", pickle_size_2, "%.2fMB" % (pickle_size_2 / 2.0**20)
    dump_joblib_2 = dump_joblib(model, 'joblib_2')
    print "dump_joblib_2=", dump_joblib_2, "%.2fMB" % (dump_joblib_2 / 2.0**20)
    dump_dot_2 = dump_dot(model, 'dot_2')
    print "dump_dot_2=", dump_dot_2, "%.2fMB" % (dump_dot_2 / 2.0**20)
    # Ratio of stripped size over original size, as a percentage
    # (smaller = better compression; .dot export is unchanged at 100%).
    print "compress_rate"
    print "cpickle_compress", "%.2f%%" % (float(cpickle_size_2) / float(cpickle_size_1) * 100)
    print "pickle_compress", "%.2f%%" % (float(pickle_size_2) / float(pickle_size_1) * 100)
    print "joblib_compress", "%.2f%%" % (float(dump_joblib_2) / float(dump_joblib_1) * 100)
    print "dot_compress", "%.2f%%" % (float(dump_dot_2) / float(dump_dot_1) * 100)
    return model
if __name__ == "__main__":
    # Run the benchmark; main() returns the trained model, which stays
    # bound at module level (handy when run with `python -i`).
    model = main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
score = 9.8706322981 | |
normal dump size | |
dump_cPickle_1= 403963904 385.25MB | |
dump_pickle_1= 404131840 385.41MB | |
dump_joblib_1= 101802336 97.09MB | |
dump_dot_1= 1336226 1.27MB | |
score = 9.8706322981 | |
delete indices_ dump size | |
dump_cPickle_2= 3903488 3.72MB | |
dump_pickle_2= 4034560 3.85MB | |
dump_joblib_2= 1671323 1.59MB | |
dump_dot_2= 1336226 1.27MB | |
compress_rate | |
cpickle_compress 0.97% | |
pickle_compress 1.00% | |
joblib_compress 1.64% | |
dot_compress 100.00% |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The problem is here:
https://github.com/scikit-learn/scikit-learn/blob/64e553398b3873de3d9d1e67f0cccb20e01824bc/sklearn/ensemble/forest.py#L96
If the bootstrap option is True, each estimator stores a member variable (indices_) that is used to calculate oob_score.
Its memory size is (number_of_forest_trees * len(train_dataset) * sizeof(bool)).
So when the model is trained on a large dataset, serializing it takes a large amount of memory.