Comparison of perf for various forest inference implementations
"""Compare inference performance of XGBoost, Treelite (GTIL and compiled),
and cuML's experimental ForestInference (FIL) on CPU and GPU.

Expects a saved XGBoost model with 792 input features at '0.model'.
"""
import cupy as cp
import os
import numpy as np
import treelite
import treelite_runtime
import xgboost as xgb
from time import perf_counter
from cuml.common.device_selection import using_device_type
from cuml.experimental import ForestInference


def find_optimal_fil_params(model_path, iterations, batch_size):
    """Load a FIL model and time several chunk sizes, returning the fastest."""
    fil_model = ForestInference.load(model_path)
    optimal_chunk_size = 1
    optimal_timing = float('inf')
    optimization_data = np.random.uniform(0, 1, (iterations, batch_size, 792))
    for log_chunk_size in range(6):
        chunk_size = 2 ** log_chunk_size
        print(chunk_size)
        start = perf_counter()
        for iter_index in range(iterations):
            X = optimization_data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        elapsed = end - start
        if elapsed < optimal_timing:
            optimal_timing = elapsed
            optimal_chunk_size = chunk_size
    return fil_model, optimal_chunk_size


def test_predicts(batch_size, iterations):
    """Benchmark each implementation at the given batch size and return
    a dict of average per-batch prediction times."""
    results = {}
    model_path = '0.model'
    data = np.random.uniform(0, 1, (iterations, batch_size, 792))
    data_gpu = cp.asarray(data)

    # XGBoost with the GPU predictor, host (numpy) input
    tree_model = xgb.Booster()
    tree_model.load_model(model_path)
    tree_model.set_param({'predictor': 'gpu_predictor'})
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost GPU time: ', average_time)
    results['XGB_GPU'] = average_time

    # XGBoost with the GPU predictor, device (cupy) input
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data_gpu[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost GPU (Native I/O) time: ', average_time)
    results['XGB_GPU_native'] = average_time

    # XGBoost with the CPU predictor
    tree_model.set_param({'predictor': 'cpu_predictor'})
    start = perf_counter()
    for iter_index in range(iterations):
        X = xgb.DMatrix(data[iter_index])
        y = tree_model.predict(X)
    end = perf_counter()
    average_time = (end - start) / iterations
    print('xgboost CPU time: ', average_time)
    results['XGB_CPU'] = average_time

    # Convert to Treelite and, if needed, build a compiled shared library
    tl_model = treelite.Model.from_xgboost(tree_model)
    compiled_lib_name = '0.so'
    if not os.path.exists(compiled_lib_name):
        tl_model.export_lib(
            toolchain="gcc",
            libpath=compiled_lib_name,
            params={"parallel_comp": 40},
            verbose=False,
        )
    # WARNING: Compiled Treelite models currently have an odd interaction
    # with CPU FIL. Loading a compiled model in the same process where CPU
    # FIL is used causes CPU FIL to use only one thread. To get correct
    # results for CPU FIL, run the Treelite compiled benchmark separately
    # from the other benchmarks.
    #
    # compiled_model = treelite_runtime.Predictor(
    #     compiled_lib_name
    # )
    # start = perf_counter()
    # for iter_index in range(iterations):
    #     X = treelite_runtime.DMatrix(data[iter_index])
    #     y = compiled_model.predict(X)
    # end = perf_counter()
    # average_time = (end - start) / iterations
    # results['TL_compiled'] = average_time
    # print('TL compiled time: ', average_time)

    # Treelite GTIL (CPU) prediction on the in-memory model
    start = perf_counter()
    for iter_index in range(iterations):
        X = data[iter_index]
        y = treelite.gtil.predict(tl_model, X)
    end = perf_counter()
    average_time = (end - start) / iterations
    results['GTIL'] = average_time
    print('GTIL time: ', average_time)

    # FIL on CPU, host (numpy) input
    with using_device_type('cpu'):
        # Finding the optimal chunk size is not required to use FIL, but
        # doing so can squeeze some extra performance out of your model.
        fil_model, chunk_size = find_optimal_fil_params(
            model_path, iterations, batch_size
        )
        start = perf_counter()
        for iter_index in range(iterations):
            X = data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_CPU'] = average_time
        print('FIL CPU time: ', average_time)

    # FIL on GPU, host (numpy) input
    with using_device_type('gpu'):
        fil_model, chunk_size = find_optimal_fil_params(
            model_path, iterations, batch_size
        )
        start = perf_counter()
        for iter_index in range(iterations):
            X = data[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_GPU'] = average_time
        print('FIL GPU time: ', average_time)

    # FIL on GPU, device (cupy) input
    with using_device_type('gpu'):
        start = perf_counter()
        for iter_index in range(iterations):
            X = data_gpu[iter_index]
            y = fil_model.predict(X, chunk_size=chunk_size)
        end = perf_counter()
        average_time = (end - start) / iterations
        results['FIL_GPU_native'] = average_time
        print('FIL GPU (native I/O) time: ', average_time)

    return results


if __name__ == "__main__":
    all_results = {}
    iterations = 10
    batch_sizes = [1, 10, 1_000, 10_000]
    for batch_size_ in batch_sizes:
        print(f'Benchmarking batch size {batch_size_}')
        all_results[batch_size_] = test_predicts(
            batch_size_, iterations
        )
    # Print results as CSV: one row per batch size, one column per implementation
    algo_names = sorted(all_results[1].keys())
    print(",".join(('batch_size', *algo_names)))
    for batch_size_ in batch_sizes:
        results = [
            str(all_results[batch_size_][algo_name_])
            for algo_name_ in algo_names
        ]
        print(",".join((str(batch_size_), *results)))