andfoy · November 8, 2022 23:37
diff --git a/profile_unique_cupy.py b/profile_unique_cupy.py

 import math
 import tqdm
 import cupy as cp
 import numpy as np

 from cupy import testing
 from cupyx.profiler import benchmark


 # Element types covered by the benchmark.
 dtypes = [cp.uint8, cp.int32, cp.int64, cp.float32, cp.float64,
           cp.complex64, cp.complex128]
 # Square 2-D input shapes, from 3x3 up to 10000x10000.
 sizes = [(x, x) for x in [3, 5, 10, 100, 500, 1000, 5000, 10000]]
 # Result table: times[kind][dtype][size] -> dict of timing statistics.
 # Every slot starts with the four keys that gather_time() returns, set to
 # None until that configuration has been measured.  Both kinds share the
 # same keys so the table has one uniform shape.
 times = {
    kind: {
        d: {
            s: dict.fromkeys(
                ('cpu_mean', 'cpu_std', 'gpu_mean', 'gpu_std'))
            for s in sizes
        }
        for d in dtypes
    }
    for kind in ('gpu', 'cpu')
 }

 def gather_time(prof):
    """Summarise one benchmark result as mean/std statistics.

    Args:
        prof: Object exposing ``cpu_times`` and ``gpu_times`` arrays (in
            this script, the value returned by ``benchmark``).

    Returns:
        dict with the keys ``cpu_mean``, ``cpu_std``, ``gpu_mean`` and
        ``gpu_std``.  Each value is the corresponding statistic multiplied
        by 1000 (the report labels these values as milliseconds).
    """
    scale = 1000
    stats = {}
    for device in ('cpu', 'gpu'):
        samples = getattr(prof, f'{device}_times')
        stats[f'{device}_mean'] = samples.mean() * scale
        stats[f'{device}_std'] = samples.std() * scale
    return stats


 def call_cpu(ar, axis=0):
    """Select the unique slices of ``ar`` along ``axis`` via ``np.unique``.

    The array is viewed as 2-D (one row per slice along ``axis``),
    converted to a NumPy array, and de-duplicated with
    ``np.unique(..., return_index=True, axis=0)``.  The indices NumPy
    returns are then used to gather the matching slices from ``ar``.

    Args:
        ar: Input array (presumably a ``cupy.ndarray`` -- it is passed
            straight to ``cp.moveaxis``).
        axis: Axis along which slices are compared.  Defaults to 0.

    Returns:
        The selected slices, with the de-duplicated axis moved back to
        position ``axis``.
    """
    moved = cp.moveaxis(ar, axis, 0)
    shape = moved.shape
    n_slices, trailing = shape[0], shape[1:]
    # Flatten every slice into a single row so NumPy can compare rows.
    rows = moved.reshape(n_slices, math.prod(trailing))
    host_rows = cp.asnumpy(cp.ascontiguousarray(rows))
    keep = np.unique(host_rows, return_index=True, axis=0)[1]
    picked = cp.take(moved, keep, 0)
    picked = picked.reshape(keep.shape[0], *trailing)
    return cp.moveaxis(picked, 0, axis)

 # Implementations being compared, keyed by a short identifier.
 funcs = dict(gpu=cp.unique, cpu=call_cpu)

 # Column title shown for each implementation in the printed table.
 headers = dict(gpu='CuPy-only', cpu='NumPy call')

 # Time every implementation on every (dtype, size) combination and store
 # the statistics in the matching slot of `times`.
 for dtype in dtypes:
    print(dtype)
    for size in sizes:
        # One random input per configuration, shared by all implementations.
        x = testing.shaped_random(size, dtype=dtype)
        for comp_id, func in tqdm.tqdm(funcs.items()):
            prof = benchmark(func, (x,), n_repeat=100)
            times[comp_id][dtype][size] = gather_time(prof)


 # Render the measurements as a Markdown table: one row per (dtype, size)
 # pair and one timing column per implementation in `funcs`.
 header_cells = ' | '.join(f'{headers[kind]} (ms)' for kind in funcs)
 lines = [
    f'| Size | `dtype` | {header_cells} |',
    '|:----:|:-------:|' + ':-------------:|' * len(funcs),
 ]

 for dtype in dtypes:
    dtype_name = dtype.__name__
    for size in sizes:
        size_str = 'x'.join(str(i) for i in size)
        cells = []
        for kind in funcs:
            size_times = times[kind][dtype][size]
            cpu_time = size_times['cpu_mean']
            gpu_time = size_times['gpu_mean']
            if cpu_time is None:
                # No measurement recorded for this implementation.
                continue
            # Report the larger of the CPU-side and GPU-side mean times.
            # NOTE(review): `3f` is a minimum width of 3 with the default
            # six decimals; `.3f` may have been intended -- confirm.
            cells.append(f'{max(cpu_time, gpu_time):3f}')
        # Emit a row only when every implementation has a measurement.
        if len(cells) == len(funcs):
            lines.append(
                ' | '.join([f'| {size_str} | `{dtype_name}`', *cells]))

 print('\n'.join(lines))

	import math
	import tqdm
	import cupy as cp
	import numpy as np

	from cupy import testing
	from cupyx.profiler import benchmark


	# Element types covered by the benchmark.
	dtypes = [cp.uint8, cp.int32, cp.int64, cp.float32, cp.float64,
	          cp.complex64, cp.complex128]
	# Square 2-D input shapes, from 3x3 up to 10000x10000.
	sizes = [(x, x) for x in [3, 5, 10, 100, 500, 1000, 5000, 10000]]
	# Result table: times[kind][dtype][size] -> dict of timing statistics.
	# Every slot starts with the four keys that gather_time() returns, set to
	# None until that configuration has been measured.  Both kinds share the
	# same keys so the table has one uniform shape.
	times = {
	    kind: {
	        d: {
	            s: dict.fromkeys(
	                ('cpu_mean', 'cpu_std', 'gpu_mean', 'gpu_std'))
	            for s in sizes
	        }
	        for d in dtypes
	    }
	    for kind in ('gpu', 'cpu')
	}

	def gather_time(prof):
	    """Summarise one benchmark result as mean/std statistics.

	    Args:
	        prof: Object exposing ``cpu_times`` and ``gpu_times`` arrays (in
	            this script, the value returned by ``benchmark``).

	    Returns:
	        dict with the keys ``cpu_mean``, ``cpu_std``, ``gpu_mean`` and
	        ``gpu_std``.  Each value is the corresponding statistic
	        multiplied by 1000 (the report labels these values as
	        milliseconds).
	    """
	    cpu_samples = prof.cpu_times
	    gpu_samples = prof.gpu_times
	    return {
	        'cpu_mean': cpu_samples.mean() * 1000,
	        'cpu_std': cpu_samples.std() * 1000,
	        'gpu_mean': gpu_samples.mean() * 1000,
	        'gpu_std': gpu_samples.std() * 1000,
	    }


	def call_cpu(ar, axis=0):
	    """Select the unique slices of ``ar`` along ``axis`` via ``np.unique``.

	    The array is viewed as 2-D (one row per slice along ``axis``),
	    converted to a NumPy array, and de-duplicated with
	    ``np.unique(..., return_index=True, axis=0)``.  The indices NumPy
	    returns are then used to gather the matching slices from ``ar``.

	    Args:
	        ar: Input array (presumably a ``cupy.ndarray`` -- it is passed
	            straight to ``cp.moveaxis``).
	        axis: Axis along which slices are compared.  Defaults to 0.

	    Returns:
	        The selected slices, with the de-duplicated axis moved back to
	        position ``axis``.
	    """
	    ar = cp.moveaxis(ar, axis, 0)
	    orig_shape = ar.shape
	    # Flatten every slice into a single row so NumPy can compare rows.
	    ar_cpu = ar.reshape(orig_shape[0], math.prod(orig_shape[1:]))
	    ar_cpu = cp.asnumpy(cp.ascontiguousarray(ar_cpu))
	    _, sorted_indices = np.unique(ar_cpu, return_index=True, axis=0)
	    ar = cp.take(ar, sorted_indices, 0)
	    ar = ar.reshape(sorted_indices.shape[0], *orig_shape[1:])
	    return cp.moveaxis(ar, 0, axis)

	# Implementations being compared, keyed by a short identifier.
	funcs = {
	    'gpu': cp.unique,
	    'cpu': call_cpu,
	}

	# Column title shown for each implementation in the printed table.
	headers = {
	    'gpu': 'CuPy-only',
	    'cpu': 'NumPy call',
	}

	# Time every implementation on every (dtype, size) combination and store
	# the statistics in the matching slot of `times`.
	for dtype in dtypes:
	    print(dtype)
	    for size in sizes:
	        # One random input per configuration, shared by all
	        # implementations.
	        x = testing.shaped_random(size, dtype=dtype)
	        for comp_id in tqdm.tqdm(funcs):
	            func = funcs[comp_id]
	            prof = benchmark(func, (x,), n_repeat=100)
	            time_results = gather_time(prof)
	            times[comp_id][dtype][size] = time_results


	# Render the measurements as a Markdown table: one row per (dtype, size)
	# pair and one timing column per implementation in `funcs`.
	header_cells = ' | '.join(f'{headers[kind]} (ms)' for kind in funcs)
	lines = [
	    f'| Size | `dtype` | {header_cells} |',
	    '|:----:|:-------:|' + ':-------------:|' * len(funcs),
	]

	for dtype in dtypes:
	    dtype_name = dtype.__name__
	    for size in sizes:
	        size_str = 'x'.join(str(i) for i in size)
	        cells = []
	        for kind in funcs:
	            size_times = times[kind][dtype][size]
	            cpu_time = size_times['cpu_mean']
	            gpu_time = size_times['gpu_mean']
	            if cpu_time is None:
	                # No measurement recorded for this implementation.
	                continue
	            # Report the larger of the CPU-side and GPU-side mean times.
	            # NOTE(review): `3f` is a minimum width of 3 with the default
	            # six decimals; `.3f` may have been intended -- confirm.
	            cells.append(f'{max(cpu_time, gpu_time):3f}')
	        # Emit a row only when every implementation has a measurement.
	        if len(cells) == len(funcs):
	            lines.append(
	                ' | '.join([f'| {size_str} | `{dtype_name}`', *cells]))

	print('\n'.join(lines))