@hughperkins
Last active June 10, 2016 12:03
// from lib/THC/THCTensorSort.cu:
// needs the following template variables defined:
//   Dim               integer
//   IndexType         string, e.g. 'int' or 'unsigned int'
//   dims              list containing Dim
//   MAX_CLTORCH_DIMS  integer, e.g. 25
//   WarpSize          integer, e.g. 32
// this needs the following template variables defined:
//   IndexType         string, e.g. 'int'
//   MAX_CLTORCH_DIMS  integer, e.g. 25
//   dims              list of integers, i.e. all dimensions >= 0 that this should work for
//   WarpSize          integer, e.g. 32
//   defiscontiguous   [1|0] (or just don't define, which means 0)
//   defreduceblock    [1|0] (or just don't define, which means 0)
// kernel argument that defines tensor layout
typedef struct TensorInfoCl {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  unsigned int sizes[1];
  unsigned int strides[1];
  unsigned int offset;
  int dims;
} TensorInfoCl;
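// Host side: for this 1-d instantiation the struct is packed as four 32-bit
// ints -- sizes[0], strides[0], offset, dims -- matching the
// struct.pack('iiii', N, 1, 0, 1) call in the Python driver below.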
// Contiguous tensors of more than one dimension are collapsed down
// to one tensor
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
static inline unsigned int IndexToOffset_1001_get(unsigned int linearId, global TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use static dims
  // for (int i = 2 - 1; i >= 0; --i) {
  unsigned int curDimIndex;
  unsigned int curDimOffset;
  // // bake this in....
  // curDimIndex = linearId % info->sizes[1];
  // curDimOffset = curDimIndex * info->strides[1];
  // offset += curDimOffset;
  //
  // linearId /= info->sizes[1];
  // bake this in....
  curDimIndex = linearId % info->sizes[0];
  curDimOffset = curDimIndex * info->strides[0];
  offset += curDimOffset;
  // }
  return offset;
}
static inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
  return linearId + info->offset;
}
static inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use dynamic dims
  for (int i = info->dims - 1; i >= 0; --i) {
    unsigned int curDimIndex = linearId % info->sizes[i];
    unsigned int curDimOffset = curDimIndex * info->strides[i];
    offset += curDimOffset;
    linearId /= info->sizes[i];
  }
  return offset;
}
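// Worked example (illustrative, for the 1-d case packed by the host driver
// below): with sizes[0] = N, strides[0] = s and offset = o, linearId k maps to
// element o + k * s; the loop above generalizes this by peeling off one
// dimension per iteration, innermost (fastest-varying) dimension first.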
static inline unsigned int getLinearBlockId() {
  return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
    get_group_id(1) * get_num_groups(0) +
    get_group_id(0);
}
// Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
// return the reduced value
// `base` is the base address of a tensor
// For each slice (defined as a linear point of `out`, from 0 ->
// (sliceSize - 1) * sliceStride, we fill that slice from `0` to
// `sliceSize - 1`.
kernel void
fillSliceWithIndex(global TensorInfoCl *out_info, global float *out_data,
                   unsigned int totalSlices,
                   unsigned int sliceSize,
                   unsigned int sliceStride) {
  unsigned int slice = getLinearBlockId();
  if (slice >= totalSlices) {
    return;
  }
  const unsigned long offset =
    IndexToOffset_1001_get(slice, &out_info[0]);
  global float* base = &out_data[offset];
  for (long i = get_local_id(0); i < sliceSize; i += get_local_size(0)) {
    // Torch indices are 1-based (hence the +1)
    base[i * sliceStride] = (float) i + 1.0f;
  }
}
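An illustrative NumPy sketch of what fillSliceWithIndex computes for a single slice (the function name and arguments below are mine, chosen to mirror the kernel; only the simple 1-d case exercised by the Python driver further down is covered):

import numpy as np

def fill_slice_with_index(out, slice_size, slice_stride, base_offset=0):
    # Mirror of the kernel body: write Torch-style 1-based indices
    # 1 .. slice_size along the slice, stepping by slice_stride.
    for i in range(slice_size):
        out[base_offset + i * slice_stride] = float(i + 1)
    return out

# e.g. fill_slice_with_index(np.zeros(3, dtype=np.float32), 3, 1) -> [1., 2., 3.]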
// from lib/THC/THCSortUtils.cuh:
// This needs the following template variables:
//   K               key type
//   V               value type
//   COMPARE_OP      a comparison operator, like < or >
//   KeyDims         integer
//   ValueDims       integer
//   Power2SortSize  integer
//   dims            list of KeyDims and ValueDims
// you need an include somewhere before this, with appropriate dims, to define
// KeyDims and ValueDims
// this needs the following template variables defined:
//   IndexType         string, e.g. 'int'
//   MAX_CLTORCH_DIMS  integer, e.g. 25
//   dims              list of integers, i.e. all dimensions >= 0 that this should work for
//   WarpSize          integer, e.g. 32
//   defiscontiguous   [1|0] (or just don't define, which means 0)
//   defreduceblock    [1|0] (or just don't define, which means 0)
// kernel argument that defines tensor layout
typedef struct TensorInfoCl {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  unsigned int sizes[1];
  unsigned int strides[1];
  unsigned int offset;
  int dims;
} TensorInfoCl;
// Contiguous tensors of more than one dimension are collapsed down
// to one tensor
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
static inline unsigned int IndexToOffset_1001_get(unsigned int linearId, global TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use static dims
  // for (int i = 2 - 1; i >= 0; --i) {
  unsigned int curDimIndex;
  unsigned int curDimOffset;
  // bake this in....
  // curDimIndex = linearId % info->sizes[1];
  // curDimOffset = curDimIndex * info->strides[1];
  // offset += curDimOffset;
  //
  // linearId /= info->sizes[1];
  // bake this in....
  curDimIndex = linearId % info->sizes[0];
  curDimOffset = curDimIndex * info->strides[0];
  offset += curDimOffset;
  // }
  return offset;
}
static inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
  return linearId + info->offset;
}
static inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use dynamic dims
  for (int i = info->dims - 1; i >= 0; --i) {
    unsigned int curDimIndex = linearId % info->sizes[i];
    unsigned int curDimOffset = curDimIndex * info->strides[i];
    offset += curDimOffset;
    linearId /= info->sizes[i];
  }
  return offset;
}
static inline unsigned int getLinearBlockId() {
  return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
    get_group_id(1) * get_num_groups(0) +
    get_group_id(0);
}
// Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
// return the reduced value
static inline void swapVars_K(local float *p_t1, local float *p_t2) {
  float tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void swapVars_V(local float *p_t1, local float *p_t2) {
  float tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void swapVars_int(local int *p_t1, local int *p_t2) {
  int tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void bitonicSwap(local float *p_kA, local float *p_vA, local int *p_validA,
                               local float *p_kB, local float *p_vB, local int *p_validB,
                               int dir) {
  // Invalid entries always sort to the end
  // original cuda version was:
  // int swap = (comp(kA, kB) && validA) || !validB;
  int swap = (((*p_kA) < (*p_kB)) && (*p_validA)) || !(*p_validB);
  if (swap == dir) {
    swapVars_K(p_kA, p_kB);
    swapVars_V(p_vA, p_vB);
    swapVars_int(p_validA, p_validB);
  }
}
static inline void bitonicSort(int power2size, local float *p_keys,
                               local float *p_values,
                               local int *p_valid) {
  #pragma unroll
  for (unsigned int size = 2; size < power2size; size *= 2) {
    int flag = ((get_local_id(0) & (size / 2)) != 0);
    #pragma unroll
    for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
      // Single warp per slice is completely synchronous
      // if (128 > 32) { // is 64 ok? Let's try 32 till it is working ok...
      barrier(CLK_LOCAL_MEM_FENCE);
      // }
      unsigned int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
      bitonicSwap(
        p_keys + pos, p_values + pos, p_valid + pos,
        p_keys + pos + stride, p_values + pos + stride, p_valid + pos + stride,
        flag);
    }
  }
  #pragma unroll
  for (unsigned int stride = power2size / 2; stride > 0; stride /= 2) {
    // Single warp per slice is completely synchronous
    // if (128 > 32) { // note: was 64 before
    barrier(CLK_LOCAL_MEM_FENCE);
    // }
    unsigned int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
    bitonicSwap(
      p_keys + pos, p_values + pos, p_valid + pos,
      p_keys + pos + stride, p_values + pos + stride, p_valid + pos + stride,
      false);
  }
  // Single warp per slice is completely synchronous
  // if (128 > 32) { // note: was 64 before
  barrier(CLK_LOCAL_MEM_FENCE);
  // }
}
// Sorts (key, value) pairs (in different tensors) in-place; i.e.,
// modifies the input `keys` and `values`
kernel void
bitonicSortKVInPlace(global TensorInfoCl *keys_info, global float *keys_data,
                     unsigned int keySlices,
                     unsigned int keySliceSize,
                     unsigned int keySliceStride,
                     int power2size,
                     global TensorInfoCl *values_info, global float *values_data,
                     unsigned int valueSliceStride,
                     local float *p_sharedKeys,
                     local float *p_sharedValues,
                     local int *p_sharedValid
                     ) {
  int size0 = keys_info->sizes[0];
  int stride0 = keys_info->strides[0];
  int dims = keys_info->dims;
  // Find the slice of the tensor that we are sorting
  const unsigned int linearIndex = getLinearBlockId();
  // Tiling the slices could have us be out of bounds, if there are a
  // lot of slices to sort
  if (linearIndex >= keySlices) {
    return;
  }
  // local float sharedKeys[128];
  // local float sharedValues[128];
  // local int sharedValid[128];
  const unsigned int keyStartOffset =
    IndexToOffset_1001_get(linearIndex, &keys_info[0]);
  const unsigned int valueStartOffset =
    IndexToOffset_999_get(linearIndex, &values_info[0]);
  // If the sort size is 1, the data is already sorted
  if (power2size == 1) {
    return;
  } else {
    // Otherwise, each thread is responsible for loading and storing 2
    // elements. The sort size is guaranteed to be >= 2
    const int elem1 = get_local_id(0);
    const int elem2 = get_local_id(0) + (power2size >> 1);
    int valid1 = (elem1 < keySliceSize);
    float k1 = valid1 ?
      keys_data[keyStartOffset + elem1 * keySliceStride] : (float) 0;
    float v1 = valid1 ?
      values_data[valueStartOffset + elem1 * valueSliceStride] : (float) 0;
    p_sharedKeys[elem1] = k1;
    p_sharedValues[elem1] = v1;
    p_sharedValid[elem1] = valid1;
    int valid2 = (elem2 < keySliceSize);
    float k2 = valid2 ?
      keys_data[keyStartOffset + elem2 * keySliceStride] : (float) 0;
    float v2 = valid2 ?
      values_data[valueStartOffset + elem2 * valueSliceStride] : (float) 0;
    p_sharedKeys[elem2] = k2;
    p_sharedValues[elem2] = v2;
    p_sharedValid[elem2] = valid2;
    barrier(CLK_LOCAL_MEM_FENCE);
    // Sort!
    // if(get_local_id(0) == 0) {
    bitonicSort(power2size, p_sharedKeys, p_sharedValues, p_sharedValid);
    // }
    //// if(get_local_id(0) == 0) {
    // keys_data[0] = sharedKeys[0];
    // keys_data[1] = sharedKeys[1];
    //// keys_data[0] = elem1;
    //// keys_data[1] = elem2;
    //// values_data[0] = 128;
    // values_data[0] = sharedValues[0];
    // values_data[1] = sharedValues[1];
    //// }
    // elem1 values are always valid, since otherwise we would have
    // chosen the next smallest power-of-2 for sorting
    keys_data[keyStartOffset + elem1 * keySliceStride] =
      p_sharedKeys[elem1];
    values_data[valueStartOffset + elem1 * valueSliceStride] =
      p_sharedValues[elem1];
    if (valid2) {
      // elem2 values might be out-of-range, if the data size we are
      // sorting is not a power-of-2
      keys_data[keyStartOffset + elem2 * keySliceStride] =
        p_sharedKeys[elem2];
      values_data[valueStartOffset + elem2 * valueSliceStride] =
        p_sharedValues[elem2];
    }
  }
}
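A sequential Python sketch of the same bitonic network, useful for sanity-checking the pos/stride indexing on the CPU (illustrative; the function and variable names are mine, and it assumes power2size is a power of two and that the three lists all have length power2size):

def bitonic_sort_kv(keys, values, valid, power2size):
    def swap(a, b, direction):
        # Invalid entries always sort to the end, as in bitonicSwap above
        do_swap = (keys[a] < keys[b] and valid[a]) or not valid[b]
        if do_swap == direction:
            keys[a], keys[b] = keys[b], keys[a]
            values[a], values[b] = values[b], values[a]
            valid[a], valid[b] = valid[b], valid[a]

    # Build bitonic runs of increasing size; the full-size merge is done below
    size = 2
    while size < power2size:
        stride = size // 2
        while stride > 0:
            for tid in range(power2size // 2):
                flag = (tid & (size // 2)) != 0
                pos = 2 * tid - (tid & (stride - 1))
                swap(pos, pos + stride, flag)
            stride //= 2
        size *= 2
    # Final merge of the whole array
    stride = power2size // 2
    while stride > 0:
        for tid in range(power2size // 2):
            pos = 2 * tid - (tid & (stride - 1))
            swap(pos, pos + stride, False)
        stride //= 2
    return keys, values, valid

# e.g. bitonic_sort_kv([3., 2., 7., 0.], [1., 2., 3., 0.], [True, True, True, False], 4)
# -> keys [2., 3., 7., 0.], values [2., 1., 3., 0.], with the invalid slot kept last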
import time
import numpy as np
import pyopencl as cl
import struct
#div = int(1e2)
#N = 1024
#its = 3
N = 3
power2 = 4
gpu_idx = 0
platforms = cl.get_platforms()
i = 0
for platform in platforms:
    gpu_devices = platform.get_devices(device_type=cl.device_type.GPU)
    if gpu_idx < i + len(gpu_devices):
        ctx = cl.Context(devices=[gpu_devices[gpu_idx-i]])
        break
    i += len(gpu_devices)
print('context', ctx)
#ctx = cl.create_some_context()
q = cl.CommandQueue(ctx)
mf = cl.mem_flags
with open('k1.cl', 'r') as f:
    k1 = f.read()
with open('k2.cl', 'r') as f:
    k2 = f.read()
k1_prog = cl.Program(ctx, k1).build()
k2_prog = cl.Program(ctx, k2).build()
k1_fn = k1_prog.fillSliceWithIndex
k2_fn = k2_prog.bitonicSortKVInPlace
for it in range(2):
    a = np.random.rand(N).astype(np.float32)
    b = np.random.rand(N).astype(np.float32)
    a.fill(0)
    b.fill(0)
    b[0] = 3
    b[1] = 2
    b[2] = 7
    a_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
    b_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
    a_info = struct.pack('iiii', N, 1, 0, 1)
    print('a_info', a_info, 'len', len(a_info))
    for i in range(len(a_info)):
        print(i, a_info[i])
    a_info_buf = cl.Buffer(ctx, mf.READ_ONLY, len(a_info))
    cl.enqueue_write_buffer(q, a_info_buf, a_info).wait()
    q.finish()
    grid = (1,)
    block = (N,)
    globalsize = (grid[0]*block[0],)
    print('grid', grid, 'block', block, 'globalsize', globalsize)
    print('calling k1...')
    k1_fn(q, globalsize, block, a_info_buf, a_gpu, np.int32(1), np.int32(N), np.int32(1))
    q.finish()
    cl.enqueue_copy(q, a, a_gpu)
    print('a', a)
    # sys.exit(1)
    grid = (1,)
    block = (power2//2,)
    globalsize = (grid[0]*block[0],)
    print('grid', grid, 'block', block, 'globalsize', globalsize)
    print('calling k2...')
    k2_fn(q, globalsize, block,
          a_info_buf, b_gpu,
          np.int32(1), np.int32(N), np.int32(1),
          np.int32(power2),
          a_info_buf, a_gpu,
          np.int32(1),
          cl.LocalMemory(power2 * 4),
          cl.LocalMemory(power2 * 4),
          cl.LocalMemory(power2 * 4))
    q.finish()
    cl.enqueue_copy(q, a, a_gpu)
    cl.enqueue_copy(q, b, b_gpu)
    q.finish()
    print('a', a)
    print('b', b)
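# As a quick cross-check (illustrative, assuming the initial keys [3, 2, 7] set
# above): recompute the expected result on the CPU and compare with what the
# kernels wrote back -- b should come back sorted ascending and a should hold
# the matching 1-based source indices.
initial_keys = np.array([3, 2, 7], dtype=np.float32)
expected_order = np.argsort(initial_keys)
expected_keys = initial_keys[expected_order]                # [2., 3., 7.]
expected_values = (expected_order + 1).astype(np.float32)   # [2., 1., 3.]
print('expected b', expected_keys, 'expected a', expected_values)
print('match', np.allclose(b, expected_keys) and np.allclose(a, expected_values))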