k1.cl
// from lib/THC/THCTensorSort.cu:
// needs the following template variables defined:
//   Dim              integer
//   IndexType        string, e.g. 'int' or 'unsigned int'
//   dims             list containing Dim
//   MAX_CLTORCH_DIMS integer, e.g. 25
//   WarpSize         integer, e.g. 32
// this needs the following template variables defined:
//   IndexType        string, e.g. 'int'
//   MAX_CLTORCH_DIMS integer, e.g. 25
//   dims             list of integers, i.e. all dimensions >= 0 that this should work for
//   WarpSize         integer, e.g. 32
//   defiscontiguous  [1|0] (or just don't define it, which means 0)
//   defreduceblock   [1|0] (or just don't define it, which means 0)
// kernel argument that defines tensor layout
typedef struct TensorInfoCl {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  unsigned int sizes[1];
  unsigned int strides[1];
  unsigned int offset;
  int dims;
} TensorInfoCl;
// Contiguous tensors of more than one dimension are collapsed down
// to one tensor
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
static inline unsigned int IndexToOffset_1001_get(unsigned int linearId, global TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use static dims
  // for (int i = 2 - 1; i >= 0; --i) {
  unsigned int curDimIndex;
  unsigned int curDimOffset;
  // // bake this in....
  // curDimIndex = linearId % info->sizes[1];
  // curDimOffset = curDimIndex * info->strides[1];
  // offset += curDimOffset;
  //
  // linearId /= info->sizes[1];
  // bake this in....
  curDimIndex = linearId % info->sizes[0];
  curDimOffset = curDimIndex * info->strides[0];
  offset += curDimOffset;
  // }
  return offset;
}
static inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
  return linearId + info->offset;
}
static inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use dynamic dims
  for (int i = info->dims - 1; i >= 0; --i) {
    unsigned int curDimIndex = linearId % info->sizes[i];
    unsigned int curDimOffset = curDimIndex * info->strides[i];
    offset += curDimOffset;
    linearId /= info->sizes[i];
  }
  return offset;
}
static inline unsigned int getLinearBlockId() {
  return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
    get_group_id(1) * get_num_groups(0) +
    get_group_id(0);
}
// Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
// return the reduced value
// `base` is the base address of a tensor
// For each slice (defined as a linear point of `out`, from 0 ->
// (sliceSize - 1) * sliceStride), we fill that slice from `0` to
// `sliceSize - 1`.
kernel void
fillSliceWithIndex(global TensorInfoCl *out_info, global float *out_data,
                   unsigned int totalSlices,
                   unsigned int sliceSize,
                   unsigned int sliceStride) {
  unsigned int slice = getLinearBlockId();
  if (slice >= totalSlices) {
    return;
  }
  const unsigned long offset =
    IndexToOffset_1001_get(slice, &out_info[0]);
  global float* base = &out_data[offset];
  for (long i = get_local_id(0); i < sliceSize; i += get_local_size(0)) {
    // Torch indices are 1-based (hence the +1)
    base[i * sliceStride] = (float) i + 1.0f;
  }
}
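For reference, the TensorInfoCl struct above has to be packed byte-for-byte on the host, and IndexToOffset_999_get is just the usual strided walk from a linear index to a memory offset. The following is a minimal host-side sketch of my own (not part of the gist; the helper names pack_tensor_info and index_to_offset are made up here) that mirrors both, so packed buffers and offsets can be sanity-checked in plain Python before launching the kernel.

import struct

def pack_tensor_info(sizes, strides, offset=0):
    # Mirrors the 1-D TensorInfoCl above: sizes[1], strides[1], offset, dims.
    assert len(sizes) == len(strides) == 1
    return struct.pack('IIIi', sizes[0], strides[0], offset, len(sizes))

def index_to_offset(linear_id, sizes, strides, offset=0):
    # Same walk as IndexToOffset_999_get: start from the last dimension.
    for size, stride in zip(reversed(sizes), reversed(strides)):
        offset += (linear_id % size) * stride
        linear_id //= size
    return offset

print(pack_tensor_info([3], [1]))                        # 16 bytes, same layout as struct.pack('iiii', 3, 1, 0, 1)
print([index_to_offset(i, [3], [1]) for i in range(3)])  # [0, 1, 2] for a contiguous vector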
k2.cl
// from lib/THC/THCSortUtils.cuh:
// This needs the following template variables:
//   K                key type
//   V                value type
//   COMPARE_OP       a comparison operator, like < or >
//   KeyDims          integer
//   ValueDims        integer
//   Power2SortSize   integer
//   dims             list of KeyDims and ValueDims
// you need to include, somewhere before this, with appropriate dims, whatever
// defines KeyDims and ValueDims
// this needs the following template variables defined:
//   IndexType        string, e.g. 'int'
//   MAX_CLTORCH_DIMS integer, e.g. 25
//   dims             list of integers, i.e. all dimensions >= 0 that this should work for
//   WarpSize         integer, e.g. 32
//   defiscontiguous  [1|0] (or just don't define it, which means 0)
//   defreduceblock   [1|0] (or just don't define it, which means 0)
// kernel argument that defines tensor layout
typedef struct TensorInfoCl {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  unsigned int sizes[1];
  unsigned int strides[1];
  unsigned int offset;
  int dims;
} TensorInfoCl;
// Contiguous tensors of more than one dimension are collapsed down
// to one tensor
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
static inline unsigned int IndexToOffset_1001_get(unsigned int linearId, global TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use static dims
  // for (int i = 2 - 1; i >= 0; --i) {
  unsigned int curDimIndex;
  unsigned int curDimOffset;
  // bake this in....
  // curDimIndex = linearId % info->sizes[1];
  // curDimOffset = curDimIndex * info->strides[1];
  // offset += curDimOffset;
  //
  // linearId /= info->sizes[1];
  // bake this in....
  curDimIndex = linearId % info->sizes[0];
  curDimOffset = curDimIndex * info->strides[0];
  offset += curDimOffset;
  // }
  return offset;
}
static inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
  return linearId + info->offset;
}
static inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use dynamic dims
  for (int i = info->dims - 1; i >= 0; --i) {
    unsigned int curDimIndex = linearId % info->sizes[i];
    unsigned int curDimOffset = curDimIndex * info->strides[i];
    offset += curDimOffset;
    linearId /= info->sizes[i];
  }
  return offset;
}
static inline unsigned int getLinearBlockId() {
  return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
    get_group_id(1) * get_num_groups(0) +
    get_group_id(0);
}
// Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
// return the reduced value
static inline void swapVars_K(local float *p_t1, local float *p_t2) {
  float tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void swapVars_V(local float *p_t1, local float *p_t2) {
  float tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void swapVars_int(local int *p_t1, local int *p_t2) {
  int tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void bitonicSwap(local float *p_kA, local float *p_vA, local int *p_validA,
                               local float *p_kB, local float *p_vB, local int *p_validB,
                               int dir) {
  // Invalid entries always sort to the end
  // original cuda version was:
  //   int swap = (comp(kA, kB) && validA) || !validB;
  int swap = (((*p_kA) < (*p_kB)) && (*p_validA)) || !(*p_validB);
  if (swap == dir) {
    swapVars_K(p_kA, p_kB);
    swapVars_V(p_vA, p_vB);
    swapVars_int(p_validA, p_validB);
  }
}
static inline void bitonicSort(int power2size, local float *p_keys,
                               local float *p_values,
                               local int *p_valid) {
  #pragma unroll
  for (unsigned int size = 2; size < power2size; size *= 2) {
    int flag = ((get_local_id(0) & (size / 2)) != 0);
    #pragma unroll
    for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
      // Single warp per slice is completely synchronous
      // if (128 > 32) { // is 64 ok? Let's try 32 till it is working ok...
      barrier(CLK_LOCAL_MEM_FENCE);
      // }
      unsigned int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
      bitonicSwap(
        p_keys + pos, p_values + pos, p_valid + pos,
        p_keys + pos + stride, p_values + pos + stride, p_valid + pos + stride,
        flag);
    }
  }
  #pragma unroll
  for (unsigned int stride = power2size / 2; stride > 0; stride /= 2) {
    // Single warp per slice is completely synchronous
    // if (128 > 32) { // note: was 64 before
    barrier(CLK_LOCAL_MEM_FENCE);
    // }
    unsigned int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
    bitonicSwap(
      p_keys + pos, p_values + pos, p_valid + pos,
      p_keys + pos + stride, p_values + pos + stride, p_valid + pos + stride,
      false);
  }
  // Single warp per slice is completely synchronous
  // if (128 > 32) { // note: was 64 before
  barrier(CLK_LOCAL_MEM_FENCE);
  // }
}
// Sorts (key, value) pairs (in different tensors) in-place; i.e.,
// modifies the input `keys` and `values`
kernel void
bitonicSortKVInPlace(global TensorInfoCl *keys_info, global float *keys_data,
                     unsigned int keySlices,
                     unsigned int keySliceSize,
                     unsigned int keySliceStride,
                     int power2size,
                     global TensorInfoCl *values_info, global float *values_data,
                     unsigned int valueSliceStride,
                     local float *p_sharedKeys,
                     local float *p_sharedValues,
                     local int *p_sharedValid
                     ) {
  int size0 = keys_info->sizes[0];
  int stride0 = keys_info->strides[0];
  int dims = keys_info->dims;
  // Find the slice of the tensor that we are sorting
  const unsigned int linearIndex = getLinearBlockId();
  // Tiling the slices could have us be out of bounds, if there are a
  // lot of slices to sort
  if (linearIndex >= keySlices) {
    return;
  }
  // local float sharedKeys[128];
  // local float sharedValues[128];
  // local int sharedValid[128];
  const unsigned int keyStartOffset =
    IndexToOffset_1001_get(linearIndex, &keys_info[0]);
  const unsigned int valueStartOffset =
    IndexToOffset_999_get(linearIndex, &values_info[0]);
  // If the sort size is 1, the data is already sorted
  if (power2size == 1) {
    return;
  } else {
    // Otherwise, each thread is responsible for loading and storing 2
    // elements. The sort size is guaranteed to be >= 2
    const int elem1 = get_local_id(0);
    const int elem2 = get_local_id(0) + (power2size >> 1);
    int valid1 = (elem1 < keySliceSize);
    float k1 = valid1 ?
      keys_data[keyStartOffset + elem1 * keySliceStride] : (float) 0;
    float v1 = valid1 ?
      values_data[valueStartOffset + elem1 * valueSliceStride] : (float) 0;
    p_sharedKeys[elem1] = k1;
    p_sharedValues[elem1] = v1;
    p_sharedValid[elem1] = valid1;
    int valid2 = (elem2 < keySliceSize);
    float k2 = valid2 ?
      keys_data[keyStartOffset + elem2 * keySliceStride] : (float) 0;
    float v2 = valid2 ?
      values_data[valueStartOffset + elem2 * valueSliceStride] : (float) 0;
    p_sharedKeys[elem2] = k2;
    p_sharedValues[elem2] = v2;
    p_sharedValid[elem2] = valid2;
    barrier(CLK_LOCAL_MEM_FENCE);
    // Sort!
    // if(get_local_id(0) == 0) {
    bitonicSort(power2size, p_sharedKeys, p_sharedValues, p_sharedValid);
    // }
    //// if(get_local_id(0) == 0) {
    //   keys_data[0] = sharedKeys[0];
    //   keys_data[1] = sharedKeys[1];
    //// keys_data[0] = elem1;
    //// keys_data[1] = elem2;
    //// values_data[0] = 128;
    //   values_data[0] = sharedValues[0];
    //   values_data[1] = sharedValues[1];
    //// }
    // elem1 values are always valid, since otherwise we would have
    // chosen the next smallest power-of-2 for sorting
    keys_data[keyStartOffset + elem1 * keySliceStride] =
      p_sharedKeys[elem1];
    values_data[valueStartOffset + elem1 * valueSliceStride] =
      p_sharedValues[elem1];
    if (valid2) {
      // elem2 values might be out-of-range, if the data size we are
      // sorting is not a power-of-2
      keys_data[keyStartOffset + elem2 * keySliceStride] =
        p_sharedKeys[elem2];
      values_data[valueStartOffset + elem2 * valueSliceStride] =
        p_sharedValues[elem2];
    }
  }
}
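With `<` baked into bitonicSwap as the comparison, each slice of `keys` should come back in ascending order, with `values` permuted alongside and the padding (invalid) slots pushed to the end. A quick NumPy reference for checking the kernel, a sketch of my own rather than anything in the gist, is just an argsort on the keys:

import numpy as np

def sort_kv_reference(keys, values):
    # Reference behaviour to compare the kernel against: sort keys ascending,
    # carry values along with the same permutation.
    order = np.argsort(keys, kind='stable')
    return keys[order], values[order]

keys = np.array([3.0, 2.0, 7.0], dtype=np.float32)    # same keys as `b` in the driver below
values = np.array([1.0, 2.0, 3.0], dtype=np.float32)  # 1-based indices written by fillSliceWithIndex
print(sort_kv_reference(keys, values))                # ([2., 3., 7.], [2., 1., 3.])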
Python driver (PyOpenCL):
import time
import numpy as np
import pyopencl as cl
import struct

# div = int(1e2)
# N = 1024
# its = 3
N = 3
power2 = 4
gpu_idx = 0

# Pick the gpu_idx'th GPU across all platforms
platforms = cl.get_platforms()
i = 0
for platform in platforms:
    gpu_devices = platform.get_devices(device_type=cl.device_type.GPU)
    if gpu_idx < i + len(gpu_devices):
        ctx = cl.Context(devices=[gpu_devices[gpu_idx - i]])
        break
    i += len(gpu_devices)
print('context', ctx)
# ctx = cl.create_some_context()
q = cl.CommandQueue(ctx)
mf = cl.mem_flags

with open('k1.cl', 'r') as f:
    k1 = f.read()
with open('k2.cl', 'r') as f:
    k2 = f.read()
k1_prog = cl.Program(ctx, k1).build()
k2_prog = cl.Program(ctx, k2).build()
k1_fn = k1_prog.fillSliceWithIndex
k2_fn = k2_prog.bitonicSortKVInPlace

for it in range(2):
    a = np.random.rand(N).astype(np.float32)
    b = np.random.rand(N).astype(np.float32)
    a.fill(0)
    b.fill(0)
    b[0] = 3
    b[1] = 2
    b[2] = 7
    a_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
    b_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)

    # TensorInfoCl for a 1-D contiguous tensor: sizes[0]=N, strides[0]=1, offset=0, dims=1
    a_info = struct.pack('iiii', N, 1, 0, 1)
    print('a_info', a_info, 'len', len(a_info))
    for i in range(len(a_info)):
        print(i, a_info[i])
    a_info_buf = cl.Buffer(ctx, mf.READ_ONLY, len(a_info))
    cl.enqueue_copy(q, a_info_buf, a_info).wait()
    q.finish()

    # Fill `a` with 1-based indices, one slice of size N
    grid = (1,)
    block = (N,)
    globalsize = (grid[0] * block[0],)
    print('grid', grid, 'block', block, 'globalsize', globalsize)
    print('calling k1...')
    k1_fn(q, globalsize, block, a_info_buf, a_gpu, np.int32(1), np.int32(N), np.int32(1))
    q.finish()
    cl.enqueue_copy(q, a, a_gpu)
    print('a', a)
    # sys.exit(1)

    # Sort keys `b` in-place, carrying the indices in `a` along as values
    grid = (1,)
    block = (power2 // 2,)
    globalsize = (grid[0] * block[0],)
    print('grid', grid, 'block', block, 'globalsize', globalsize)
    print('calling k2...')
    k2_fn(q, globalsize, block,
          a_info_buf, b_gpu,
          np.int32(1), np.int32(N), np.int32(1),
          np.int32(power2),
          a_info_buf, a_gpu,
          np.int32(1),
          cl.LocalMemory(power2 * 4),
          cl.LocalMemory(power2 * 4),
          cl.LocalMemory(power2 * 4)
          )
    q.finish()
    cl.enqueue_copy(q, a, a_gpu)
    cl.enqueue_copy(q, b, b_gpu)
    q.finish()
    print('a', a)
    print('b', b)
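For the hard-coded input above (keys b = [3, 2, 7]), the driver should print a = [1, 2, 3] after k1, then b = [2, 3, 7] and a = [2, 1, 3] after k2, i.e. the 1-based original positions of the sorted keys. A small check along these lines, my addition rather than part of the gist, could be appended after the loop:

# Assumed expected results for the fixed keys [3, 2, 7] above (not part of the
# original gist): keys come back ascending, values hold 1-based source indices.
expected_order = np.argsort(np.array([3.0, 2.0, 7.0], dtype=np.float32), kind='stable')
np.testing.assert_allclose(b, np.array([2.0, 3.0, 7.0], dtype=np.float32))
np.testing.assert_allclose(a, (expected_order + 1).astype(np.float32))
print('results match the numpy reference')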