@hughperkins
Last active June 10, 2016 12:03
// from lib/THC/THCTensorSort.cu:
// needs the following template variables defined:
//   Dim               integer
//   IndexType         string, e.g. 'int' or 'unsigned int'
//   dims              list containing Dim
//   MAX_CLTORCH_DIMS  integer, e.g. 25
//   WarpSize          integer, e.g. 32
// this needs the following template variables defined:
//   IndexType         string, e.g. 'int'
//   MAX_CLTORCH_DIMS  integer, e.g. 25
//   dims              list of integers, i.e. all dimensions >= 0 that this should work for
//   WarpSize          integer, e.g. 32
//   defiscontiguous   [1|0] (or just don't define, which means 0)
//   defreduceblock    [1|0] (or just don't define, which means 0)
// kernel argument that defines tensor layout
typedef struct TensorInfoCl {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  unsigned int sizes[1];
  unsigned int strides[1];
  unsigned int offset;
  int dims;
} TensorInfoCl;
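// Host side: for this 1-d instantiation the struct is packed as four 32-bit
// ints -- sizes[0], strides[0], offset, dims -- matching the
// struct.pack('iiii', N, 1, 0, 1) call in the Python driver below.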
// Contiguous tensors of more than one dimension are collapsed down
// to one tensor
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
static inline unsigned int IndexToOffset_1001_get(unsigned int linearId, global TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use static dims
  // for (int i = 2 - 1; i >= 0; --i) {
  unsigned int curDimIndex;
  unsigned int curDimOffset;
  // // bake this in....
  // curDimIndex = linearId % info->sizes[1];
  // curDimOffset = curDimIndex * info->strides[1];
  // offset += curDimOffset;
  //
  // linearId /= info->sizes[1];
  // bake this in....
  curDimIndex = linearId % info->sizes[0];
  curDimOffset = curDimIndex * info->strides[0];
  offset += curDimOffset;
  // }
  return offset;
}
static inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
  return linearId + info->offset;
}
static inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use dynamic dims
  for (int i = info->dims - 1; i >= 0; --i) {
    unsigned int curDimIndex = linearId % info->sizes[i];
    unsigned int curDimOffset = curDimIndex * info->strides[i];
    offset += curDimOffset;
    linearId /= info->sizes[i];
  }
  return offset;
}
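// Worked example (illustrative, for the 1-d case packed by the host driver
// below): with sizes[0] = N, strides[0] = s and offset = o, linearId k maps to
// element o + k * s; the loop above generalizes this by peeling off one
// dimension per iteration, innermost (fastest-varying) dimension first.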
static inline unsigned int getLinearBlockId() {
  return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
    get_group_id(1) * get_num_groups(0) +
    get_group_id(0);
}
// Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
// return the reduced value
// `base` is the base address of a tensor
// For each slice (defined as a linear point of `out`, from 0 ->
// (sliceSize - 1) * sliceStride, we fill that slice from `0` to
// `sliceSize - 1`.
kernel void
fillSliceWithIndex(global TensorInfoCl *out_info, global float *out_data,
                   unsigned int totalSlices,
                   unsigned int sliceSize,
                   unsigned int sliceStride) {
  unsigned int slice = getLinearBlockId();
  if (slice >= totalSlices) {
    return;
  }
  const unsigned long offset =
    IndexToOffset_1001_get(slice, &out_info[0]);
  global float* base = &out_data[offset];
  for (long i = get_local_id(0); i < sliceSize; i += get_local_size(0)) {
    // Torch indices are 1-based (hence the +1)
    base[i * sliceStride] = (float) i + 1.0f;
  }
}
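An illustrative NumPy sketch of what fillSliceWithIndex computes for a single slice (the function name and arguments below are mine, chosen to mirror the kernel; only the simple 1-d case exercised by the Python driver further down is covered):

import numpy as np

def fill_slice_with_index(out, slice_size, slice_stride, base_offset=0):
    # Mirror of the kernel body: write Torch-style 1-based indices
    # 1 .. slice_size along the slice, stepping by slice_stride.
    for i in range(slice_size):
        out[base_offset + i * slice_stride] = float(i + 1)
    return out

# e.g. fill_slice_with_index(np.zeros(3, dtype=np.float32), 3, 1) -> [1., 2., 3.]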
// from lib/THC/THCSortUtils.cuh:
// This needs the following template variables:
//   K               key type
//   V               value type
//   COMPARE_OP      a comparison operator, like < or >
//   KeyDims         integer
//   ValueDims       integer
//   Power2SortSize  integer
//   dims            list of KeyDims and ValueDims
// you need an include somewhere before this, with appropriate dims, to define
// KeyDims and ValueDims
// this needs the following template variables defined:
//   IndexType         string, e.g. 'int'
//   MAX_CLTORCH_DIMS  integer, e.g. 25
//   dims              list of integers, i.e. all dimensions >= 0 that this should work for
//   WarpSize          integer, e.g. 32
//   defiscontiguous   [1|0] (or just don't define, which means 0)
//   defreduceblock    [1|0] (or just don't define, which means 0)
// kernel argument that defines tensor layout
typedef struct TensorInfoCl {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  unsigned int sizes[1];
  unsigned int strides[1];
  unsigned int offset;
  int dims;
} TensorInfoCl;
// Contiguous tensors of more than one dimension are collapsed down
// to one tensor
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
static inline unsigned int IndexToOffset_1001_get(unsigned int linearId, global TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use static dims
  // for (int i = 2 - 1; i >= 0; --i) {
  unsigned int curDimIndex;
  unsigned int curDimOffset;
  // bake this in....
  // curDimIndex = linearId % info->sizes[1];
  // curDimOffset = curDimIndex * info->strides[1];
  // offset += curDimOffset;
  //
  // linearId /= info->sizes[1];
  // bake this in....
  curDimIndex = linearId % info->sizes[0];
  curDimOffset = curDimIndex * info->strides[0];
  offset += curDimOffset;
  // }
  return offset;
}
static inline unsigned int IndexToOffset_998_get(unsigned int linearId, global const TensorInfoCl *info) {
  return linearId + info->offset;
}
static inline unsigned int IndexToOffset_999_get(unsigned int linearId, global const TensorInfoCl *info) {
  unsigned int offset = info->offset;
  // Use dynamic dims
  for (int i = info->dims - 1; i >= 0; --i) {
    unsigned int curDimIndex = linearId % info->sizes[i];
    unsigned int curDimOffset = curDimIndex * info->strides[i];
    offset += curDimOffset;
    linearId /= info->sizes[i];
  }
  return offset;
}
static inline unsigned int getLinearBlockId() {
  return get_group_id(2) * get_num_groups(1) * get_num_groups(0) +
    get_group_id(1) * get_num_groups(0) +
    get_group_id(0);
}
// Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will
// return the reduced value
static inline void swapVars_K(local float *p_t1, local float *p_t2) {
  float tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void swapVars_V(local float *p_t1, local float *p_t2) {
  float tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void swapVars_int(local int *p_t1, local int *p_t2) {
  int tmp = *p_t1;
  *p_t1 = *p_t2;
  *p_t2 = tmp;
}
static inline void bitonicSwap(local float *p_kA, local float *p_vA, local int *p_validA,
                               local float *p_kB, local float *p_vB, local int *p_validB,
                               int dir) {
  // Invalid entries always sort to the end
  // original cuda version was:
  // int swap = (comp(kA, kB) && validA) || !validB;
  int swap = (((*p_kA) < (*p_kB)) && (*p_validA)) || !(*p_validB);
  if (swap == dir) {
    swapVars_K(p_kA, p_kB);
    swapVars_V(p_vA, p_vB);
    swapVars_int(p_validA, p_validB);
  }
}
static inline void bitonicSort(int power2size, local float *p_keys,
                               local float *p_values,
                               local int *p_valid) {
  #pragma unroll
  for (unsigned int size = 2; size < power2size; size *= 2) {
    int flag = ((get_local_id(0) & (size / 2)) != 0);
    #pragma unroll
    for (unsigned int stride = size / 2; stride > 0; stride /= 2) {
      // Single warp per slice is completely synchronous
      // if (128 > 32) { // is 64 ok? Let's try 32 till it is working ok...
      barrier(CLK_LOCAL_MEM_FENCE);
      // }
      unsigned int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
      bitonicSwap(
        p_keys + pos, p_values + pos, p_valid + pos,
        p_keys + pos + stride, p_values + pos + stride, p_valid + pos + stride,
        flag);
    }
  }
  #pragma unroll
  for (unsigned int stride = power2size / 2; stride > 0; stride /= 2) {
    // Single warp per slice is completely synchronous
    // if (128 > 32) { // note: was 64 before
    barrier(CLK_LOCAL_MEM_FENCE);
    // }
    unsigned int pos = 2 * get_local_id(0) - (get_local_id(0) & (stride - 1));
    bitonicSwap(
      p_keys + pos, p_values + pos, p_valid + pos,
      p_keys + pos + stride, p_values + pos + stride, p_valid + pos + stride,
      false);
  }
  // Single warp per slice is completely synchronous
  // if (128 > 32) { // note: was 64 before
  barrier(CLK_LOCAL_MEM_FENCE);
  // }
}
// Sorts (key, value) pairs (in different tensors) in-place; i.e.,
// modifies the input `keys` and `values`
kernel void
bitonicSortKVInPlace(global TensorInfoCl *keys_info, global float *keys_data,
                     unsigned int keySlices,
                     unsigned int keySliceSize,
                     unsigned int keySliceStride,
                     int power2size,
                     global TensorInfoCl *values_info, global float *values_data,
                     unsigned int valueSliceStride,
                     local float *p_sharedKeys,
                     local float *p_sharedValues,
                     local int *p_sharedValid
                     ) {
  int size0 = keys_info->sizes[0];
  int stride0 = keys_info->strides[0];
  int dims = keys_info->dims;
  // Find the slice of the tensor that we are sorting
  const unsigned int linearIndex = getLinearBlockId();
  // Tiling the slices could have us be out of bounds, if there are a
  // lot of slices to sort
  if (linearIndex >= keySlices) {
    return;
  }
  // local float sharedKeys[128];
  // local float sharedValues[128];
  // local int sharedValid[128];
  const unsigned int keyStartOffset =
    IndexToOffset_1001_get(linearIndex, &keys_info[0]);
  const unsigned int valueStartOffset =
    IndexToOffset_999_get(linearIndex, &values_info[0]);
  // If the sort size is 1, the data is already sorted
  if (power2size == 1) {
    return;
  } else {
    // Otherwise, each thread is responsible for loading and storing 2
    // elements. The sort size is guaranteed to be >= 2
    const int elem1 = get_local_id(0);
    const int elem2 = get_local_id(0) + (power2size >> 1);
    int valid1 = (elem1 < keySliceSize);
    float k1 = valid1 ?
      keys_data[keyStartOffset + elem1 * keySliceStride] : (float) 0;
    float v1 = valid1 ?
      values_data[valueStartOffset + elem1 * valueSliceStride] : (float) 0;
    p_sharedKeys[elem1] = k1;
    p_sharedValues[elem1] = v1;
    p_sharedValid[elem1] = valid1;
    int valid2 = (elem2 < keySliceSize);
    float k2 = valid2 ?
      keys_data[keyStartOffset + elem2 * keySliceStride] : (float) 0;
    float v2 = valid2 ?
      values_data[valueStartOffset + elem2 * valueSliceStride] : (float) 0;
    p_sharedKeys[elem2] = k2;
    p_sharedValues[elem2] = v2;
    p_sharedValid[elem2] = valid2;
    barrier(CLK_LOCAL_MEM_FENCE);
    // Sort!
    // if(get_local_id(0) == 0) {
    bitonicSort(power2size, p_sharedKeys, p_sharedValues, p_sharedValid);
    // }
    //// if(get_local_id(0) == 0) {
    // keys_data[0] = sharedKeys[0];
    // keys_data[1] = sharedKeys[1];
    //// keys_data[0] = elem1;
    //// keys_data[1] = elem2;
    //// values_data[0] = 128;
    // values_data[0] = sharedValues[0];
    // values_data[1] = sharedValues[1];
    //// }
    // elem1 values are always valid, since otherwise we would have
    // chosen the next smallest power-of-2 for sorting
    keys_data[keyStartOffset + elem1 * keySliceStride] =
      p_sharedKeys[elem1];
    values_data[valueStartOffset + elem1 * valueSliceStride] =
      p_sharedValues[elem1];
    if (valid2) {
      // elem2 values might be out-of-range, if the data size we are
      // sorting is not a power-of-2
      keys_data[keyStartOffset + elem2 * keySliceStride] =
        p_sharedKeys[elem2];
      values_data[valueStartOffset + elem2 * valueSliceStride] =
        p_sharedValues[elem2];
    }
  }
}
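A sequential Python sketch of the same bitonic network, useful for sanity-checking the pos/stride indexing on the CPU (illustrative; the function and variable names are mine, and it assumes power2size is a power of two and that the three lists all have length power2size):

def bitonic_sort_kv(keys, values, valid, power2size):
    def swap(a, b, direction):
        # Invalid entries always sort to the end, as in bitonicSwap above
        do_swap = (keys[a] < keys[b] and valid[a]) or not valid[b]
        if do_swap == direction:
            keys[a], keys[b] = keys[b], keys[a]
            values[a], values[b] = values[b], values[a]
            valid[a], valid[b] = valid[b], valid[a]

    # Build bitonic runs of increasing size; the full-size merge is done below
    size = 2
    while size < power2size:
        stride = size // 2
        while stride > 0:
            for tid in range(power2size // 2):
                flag = (tid & (size // 2)) != 0
                pos = 2 * tid - (tid & (stride - 1))
                swap(pos, pos + stride, flag)
            stride //= 2
        size *= 2
    # Final merge of the whole array
    stride = power2size // 2
    while stride > 0:
        for tid in range(power2size // 2):
            pos = 2 * tid - (tid & (stride - 1))
            swap(pos, pos + stride, False)
        stride //= 2
    return keys, values, valid

# e.g. bitonic_sort_kv([3., 2., 7., 0.], [1., 2., 3., 0.], [True, True, True, False], 4)
# -> keys [2., 3., 7., 0.], values [2., 1., 3., 0.], with the invalid slot kept last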
import time
import numpy as np
import pyopencl as cl
import struct
#div = int(1e2)
#N = 1024
#its = 3
N = 3
power2 = 4
gpu_idx = 0
platforms = cl.get_platforms()
i = 0
for platform in platforms:
    gpu_devices = platform.get_devices(device_type=cl.device_type.GPU)
    if gpu_idx < i + len(gpu_devices):
        ctx = cl.Context(devices=[gpu_devices[gpu_idx-i]])
        break
    i += len(gpu_devices)
print('context', ctx)
#ctx = cl.create_some_context()
q = cl.CommandQueue(ctx)
mf = cl.mem_flags
with open('k1.cl', 'r') as f:
    k1 = f.read()
with open('k2.cl', 'r') as f:
    k2 = f.read()
k1_prog = cl.Program(ctx, k1).build()
k2_prog = cl.Program(ctx, k2).build()
k1_fn = k1_prog.fillSliceWithIndex
k2_fn = k2_prog.bitonicSortKVInPlace
for it in range(2):
    a = np.random.rand(N).astype(np.float32)
    b = np.random.rand(N).astype(np.float32)
    a.fill(0)
    b.fill(0)
    b[0] = 3
    b[1] = 2
    b[2] = 7
    a_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
    b_gpu = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b)
    a_info = struct.pack('iiii', N, 1, 0, 1)
    print('a_info', a_info, 'len', len(a_info))
    for i in range(len(a_info)):
        print(i, a_info[i])
    a_info_buf = cl.Buffer(ctx, mf.READ_ONLY, len(a_info))
    cl.enqueue_write_buffer(q, a_info_buf, a_info).wait()
    q.finish()
    grid = (1,)
    block = (N,)
    globalsize = (grid[0]*block[0],)
    print('grid', grid, 'block', block, 'globalsize', globalsize)
    print('calling k1...')
    k1_fn(q, globalsize, block, a_info_buf, a_gpu, np.int32(1), np.int32(N), np.int32(1))
    q.finish()
    cl.enqueue_copy(q, a, a_gpu)
    print('a', a)
    # sys.exit(1)
    grid = (1,)
    block = (power2//2,)
    globalsize = (grid[0]*block[0],)
    print('grid', grid, 'block', block, 'globalsize', globalsize)
    print('calling k2...')
    k2_fn(q, globalsize, block,
          a_info_buf, b_gpu,
          np.int32(1), np.int32(N), np.int32(1),
          np.int32(power2),
          a_info_buf, a_gpu,
          np.int32(1),
          cl.LocalMemory(power2 * 4),
          cl.LocalMemory(power2 * 4),
          cl.LocalMemory(power2 * 4))
    q.finish()
    cl.enqueue_copy(q, a, a_gpu)
    cl.enqueue_copy(q, b, b_gpu)
    q.finish()
    print('a', a)
    print('b', b)
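# As a quick cross-check (illustrative, assuming the initial keys [3, 2, 7] set
# above): recompute the expected result on the CPU and compare with what the
# kernels wrote back -- b should come back sorted ascending and a should hold
# the matching 1-based source indices.
initial_keys = np.array([3, 2, 7], dtype=np.float32)
expected_order = np.argsort(initial_keys)
expected_keys = initial_keys[expected_order]                # [2., 3., 7.]
expected_values = (expected_order + 1).astype(np.float32)   # [2., 1., 3.]
print('expected b', expected_keys, 'expected a', expected_values)
print('match', np.allclose(b, expected_keys) and np.allclose(a, expected_values))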