ochafik · February 2, 2022 19:04
diff --git a/README.md b/README.md
diff --git a/kernels.cl b/kernels.cl
 // If you change these two values below, change in SiftMatchCL.cpp also
 #define USE_LOCAL_MEMORY 1
 #define WORKGROUP_SIZE 32
 #define VECTOR_TYPE uchar16
 #define VECTOR_LEN 16
 #define VECTOR_ITS (128 / VECTOR_LEN)
 __kernel void FindBestMatchOneWay_Kernel(__global int *result, __global VECTOR_TYPE *des1, __global VECTOR_TYPE *des2, int num1, int num2, float distmax, float ratiomax)
 {
 #if USE_LOCAL_MEMORY
   __local VECTOR_TYPE desc[WORKGROUP_SIZE][VECTOR_ITS+1];
 #endif
   const int i = get_global_id(0);
   if (i < num1)
   {
      // If we're using local memory, then copy this work-item's
      // feature descriptor into a local memory cache.
 #if USE_LOCAL_MEMORY
      const int lid = get_local_id(0);
      for (int k=0; k < VECTOR_ITS; ++k)
         desc[lid][k] = des1[i*VECTOR_ITS + k];
 #endif
      int best_j = -1;
      int best_dist = 0;
      int second_best_dist = 0;
      // Search the vector des2 for the corresponding descriptor
      // that has the highest dot product with descriptor "i" of des1.
      for (int j=0; j < num2; ++j)
      {
         // Compute the dot product for this pair of descriptors
         uint dot = 0;
         for (int d=0; d<VECTOR_ITS; ++d) {
 #if USE_LOCAL_MEMORY
            dot += desc[lid][d].s0 * des2[j*VECTOR_ITS + d].s0;
            dot += desc[lid][d].s1 * des2[j*VECTOR_ITS + d].s1;
            dot += desc[lid][d].s2 * des2[j*VECTOR_ITS + d].s2;
            dot += desc[lid][d].s3 * des2[j*VECTOR_ITS + d].s3;
            dot += desc[lid][d].s4 * des2[j*VECTOR_ITS + d].s4;
            dot += desc[lid][d].s5 * des2[j*VECTOR_ITS + d].s5;
            dot += desc[lid][d].s6 * des2[j*VECTOR_ITS + d].s6;
            dot += desc[lid][d].s7 * des2[j*VECTOR_ITS + d].s7;
            dot += desc[lid][d].s8 * des2[j*VECTOR_ITS + d].s8;
            dot += desc[lid][d].s9 * des2[j*VECTOR_ITS + d].s9;
            dot += desc[lid][d].sA * des2[j*VECTOR_ITS + d].sA;
            dot += desc[lid][d].sB * des2[j*VECTOR_ITS + d].sB;
            dot += desc[lid][d].sC * des2[j*VECTOR_ITS + d].sC;
            dot += desc[lid][d].sD * des2[j*VECTOR_ITS + d].sD;
            dot += desc[lid][d].sE * des2[j*VECTOR_ITS + d].sE;
            dot += desc[lid][d].sF * des2[j*VECTOR_ITS + d].sF;
 #else
            dot += des1[i*VECTOR_ITS + d].s0 * des2[j*VECTOR_ITS + d].s0;
            dot += des1[i*VECTOR_ITS + d].s1 * des2[j*VECTOR_ITS + d].s1;
            dot += des1[i*VECTOR_ITS + d].s2 * des2[j*VECTOR_ITS + d].s2;
            dot += des1[i*VECTOR_ITS + d].s3 * des2[j*VECTOR_ITS + d].s3;
            dot += des1[i*VECTOR_ITS + d].s4 * des2[j*VECTOR_ITS + d].s4;
            dot += des1[i*VECTOR_ITS + d].s5 * des2[j*VECTOR_ITS + d].s5;
            dot += des1[i*VECTOR_ITS + d].s6 * des2[j*VECTOR_ITS + d].s6;
            dot += des1[i*VECTOR_ITS + d].s7 * des2[j*VECTOR_ITS + d].s7;
            dot += des1[i*VECTOR_ITS + d].s8 * des2[j*VECTOR_ITS + d].s8;
            dot += des1[i*VECTOR_ITS + d].s9 * des2[j*VECTOR_ITS + d].s9;
            dot += des1[i*VECTOR_ITS + d].sA * des2[j*VECTOR_ITS + d].sA;
            dot += des1[i*VECTOR_ITS + d].sB * des2[j*VECTOR_ITS + d].sB;
            dot += des1[i*VECTOR_ITS + d].sC * des2[j*VECTOR_ITS + d].sC;
            dot += des1[i*VECTOR_ITS + d].sD * des2[j*VECTOR_ITS + d].sD;
            dot += des1[i*VECTOR_ITS + d].sE * des2[j*VECTOR_ITS + d].sE;
            dot += des1[i*VECTOR_ITS + d].sF * des2[j*VECTOR_ITS + d].sF;
 #endif
         }
         if (dot > best_dist) {
            best_j = j;
            second_best_dist = best_dist;
            best_dist = dot;
         }
         else if (dot > second_best_dist) {
            second_best_dist = dot;
         }
      }
      // The SIFT descriptor vectors are normalized to length 512, so
      // we scale distance by 1/512^2.
      const float best_dist_normed =
         acos(fmin(best_dist * 0.000003814697265625f, 1.0f));
      // Check if match distance fails threshold
      if (best_dist_normed > distmax) {
         result[i] = -1;
         return;
      }
      const float second_best_dist_normed =
         acos(fmin(second_best_dist * 0.000003814697265625f, 1.0f));
      // Check if match fails ratio test
      if (best_dist_normed >= ratiomax * second_best_dist_normed) {
         result[i] = -1;
         return;
      }
      // Record the match as valid!
      result[i] = best_j;
   }
 }
 // Guided filter as implemented in the COLMAP CPU version. Parameter "m"
 // is a 3x3 fundamental matrix or homography matrix, depending on the
 // value of "guided". Parameter "errormax" is the squared maximum
 // residual threshold.
 bool Guided(float x1, float y1, float x2, float y2, int guided, __constant float *m, float errormax)
 {
   float residual2;
   const float a0 = m[0]*x1 + m[1]*y1 + m[2];
   const float a1 = m[3]*x1 + m[4]*y1 + m[5];
   const float a2 = m[6]*x1 + m[7]*y1 + m[8];
   if (!(guided & 1))   // CALIBRATED or UNCALIBRATED two-view geometry
   {   
      const float b0 = m[0]*x2 + m[3]*y2 + m[6];
      const float b1 = m[1]*x2 + m[4]*y2 + m[7];
      const float c = x2*a0 + y2*a1 + a2;
      residual2 = c * c / (a0*a0 + a1*a1 + b0*b0 + b1*b1);
   }
   else               // PLANAR or PANORAMIC two-view geometry
   {
      const float b0 = a0 / a2 - x2;
      const float b1 = a1 / a2 - y2;
      residual2 = b0*b0 + b1*b1;
   }
   return residual2 > errormax;
 __kernel void FindGuidedBestMatchOneWay_Kernel(__global int *result, __global VECTOR_TYPE *des1, __global VECTOR_TYPE *des2, __global float *loc1, __global float *loc2, int num1, int num2, int guided,    // 0 = Fundamental matrix, first pass
                     // 1 = Homography matrix, first pass
                     // 2 = Fundamental matrix, second pass of cross-check
                     // 3 = Homography matrix, second pass of cross-check
      __constant float *gm, // Fundamental or homography matrix for guided filter
      float distmax, float ratiomax, float errormax)
 #if USE_LOCAL_MEMORY
   __local VECTOR_TYPE desc[WORKGROUP_SIZE][VECTOR_ITS+1];
 #endif
   const int i = get_global_id(0);
   if (i < num1)
   {
      // If we're using local memory, then copy this work-item's
      // feature descriptor into a local memory cache.
 #if USE_LOCAL_MEMORY
      const int lid = get_local_id(0);
      for (int k=0; k < VECTOR_ITS; ++k)
         desc[lid][k] = des1[i*VECTOR_ITS + k];
 #endif
      int best_j = -1;
      int best_dist = 0;
      int second_best_dist = 0;
      // Search the vector des2 for the corresponding descriptor
      // that has the highest dot product with descriptor "i" of des1.
      for (int j=0; j < num2; ++j)
      {
         float x1, y1, x2, y2;
         uint dot = 0;
         
         // If this is the cross-check pass, we swap the order of the
         // pixel coordinates, since the gm matrix was computed as the
         // transform from the first image to the second.
         if (guided & 2)      
         {
            x1 = loc2[2*j];
            y1 = loc2[2*j+1];
            x2 = loc1[2*i];
            y2 = loc1[2*i+1];
         }
         else
         {
            x1 = loc1[2*i];
            y1 = loc1[2*i+1];
            x2 = loc2[2*j];
            y2 = loc2[2*j+1];
         }
         // Now apply the guided filter, and if the feature pair is not
         // filtered out, then go ahead and compute the matching score.
         if (!Guided(x1, y1, x2, y2, guided, gm, errormax))
         {
            // Compute the dot product for this pair of descriptors
            for (int d=0; d<VECTOR_ITS; ++d) {
 #if USE_LOCAL_MEMORY
               dot += desc[lid][d].s0 * des2[j*VECTOR_ITS + d].s0;
               dot += desc[lid][d].s1 * des2[j*VECTOR_ITS + d].s1;
               dot += desc[lid][d].s2 * des2[j*VECTOR_ITS + d].s2;
               dot += desc[lid][d].s3 * des2[j*VECTOR_ITS + d].s3;
               dot += desc[lid][d].s4 * des2[j*VECTOR_ITS + d].s4;
               dot += desc[lid][d].s5 * des2[j*VECTOR_ITS + d].s5;
               dot += desc[lid][d].s6 * des2[j*VECTOR_ITS + d].s6;
               dot += desc[lid][d].s7 * des2[j*VECTOR_ITS + d].s7;
               dot += desc[lid][d].s8 * des2[j*VECTOR_ITS + d].s8;
               dot += desc[lid][d].s9 * des2[j*VECTOR_ITS + d].s9;
               dot += desc[lid][d].sA * des2[j*VECTOR_ITS + d].sA;
               dot += desc[lid][d].sB * des2[j*VECTOR_ITS + d].sB;
               dot += desc[lid][d].sC * des2[j*VECTOR_ITS + d].sC;
               dot += desc[lid][d].sD * des2[j*VECTOR_ITS + d].sD;
               dot += desc[lid][d].sE * des2[j*VECTOR_ITS + d].sE;
               dot += desc[lid][d].sF * des2[j*VECTOR_ITS + d].sF;
 #else
               dot += des1[i*VECTOR_ITS + d].s0 * des2[j*VECTOR_ITS + d].s0;
               dot += des1[i*VECTOR_ITS + d].s1 * des2[j*VECTOR_ITS + d].s1;
               dot += des1[i*VECTOR_ITS + d].s2 * des2[j*VECTOR_ITS + d].s2;
               dot += des1[i*VECTOR_ITS + d].s3 * des2[j*VECTOR_ITS + d].s3;
               dot += des1[i*VECTOR_ITS + d].s4 * des2[j*VECTOR_ITS + d].s4;
               dot += des1[i*VECTOR_ITS + d].s5 * des2[j*VECTOR_ITS + d].s5;
               dot += des1[i*VECTOR_ITS + d].s6 * des2[j*VECTOR_ITS + d].s6;
               dot += des1[i*VECTOR_ITS + d].s7 * des2[j*VECTOR_ITS + d].s7;
               dot += des1[i*VECTOR_ITS + d].s8 * des2[j*VECTOR_ITS + d].s8;
               dot += des1[i*VECTOR_ITS + d].s9 * des2[j*VECTOR_ITS + d].s9;
               dot += des1[i*VECTOR_ITS + d].sA * des2[j*VECTOR_ITS + d].sA;
               dot += des1[i*VECTOR_ITS + d].sB * des2[j*VECTOR_ITS + d].sB;
               dot += des1[i*VECTOR_ITS + d].sC * des2[j*VECTOR_ITS + d].sC;
               dot += des1[i*VECTOR_ITS + d].sD * des2[j*VECTOR_ITS + d].sD;
               dot += des1[i*VECTOR_ITS + d].sE * des2[j*VECTOR_ITS + d].sE;
               dot += des1[i*VECTOR_ITS + d].sF * des2[j*VECTOR_ITS + d].sF;
 #endif
            }
         if (dot > best_dist) {
         
 best_j = j;
                second_best_dist = best_dist;
                best_dist = dot;
         }
         else if (dot > second_best_dist) {
                second_best_dist = dot;
         }
      }
      // The SIFT descriptor vectors are normalized to length 512, so
      // we scale distance by 1/512^2.
      const float best_dist_normed =
         acos(fmin(best_dist * 0.000003814697265625f, 1.0f));
      // Check if match distance fails threshold
      if (best_dist_normed > distmax) {
         result[i] = -1;
         return;
      }
      const float second_best_dist_normed =
         acos(fmin(second_best_dist * 0.000003814697265625f, 1.0f));
      // Check if match fails ratio test
      if (best_dist_normed >= ratiomax * second_best_dist_normed) {
         result[i] = -1;
         return;
      }
      // Record the match as valid!
      result[i] = best_j;
   }
 }



 #define KWINDOWSIZE (KWINDOWRADIUS*2 + 1)
 static inline void Mat33DotVec3(const float mat[9], const float vec[3], float result[3]) {
  result[0] = mat[0] * vec[0] + mat[1] * vec[1] + mat[2] * vec[2];
  result[1] = mat[3] * vec[0] + mat[4] * vec[1] + mat[5] * vec[2];
  result[2] = mat[6] * vec[0] + mat[7] * vec[1] + mat[8] * vec[2];
 }
 static inline void Mat33DotVec3Homogeneous(const float mat[9], const float vec[2], float result[2]) {
  const float inv_z = 1.0f / (mat[6] * vec[0] + mat[7] * vec[1] + mat[8]);
  result[0] = inv_z * (mat[0] * vec[0] + mat[1] * vec[1] + mat[2]);
  result[1] = inv_z * (mat[3] * vec[0] + mat[4] * vec[1] + mat[5]);
 }
 static inline float DotProduct3(const float vec1[3], const float vec2[3]) {
  return vec1[0] * vec2[0] + vec1[1] * vec2[1] + vec1[2] * vec2[2];
 __constant sampler_t PosesImageSampler = CLK_NORMALIZED_COORDS_FALSE |
                                         CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 }
 static inline void ComputeViewingAngles(const float point[3], const float normal[3], const int image_idx, float* cos_triangulation_angle, float* cos_incident_angle, image2d_array_t poses_images) {
  *cos_triangulation_angle = 0.0f;
  *cos_incident_angle = 0.0f;
  float C[3];
  for (int i = 0; i < 3; ++i) {
    C[i] = read_imagef(poses_images, PosesImageSampler, (int4)(i + 16, image_idx, 0, 0)).x;
  const float SX[3] = {C[0] - point[0], C[1] - point[1], C[2] - point[2]};
  const float RX_inv_norm = rsqrt(DotProduct3(point, point));
  const float SX_inv_norm = rsqrt(DotProduct3(SX, SX));
  *cos_incident_angle = DotProduct3(SX, normal) * SX_inv_norm;
  *cos_triangulation_angle = DotProduct3(SX, point) * RX_inv_norm * SX_inv_norm;
 }
 static inline void ComposeHomography(image2d_array_t poses_images, const int image_idx, const int row, const int col, const float depth, const float normal[3], float H[9], __constant float * ref_inv_K) {
  float K[4];
  for (int i = 0; i < 4; ++i) {
    K[i] = read_imagef(poses_images, PosesImageSampler, (int4)(i, image_idx, 0, 0)).x;
  float R[9];
  for (int i = 0; i < 9; ++i) {
    R[i] = read_imagef(poses_images, PosesImageSampler, (int4)(i + 4, image_idx, 0, 0)).x;
  float T[3];
  for (int i = 0; i < 3; ++i) {
    T[i] = read_imagef(poses_images, PosesImageSampler, (int4)(i + 13, image_idx, 0, 0)).x;
  const float dist =
      depth * (normal[0] * (ref_inv_K[0] * col + ref_inv_K[1]) +
               normal[1] * (ref_inv_K[2] * row + ref_inv_K[3]) + normal[2]);
  const float inv_dist = 1.0f / dist;
  const float inv_dist_N0 = inv_dist * normal[0];
  const float inv_dist_N1 = inv_dist * normal[1];
  const float inv_dist_N2 = inv_dist * normal[2];
  H[0] = ref_inv_K[0] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
                         K[1] * (R[6] + inv_dist_N0 * T[2]));
  H[1] = ref_inv_K[2] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
                         K[1] * (R[7] + inv_dist_N1 * T[2]));
  H[2] = K[0] * (R[2] + inv_dist_N2 * T[0]) +
         K[1] * (R[8] + inv_dist_N2 * T[2]) +
         ref_inv_K[1] * (K[0] * (R[0] + inv_dist_N0 * T[0]) +
                         K[1] * (R[6] + inv_dist_N0 * T[2])) +
         ref_inv_K[3] * (K[0] * (R[1] + inv_dist_N1 * T[0]) +
                         K[1] * (R[7] + inv_dist_N1 * T[2]));
  H[3] = ref_inv_K[0] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
                         K[3] * (R[6] + inv_dist_N0 * T[2]));
  H[4] = ref_inv_K[2] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
                         K[3] * (R[7] + inv_dist_N1 * T[2]));
  H[5] = K[2] * (R[5] + inv_dist_N2 * T[1]) +
         K[3] * (R[8] + inv_dist_N2 * T[2]) +
         ref_inv_K[1] * (K[2] * (R[3] + inv_dist_N0 * T[1]) +
                         K[3] * (R[6] + inv_dist_N0 * T[2])) +
         ref_inv_K[3] * (K[2] * (R[4] + inv_dist_N1 * T[1]) +
                         K[3] * (R[7] + inv_dist_N1 * T[2]));
  H[6] = ref_inv_K[0] * (R[6] + inv_dist_N0 * T[2]);
  H[7] = ref_inv_K[2] * (R[7] + inv_dist_N1 * T[2]);
  H[8] = R[8] + ref_inv_K[1] * (R[6] + inv_dist_N0 * T[2]) +
         ref_inv_K[3] * (R[7] + inv_dist_N1 * T[2]) + inv_dist_N2 * T[2];
 __constant sampler_t ReadRefImageSampler = CLK_NORMALIZED_COORDS_FALSE |
                                           CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 }
 static inline void ReadRefImageIntoSharedMemory(image2d_array_t ref_image, __local float* local_image, const int row, const int col, const int thread_id) {
  barrier(CLK_LOCAL_MEM_FENCE);
  if (row == 0) {
    int r = row - KWINDOWSIZE / 2;
    for (int i = 0; i < KWINDOWSIZE; ++i) {
      int c = col - LOCAL_MEM_WORK_GROUP_SIZE;
      for (int j = 0; j < 3; ++j) {
        local_image[thread_id + i * 3 * LOCAL_MEM_WORK_GROUP_SIZE +
                    j * LOCAL_MEM_WORK_GROUP_SIZE] = read_imagef(ref_image, ReadRefImageSampler, (int4)(c, r, 0, 0)).x;
        c += LOCAL_MEM_WORK_GROUP_SIZE;
      }
      r += 1;
    }
  } else {
    for (int i = 1; i < KWINDOWSIZE; ++i) {
      for (int j = 0; j < 3; ++j) {
        local_image[thread_id + (i - 1) * 3 * LOCAL_MEM_WORK_GROUP_SIZE +
                    j * LOCAL_MEM_WORK_GROUP_SIZE] =
            local_image[thread_id + i * 3 * LOCAL_MEM_WORK_GROUP_SIZE +
                        j * LOCAL_MEM_WORK_GROUP_SIZE];
      }
    }
    const int r = row + KWINDOWSIZE / 2;
    int c = col - LOCAL_MEM_WORK_GROUP_SIZE;
    const int i = KWINDOWSIZE - 1;
    for (int j = 0; j < 3; ++j) {
      local_image[thread_id + i * 3 * LOCAL_MEM_WORK_GROUP_SIZE +
                  j * LOCAL_MEM_WORK_GROUP_SIZE] = read_imagef(ref_image, ReadRefImageSampler, (int4)(c, r, 0, 0)).x;
      c += LOCAL_MEM_WORK_GROUP_SIZE;
    }
  barrier(CLK_LOCAL_MEM_FENCE);
 }
 __constant sampler_t PccSrcImageSampler = CLK_NORMALIZED_COORDS_FALSE |
                                           CLK_ADDRESS_CLAMP | CLK_FILTER_LINEAR;
 static inline float pcc_compute(const float spatial_normalization_, const float color_normalization_, const float kMaxCost, int row, int col, float depth, const float* normal, const __local float* local_ref_image, float local_ref_sum, float local_ref_squared_sum, int src_image_idx, image2d_array_t src_images, image2d_array_t poses_images, __constant float * ref_inv_K) {
    float tform[9];
    ComposeHomography(poses_images, src_image_idx, row, col, depth, normal, tform, ref_inv_K);
    float x_step[3], y_step[3];
    for (int i=0; i<3; ++i) {
        x_step[i] = KWINDOWSTEP * tform[i*3];
        y_step[i] = KWINDOWSTEP * tform[i*3+1];
    }
    
    const int thread_id = get_local_id(0);
    const int row_start = row - KWINDOWRADIUS;
    const int col_start = col - KWINDOWRADIUS;
    float col_src = tform[0] * col_start + tform[1] * row_start + tform[2];
    float row_src = tform[3] * col_start + tform[4] * row_start + tform[5];
    float z = tform[6] * col_start + tform[7] * row_start + tform[8];
    float base_col_src = col_src;
    float base_row_src = row_src;
    float base_z = z;
    int ref_image_idx = LOCAL_MEM_WORK_GROUP_SIZE - KWINDOWRADIUS + thread_id;
    int ref_image_base_idx = ref_image_idx;
    const float ref_center_color =
        local_ref_image[ref_image_idx + KWINDOWRADIUS * 3 * LOCAL_MEM_WORK_GROUP_SIZE +
                        KWINDOWRADIUS];
    const float ref_color_sum = local_ref_sum;
    const float ref_color_squared_sum = local_ref_squared_sum;
    float src_color_sum = 0.0f;
    float src_color_squared_sum = 0.0f;
    float src_ref_color_sum = 0.0f;
    float bilateral_weight_sum = 0.0f;
    for (int row = -KWINDOWRADIUS; row <= KWINDOWRADIUS; row += KWINDOWSTEP) {
      for (int col = -KWINDOWRADIUS; col <= KWINDOWRADIUS; col += KWINDOWSTEP) {
        const float inv_z = 1.0f / z;
        const float norm_col_src = inv_z * col_src + 0.5f;
        const float norm_row_src = inv_z * row_src + 0.5f;
        const float ref_color = local_ref_image[ref_image_idx];
        const float src_color = read_imagef(src_images, PccSrcImageSampler, (float4)(norm_col_src, norm_row_src, src_image_idx, 0)).x;
        const float spatial_dist_squared = row * row + col * col;
        const float color_dist = ref_center_color - ref_color;
        const float bilateral_weight = exp(-spatial_dist_squared * spatial_normalization_ -
                                         color_dist * color_dist * color_normalization_);
        const float bilateral_weight_src = bilateral_weight * src_color;
        src_color_sum += bilateral_weight_src;
        src_color_squared_sum += bilateral_weight_src * src_color;
        src_ref_color_sum += bilateral_weight_src * ref_color;
        bilateral_weight_sum += bilateral_weight;
        ref_image_idx += KWINDOWSTEP;
        col_src += x_step[0];
        row_src += x_step[1];
        z += x_step[2];
      }
      ref_image_base_idx += KWINDOWSTEP * 3 * LOCAL_MEM_WORK_GROUP_SIZE;
      ref_image_idx = ref_image_base_idx;
      base_col_src += y_step[0];
      base_row_src += y_step[1];
      base_z += y_step[2];
      col_src = base_col_src;
      row_src = base_row_src;
      z = base_z;
    }
    const float inv_bilateral_weight_sum = 1.0f / bilateral_weight_sum;
    src_color_sum *= inv_bilateral_weight_sum;
    src_color_squared_sum *= inv_bilateral_weight_sum;
    src_ref_color_sum *= inv_bilateral_weight_sum;
    const float ref_color_var =
        ref_color_squared_sum - ref_color_sum * ref_color_sum;
    const float src_color_var =
        src_color_squared_sum - src_color_sum * src_color_sum;
    const float kMinVar = 1e-5f;
    float firstCost = 0.0f;
    if (ref_color_var < kMinVar || src_color_var < kMinVar) {
      return kMaxCost;
    } else {
      const float src_ref_color_covar =
          src_ref_color_sum - ref_color_sum * src_color_sum;
      const float src_ref_color_var = sqrt(ref_color_var * src_color_var);
      return max(0.0f, min(kMaxCost, 1.0f - src_ref_color_covar / src_ref_color_var));
    }
 }
 static inline void ComputePointAtDepth(const float row, const float col, const float depth, float point[3], __constant float * ref_inv_K) {
  point[0] = depth * (ref_inv_K[0] * col + ref_inv_K[1]);
  point[1] = depth * (ref_inv_K[2] * row + ref_inv_K[3]);
  point[2] = depth;
 __constant sampler_t SrcDepthMapImageSampler = CLK_NORMALIZED_COORDS_FALSE |
                                         CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 }
 static inline float ComputeGeomConsistencyCost(const float row, const float col, const float depth, const int image_idx, const float max_cost, image2d_array_t poses_images, image2d_array_t src_depth_map_images, __constant float * ref_inv_K, __constant float * ref_K)
 {
  float P[12];
  for (int i = 0; i < 12; ++i) {
    P[i] = read_imagef(poses_images, PosesImageSampler, (int4)(i + 19, image_idx, 0, 0)).x;
  float inv_P[12];
  for (int i = 0; i < 12; ++i) {
    inv_P[i] = read_imagef(poses_images, PosesImageSampler, (int4)(i + 31, image_idx, 0, 0)).x;
  float forward_point[3];
  ComputePointAtDepth(row, col, depth, forward_point, ref_inv_K);
  const float inv_forward_z =
      1.0f / (P[8] * forward_point[0] + P[9] * forward_point[1] +
              P[10] * forward_point[2] + P[11]);
  float src_col =
      inv_forward_z * (P[0] * forward_point[0] + P[1] * forward_point[1] +
                       P[2] * forward_point[2] + P[3]);
  float src_row =
      inv_forward_z * (P[4] * forward_point[0] + P[5] * forward_point[1] +
                       P[6] * forward_point[2] + P[7]);
  const float src_depth = read_imagef(src_depth_map_images, SrcDepthMapImageSampler, (float4)(src_col + 0.5f, src_row + 0.5f, image_idx, 0)).x;
  if (src_depth == 0.0f) {
    return max_cost;
  }
  src_col *= src_depth;
  src_row *= src_depth;
  const float backward_point_x =
      inv_P[0] * src_col + inv_P[1] * src_row + inv_P[2] * src_depth + inv_P[3];
  const float backward_point_y =
      inv_P[4] * src_col + inv_P[5] * src_row + inv_P[6] * src_depth + inv_P[7];
  const float backward_point_z = inv_P[8] * src_col + inv_P[9] * src_row +
                                 inv_P[10] * src_depth + inv_P[11];
  const float inv_backward_point_z = 1.0f / backward_point_z;
  const float backward_col =
      inv_backward_point_z *
      (ref_K[0] * backward_point_x + ref_K[1] * backward_point_z);
  const float backward_row =
      inv_backward_point_z *
      (ref_K[2] * backward_point_y + ref_K[3] * backward_point_z);
  const float diff_col = col - backward_col;
  const float diff_row = row - backward_row;
  return min(max_cost, sqrt(diff_col * diff_col + diff_row * diff_row));
 }
 __attribute__((reqd_work_group_size(LOCAL_MEM_WORK_GROUP_SIZE, 1, 1)))
 __kernel void ComputeInitialCostKernel(__global float *cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float *depth_map, int depth_map_pitch, __global float *normal_map, int normal_map_height, int normal_map_depth, int normal_map_pitch, __global float *ref_sum_image, int ref_sum_pitch, __global float *ref_squared_sum_image, int ref_squared_sum_pitch, float sigma_spatial, float sigma_color, image2d_array_t ref_image, image2d_array_t src_images, image2d_array_t poses_images, __constant float *ref_inv_K, __local float *local_ref_image) {
  const int thread_id = get_local_id(0);
  const int col = get_global_id(0);
  const float spatial_normalization_ = (1.0f / (2.0f * sigma_spatial * sigma_spatial));
  const float color_normalization_ = (1.0f / (2.0f * sigma_color * sigma_color));
  const float pcc_computer_kMaxCost = 2.0f;
  const __local float * pcc_computer_local_ref_image = local_ref_image;
  int pcc_computer_row = 0;
  int pcc_computer_col = col;
  float pcc_computer_depth = 0.0f;
  float normal[3];
  const float * pcc_computer_normal = normal;
  float pcc_computer_local_ref_sum = 0.0f;
  float pcc_computer_local_ref_squared_sum = 0.0f;
  int pcc_computer_src_image_idx = -1;
  for (int row = 0; row < cost_map_height; ++row) {
    ReadRefImageIntoSharedMemory(ref_image, local_ref_image, row, col, thread_id);
    if (col < cost_map_width) {
      pcc_computer_depth = *((__global float*)((__global char*)depth_map + depth_map_pitch * row) + col);
      for (int slice = 0; slice < normal_map_depth; ++slice) {
        normal[slice] = *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col);
      }
      pcc_computer_local_ref_sum = *((__global float*)((__global char*)ref_sum_image + ref_sum_pitch * row) + col); 
      pcc_computer_local_ref_squared_sum = *((__global float*)((__global char*)ref_squared_sum_image + ref_squared_sum_pitch * row) + col);
      for (int image_idx = 0; image_idx < cost_map_depth; ++image_idx) {
        pcc_computer_src_image_idx = image_idx;
 float pcc_computer_result = pcc_compute(spatial_normalization_, color_normalization_, pcc_computer_kMaxCost, pcc_computer_row, pcc_computer_col, pcc_computer_depth, pcc_computer_normal, pcc_computer_local_ref_image, pcc_computer_local_ref_sum, pcc_computer_local_ref_squared_sum, pcc_computer_src_image_idx, src_images, poses_images, ref_inv_K);
        *((__global float*)((__global char*)cost_map + cost_map_pitch * (image_idx * cost_map_height + row)) + col) = pcc_computer_result;
      }
      pcc_computer_row += 1;
    }
  }
 }
 typedef struct {
   unsigned int d, v[5];
 } clrandState;
 __kernel void InitRandomStateKernel(__global clrandState *output, int width, int height, uint pitch)
 {
   const uint row = get_global_id(1);
   const uint col = get_global_id(0);
   if (col < width && row < height) {
      const uint id = row*width + col;
      __global clrandState *state =
         (__global clrandState*)((__global char*)output + pitch*row) + col;
      state->v[0] = 123456789U;
      state->v[1] = 362436069U;
      state->v[2] = 521288629U;
      state->v[3] = 88675123U;
      state->v[4] = 5783321U;
      state->d = 6615241U;
      const uint s0 = id ^ 0x2c7f967fU;
      const uint s1 = 0 ^ 0xa03697cbU;
      const uint t0 = 1228688033U * s0;
      const uint t1 = 2073658381U * s1;
      state->v[0] += t0;
      state->v[1] ^= t0;
      state->v[2] += t1;
      state->v[3] ^= t1;
      state->v[4] += t0;
      state->d += t1 + t0;
   }
 }
 static inline float clrand_uniform(clrandState* state)
 {
   const float uint_inc = 2.3283064e-10f;
   unsigned int t = state->v[0] ^ (state->v[0] >> 2);
   state->v[0] = state->v[1];
   state->v[1] = state->v[2];
   state->v[2] = state->v[3];
   state->v[3] = state->v[4];
   state->v[4] = (state->v[4] ^ (state->v[4] << 4)) ^ (t ^ (t << 1));
   state->d += 362437;
   t = state->d + state->v[4];
   return uint_inc + (t * uint_inc);
 }
 static inline void GenerateRandomNormal(const int row, const int col, clrandState* rand_state, float normal[3], __constant float * ref_inv_K) {
  float v1 = 0.0f;
  float v2 = 0.0f;
  float s = 2.0f;
  while (s >= 1.0f) {
    v1 = 2.0f * clrand_uniform(rand_state) - 1.0f;
    v2 = 2.0f * clrand_uniform(rand_state) - 1.0f;
    s = v1 * v1 + v2 * v2;
  }
  const float s_norm = sqrt(1.0f - s);
  normal[0] = 2.0f * v1 * s_norm;
  normal[1] = 2.0f * v2 * s_norm;
  normal[2] = 1.0f - 2.0f * s;
  const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1], ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
  if (DotProduct3(normal, view_ray) > 0) {
    normal[0] = -normal[0];
    normal[1] = -normal[1];
    normal[2] = -normal[2];
  }
 }
 static inline float GenerateRandomDepth(const float depth_min, const float depth_max, clrandState* rand_state) {
  return clrand_uniform(rand_state) * (depth_max - depth_min) + depth_min;
 }
 static inline float PerturbDepth(const float perturbation, const float depth, clrandState* rand_state) {
  const float depth_min = (1.0f - perturbation) * depth;
  const float depth_max = (1.0f + perturbation) * depth;
  return GenerateRandomDepth(depth_min, depth_max, rand_state);
 }
 static inline void PerturbNormal(const int row, const int col, float perturbation, const float normal[3], clrandState* rand_state, float perturbed_normal[3], __constant float * ref_inv_K) {
  int num_trials = 0;
  do {
    const float a1 = (clrand_uniform(rand_state) - 0.5f) * perturbation;
    const float a2 = (clrand_uniform(rand_state) - 0.5f) * perturbation;
    const float a3 = (clrand_uniform(rand_state) - 0.5f) * perturbation;
    const float sin_a1 = sin(a1);
    const float sin_a2 = sin(a2);
    const float sin_a3 = sin(a3);
    const float cos_a1 = cos(a1);
    const float cos_a2 = cos(a2);
    const float cos_a3 = cos(a3);
    float R[9];
    R[0] = cos_a2 * cos_a3;
    R[1] = -cos_a2 * sin_a3;
    R[2] = sin_a2;
    R[3] = cos_a1 * sin_a3 + cos_a3 * sin_a1 * sin_a2;
    R[4] = cos_a1 * cos_a3 - sin_a1 * sin_a2 * sin_a3;
    R[5] = -cos_a2 * sin_a1;
    R[6] = sin_a1 * sin_a3 - cos_a1 * cos_a3 * sin_a2;
    R[7] = cos_a3 * sin_a1 + cos_a1 * sin_a2 * sin_a3;
    R[8] = cos_a1 * cos_a2;
    Mat33DotVec3(R, normal, perturbed_normal);
    const float view_ray[3] = {ref_inv_K[0] * col + ref_inv_K[1], ref_inv_K[2] * row + ref_inv_K[3], 1.0f};
    if (DotProduct3(perturbed_normal, view_ray) < 0.0f) {
      const float inv_norm = rsqrt(DotProduct3(perturbed_normal, perturbed_normal));
      perturbed_normal[0] *= inv_norm;
      perturbed_normal[1] *= inv_norm;
      perturbed_normal[2] *= inv_norm;
      return;
    }
    perturbation *= 0.5f;
    ++num_trials;
  }
  while (num_trials < 4);
  perturbed_normal[0] = normal[0];
  perturbed_normal[1] = normal[1];
  perturbed_normal[2] = normal[2];
 }
 static inline float PropagateDepth(__constant float * ref_inv_K, const float depth1, const float normal1[3], const float row1, const float row2) {
  const float x1 = depth1 * (ref_inv_K[2] * row1 + ref_inv_K[3]);
  const float y1 = depth1;
  const float x2 = x1 + normal1[2];
  const float y2 = y1 - normal1[1];
  const float x4 = ref_inv_K[2] * row2 + ref_inv_K[3];
  const float denom = x2 - x1 + x4 * (y1 - y2);
  const float kEps = 1e-5f;
  if (fabs(denom) < kEps) {
    return depth1;
  }
  const float nom = y1 * x2 - x1 * y2;
  return nom / denom;
 }
 static inline float likelihood_computer_ComputeNCCProb(const float cost, float likelihood_computer_inv_ncc_sigma_square_, float likelihood_computer_ncc_norm_factor_) {
    return exp(cost * cost * likelihood_computer_inv_ncc_sigma_square_) * likelihood_computer_ncc_norm_factor_;
 }
 static inline float likelihood_computer_ComputeTriProb(const float cos_triangulation_angle, float likelihood_computer_cos_min_triangulation_angle_) {
    const float abs_cos_triangulation_angle = fabs(cos_triangulation_angle);
    if (abs_cos_triangulation_angle > likelihood_computer_cos_min_triangulation_angle_) {
      const float scaled = 1.0f - (1.0f - abs_cos_triangulation_angle) /
                                      (1.0f - likelihood_computer_cos_min_triangulation_angle_);
      const float likelihood = 1.0f - scaled * scaled;
      return min(1.0f, max(0.0f, likelihood));
    } else {
      return 1.0f;
    }
 }
 static inline float likelihood_computer_ComputeIncProb(const float cos_incident_angle, float likelihood_computer_inv_incident_angle_sigma_square_) {
    const float x = 1.0f - max(0.0f, cos_incident_angle);
    return exp(x * x * likelihood_computer_inv_incident_angle_sigma_square_);
 }
 static inline float likelihood_computer_ComputeSelProb(const float alpha, const float beta, const float prev, const float prev_weight) {
    const float zn0 = (1.0f - alpha) * (1.0f - beta);
    const float zn1 = alpha * beta;
    const float curr = zn1 / (zn0 + zn1);
    return prev_weight * prev + (1.0f - prev_weight) * curr;
 }
 static inline float likelihood_computer_ComputeResolutionProb(const float H[9], const float row, const float col) {
    float src1[2];
    const float ref1[2] = {col - KWINDOWRADIUS, row - KWINDOWRADIUS};
    Mat33DotVec3Homogeneous(H, ref1, src1);
    float src2[2];
    const float ref2[2] = {col - KWINDOWRADIUS, row + KWINDOWRADIUS};
    Mat33DotVec3Homogeneous(H, ref2, src2);
    float src3[2];
    const float ref3[2] = {col + KWINDOWRADIUS, row + KWINDOWRADIUS};
    Mat33DotVec3Homogeneous(H, ref3, src3);
    float src4[2];
    const float ref4[2] = {col + KWINDOWRADIUS, row - KWINDOWRADIUS};
    Mat33DotVec3Homogeneous(H, ref4, src4);
    const float ref_area = KWINDOWSIZE * KWINDOWSIZE;
    const float src_area =
        fabs(0.5f * (src1[0] * src2[1] - src2[0] * src1[1] - src1[0] * src4[1] +
                     src2[0] * src3[1] - src3[0] * src2[1] + src4[0] * src1[1] +
                     src3[0] * src4[1] - src4[0] * src3[1]));
    if (ref_area > src_area) {
      return src_area / ref_area;
    } else {
      return ref_area / src_area;
    }
 }
 static inline float likelihood_computer_ComputeMessage(bool kForward, const float cost, const float prev, float likelihood_computer_inv_ncc_sigma_square_, float likelihood_computer_ncc_norm_factor_) {
    const float kUniformProb = 0.5f;
    const float kNoChangeProb = 0.99999f;
    const float kChangeProb = 1.0f - kNoChangeProb;
    const float emission = likelihood_computer_ComputeNCCProb(cost, likelihood_computer_inv_ncc_sigma_square_, likelihood_computer_ncc_norm_factor_);
    float zn0;
    float zn1;
    if (kForward) {
      zn0 = (prev * kChangeProb + (1.0f - prev) * kNoChangeProb) * kUniformProb;
      zn1 = (prev * kNoChangeProb + (1.0f - prev) * kChangeProb) * emission;
    } else {
      zn0 = prev * emission * kChangeProb +
            (1.0f - prev) * kUniformProb * kNoChangeProb;
      zn1 = prev * emission * kNoChangeProb +
            (1.0f - prev) * kUniformProb * kChangeProb;
    }
    return zn1 / (zn0 + zn1);
 }
 static inline float likelihood_computer_ComputeForwardMessage(const float cost, const float prev, float likelihood_computer_inv_ncc_sigma_square_, float likelihood_computer_ncc_norm_factor_) {
    return likelihood_computer_ComputeMessage(true, cost, prev, likelihood_computer_inv_ncc_sigma_square_, likelihood_computer_ncc_norm_factor_);
 }
 static inline float likelihood_computer_ComputeBackwardMessage(const float cost, const float prev, float likelihood_computer_inv_ncc_sigma_square_, float likelihood_computer_ncc_norm_factor_) {
    return likelihood_computer_ComputeMessage(false, cost, prev, likelihood_computer_inv_ncc_sigma_square_, likelihood_computer_ncc_norm_factor_);
 }
 static inline float likelihood_computer_ComputeNCCCostNormFactor(const float ncc_sigma) {
  return 2.0f / (sqrt(2.0f * M_PI_F) * ncc_sigma *
                 erf(2.0f / (ncc_sigma * 1.414213562f)));
 }
 typedef struct {
  float depth;
  float normal[3];
 } ParamState;
 __kernel void ComputeBackwardMessagesKernel(__global float *global_workspace, int global_workspace_height, __global float *cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float *sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, float ncc_sigma)
 {
  const int col = get_global_id(0);
  const float kUniformProb = 0.5f;
  float likelihood_computer_inv_ncc_sigma_square_ = -0.5f / (ncc_sigma * ncc_sigma);
  float likelihood_computer_ncc_norm_factor_ = likelihood_computer_ComputeNCCCostNormFactor(ncc_sigma);
  __global float* forward_message = &global_workspace[col * global_workspace_height];
  if (col < cost_map_width) {
    for (int image_idx = 0; image_idx < cost_map_depth; ++image_idx) {
      float beta = kUniformProb;
      for (int row = cost_map_height - 1; row >= 0; --row) {
        const float cost = *((__global float*)((__global char*)cost_map + cost_map_pitch * (image_idx * cost_map_height + row)) + col);
        beta = likelihood_computer_ComputeBackwardMessage(cost, beta, likelihood_computer_inv_ncc_sigma_square_, likelihood_computer_ncc_norm_factor_);
        *((__global float*)((__global char*)sel_prob_map + sel_prob_map_pitch * (image_idx * sel_prob_map_height + row)) + col) = beta;
      }
      forward_message[image_idx] = kUniformProb;
    }
  }
 }
 __kernel void HypothesesAndProbsKernel(int row, __global float *global_workspace, int global_workspace_width, int global_workspace_height, __global float *hypotheses_workspace, int hypotheses_workspace_height, __global clrandState *rand_state_map, __global float *cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float *depth_map, int depth_map_pitch, __global float *normal_map, int normal_map_height, int normal_map_pitch, __global float *sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, __global float *prev_sel_prob_map, int prev_sel_prob_map_height, int prev_sel_prob_map_pitch, float min_triangulation_angle, float incident_angle_sigma, float ncc_sigma, float perturbation, float prev_sel_prob_weight, image2d_array_t poses_images,
 #if HYPOTHESIS_NUMBER > 0
                                       __constant float *ref_inv_K, __global float *scaledfilt1_depth_map, __global float *scaledfilt1_normal_map, __global float *scaledfilt2_depth_map, __global float *scaledfilt2_normal_map)
 #else
                                       __constant float *ref_inv_K)
 #endif
 {
  const int col = get_global_id(0);
  if (col >= cost_map_width)
    return;
  ParamState prev_param_state;
  ParamState curr_param_state;
  ParamState rand_param_state;
  curr_param_state.depth = *((__global float*)((__global char*)depth_map + depth_map_pitch * row) + col);
  for (int slice = 0; slice < 3; ++slice)
    curr_param_state.normal[slice] = *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col);
  if (row == 0) {
    prev_param_state.depth = *(depth_map + col);
    for (int slice = 0; slice < 3; ++slice)
      prev_param_state.normal[slice] = *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + 0)) + col);
  else {
    prev_param_state.depth = *((__global float*)((__global char*)depth_map + depth_map_pitch * (row-1)) + col);
    for (int slice = 0; slice < 3; ++slice)
      prev_param_state.normal[slice] = *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + (row-1))) + col);
  prev_param_state.depth = PropagateDepth(ref_inv_K, prev_param_state.depth, prev_param_state.normal, row - 1, row);
  clrandState rand_state = *(rand_state_map + col);
  rand_param_state.depth = PerturbDepth(perturbation, curr_param_state.depth, &rand_state);
  PerturbNormal(row, col, perturbation * M_PI_F, curr_param_state.normal, &rand_state, rand_param_state.normal, ref_inv_K);
  *(rand_state_map + col) = rand_state;
 #define SET_HYPOTHESIS(h, slice) (hypotheses_workspace[(slice)*global_workspace_width*hypotheses_workspace_height + (h)*global_workspace_width + col])
  SET_HYPOTHESIS(0, 0) = prev_param_state.depth;
  SET_HYPOTHESIS(0, 1) = prev_param_state.normal[0];
  SET_HYPOTHESIS(0, 2) = prev_param_state.normal[1];
  SET_HYPOTHESIS(0, 3) = prev_param_state.normal[2];
  SET_HYPOTHESIS(1, 0) = rand_param_state.depth;
  SET_HYPOTHESIS(1, 1) = curr_param_state.normal[0];
  SET_HYPOTHESIS(1, 2) = curr_param_state.normal[1];
  SET_HYPOTHESIS(1, 3) = curr_param_state.normal[2];
  SET_HYPOTHESIS(2, 0) = curr_param_state.depth;
  SET_HYPOTHESIS(2, 1) = rand_param_state.normal[0];
  SET_HYPOTHESIS(2, 2) = rand_param_state.normal[1];
  SET_HYPOTHESIS(2, 3) = rand_param_state.normal[2];
  SET_HYPOTHESIS(3, 0) = rand_param_state.depth;
  SET_HYPOTHESIS(3, 1) = rand_param_state.normal[0];
  SET_HYPOTHESIS(3, 2) = rand_param_state.normal[1];
  SET_HYPOTHESIS(3, 3) = rand_param_state.normal[2];
 #if HYPOTHESIS_NUMBER > 0
  ParamState scaledfilt1_param_state;
  ParamState scaledfilt2_param_state;
  scaledfilt1_param_state.depth = *((__global float*)((__global char*)scaledfilt1_depth_map + depth_map_pitch * row) + col);
  for (int slice = 0; slice < 3; ++slice)
    scaledfilt1_param_state.normal[slice] = *((__global float*)((__global char*)scaledfilt1_normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col);
  scaledfilt2_param_state.depth = *((__global float*)((__global char*)scaledfilt2_depth_map + depth_map_pitch * row) + col);
  for (int slice = 0; slice < 3; ++slice)
    scaledfilt2_param_state.normal[slice] = *((__global float*)((__global char*)scaledfilt2_normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col);
  int hyp = 4;
  SET_HYPOTHESIS(hyp, 0) = (curr_param_state.depth + prev_param_state.depth)/2.0f;
  SET_HYPOTHESIS(hyp, 1) = prev_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = prev_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = prev_param_state.normal[2];
  ++hyp;
  float prev1_depth, prev2_depth;
  if (row < 2) {
    prev1_depth = *(depth_map + col);
    prev2_depth = prev1_depth;
  else {
    prev1_depth = *((__global float*)((__global char*)depth_map + depth_map_pitch * (row-1)) + col);
    prev2_depth = *((__global float*)((__global char*)depth_map + depth_map_pitch * (row-2)) + col);
  SET_HYPOTHESIS(hyp, 0) = 2*prev1_depth - prev2_depth;
  SET_HYPOTHESIS(hyp, 1) = prev_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = prev_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = prev_param_state.normal[2];
  ++hyp;
 #if HYPOTHESIS_NUMBER > 1
  const float fuzzy5 = GenerateRandomDepth(curr_param_state.depth - 0.005f, curr_param_state.depth + 0.005f, rand_state);
  SET_HYPOTHESIS(hyp, 0) = fuzzy5;
  SET_HYPOTHESIS(hyp, 1) = curr_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = curr_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = curr_param_state.normal[2];
  ++hyp;
  SET_HYPOTHESIS(hyp, 0) = fuzzy5;
  SET_HYPOTHESIS(hyp, 1) = scaledfilt1_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = scaledfilt1_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = scaledfilt1_param_state.normal[2];
  ++hyp;
  const float fuzzy10 = GenerateRandomDepth(curr_param_state.depth - 0.01f, curr_param_state.depth + 0.01f, rand_state);
  SET_HYPOTHESIS(hyp, 0) = fuzzy10;
  SET_HYPOTHESIS(hyp, 1) = curr_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = curr_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = curr_param_state.normal[2];
  ++hyp;
  SET_HYPOTHESIS(hyp, 0) = fuzzy10;
  SET_HYPOTHESIS(hyp, 1) = scaledfilt1_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = scaledfilt1_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = scaledfilt1_param_state.normal[2];
  ++hyp;
  const float fuzzy20 = GenerateRandomDepth(curr_param_state.depth - 0.02f, curr_param_state.depth + 0.02f, rand_state);
  SET_HYPOTHESIS(hyp, 0) = fuzzy20;
  SET_HYPOTHESIS(hyp, 1) = curr_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = curr_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = curr_param_state.normal[2];
  ++hyp;
  SET_HYPOTHESIS(hyp, 0) = fuzzy20;
  SET_HYPOTHESIS(hyp, 1) = scaledfilt1_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = scaledfilt1_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = scaledfilt1_param_state.normal[2];
  ++hyp;
  *(rand_state_map + col) = rand_state;
 #endif
  SET_HYPOTHESIS(hyp, 0) = scaledfilt1_param_state.depth;
  SET_HYPOTHESIS(hyp, 1) = scaledfilt1_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = scaledfilt1_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = scaledfilt1_param_state.normal[2];
  ++hyp;
  SET_HYPOTHESIS(hyp, 0) = curr_param_state.depth;
  SET_HYPOTHESIS(hyp, 1) = scaledfilt1_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = scaledfilt1_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = scaledfilt1_param_state.normal[2];
  ++hyp;
  SET_HYPOTHESIS(hyp, 0) = scaledfilt2_param_state.depth;
  SET_HYPOTHESIS(hyp, 1) = scaledfilt2_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = scaledfilt2_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = scaledfilt2_param_state.normal[2];
  ++hyp;
  SET_HYPOTHESIS(hyp, 0) = curr_param_state.depth;
  SET_HYPOTHESIS(hyp, 1) = scaledfilt2_param_state.normal[0];
  SET_HYPOTHESIS(hyp, 2) = scaledfilt2_param_state.normal[1];
  SET_HYPOTHESIS(hyp, 3) = scaledfilt2_param_state.normal[2];
  ++hyp;
  SET_HYPOTHESIS(hyp, 0) = 0.0f;
  SET_HYPOTHESIS(hyp, 1) = 0.0f;
  SET_HYPOTHESIS(hyp, 2) = 0.0f;
  SET_HYPOTHESIS(hyp, 3) = 0.0f;
  ++hyp;
 #endif
 #undef SET_HYPOTHESIS
  float likelihood_computer_cos_min_triangulation_angle_ = cos(min_triangulation_angle);
  float likelihood_computer_inv_incident_angle_sigma_square_ = -0.5f / (incident_angle_sigma * incident_angle_sigma) ;
  float likelihood_computer_inv_ncc_sigma_square_ = -0.5f / (ncc_sigma * ncc_sigma);
  float likelihood_computer_ncc_norm_factor_ = likelihood_computer_ComputeNCCCostNormFactor(ncc_sigma);
  __global float* forward_message = &global_workspace[col * global_workspace_height];
  __global float* sampling_probs = &global_workspace[global_workspace_width *
                                           global_workspace_height +
                                           col * global_workspace_height];
  float point[3];
  ComputePointAtDepth(row, col, curr_param_state.depth, point, ref_inv_K);
  for (int image_idx = 0; image_idx < cost_map_depth; ++image_idx) {
    const float cost = *((__global float*)((__global char*)cost_map + cost_map_pitch * (image_idx * cost_map_height + row)) + col);
    const float alpha = likelihood_computer_ComputeForwardMessage(cost, forward_message[image_idx], likelihood_computer_inv_ncc_sigma_square_, likelihood_computer_ncc_norm_factor_);
    const float beta = *((__global float*)((__global char*)sel_prob_map + sel_prob_map_pitch * (image_idx * sel_prob_map_height + row)) + col);
    const float prev_prob = *((__global float*)((__global char*)prev_sel_prob_map + prev_sel_prob_map_pitch * (image_idx * prev_sel_prob_map_height + row)) + col);
    const float sel_prob = likelihood_computer_ComputeSelProb(alpha, beta, prev_prob, prev_sel_prob_weight);
    float cos_triangulation_angle;
    float cos_incident_angle;
    ComputeViewingAngles(point, curr_param_state.normal, image_idx, cos_triangulation_angle, &cos_incident_angle, poses_images);
    const float tri_prob =
        likelihood_computer_ComputeTriProb(cos_triangulation_angle, likelihood_computer_cos_min_triangulation_angle_);
    const float inc_prob =
        likelihood_computer_ComputeIncProb(cos_incident_angle, likelihood_computer_inv_incident_angle_sigma_square_);
    float H[9];
    ComposeHomography(poses_images, image_idx, row, col, curr_param_state.depth, curr_param_state.normal, H, ref_inv_K);
    const float res_prob =
        likelihood_computer_ComputeResolutionProb(H, row, col);
    sampling_probs[image_idx] = sel_prob * tri_prob * inc_prob * res_prob;
 __constant sampler_t RefImageSampler = CLK_NORMALIZED_COORDS_FALSE |
                                           CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __constant sampler_t SrcImageSampler = CLK_NORMALIZED_COORDS_FALSE |
                                           CLK_ADDRESS_CLAMP | CLK_FILTER_LINEAR;
 #ifdef NO_LOCAL_MEMORY
 __kernel void PccComputeKernel(float spatial_normalization_, float color_normalization_, int row, int width, int workspaceWidth, __global float *hypotheses_workspace, __global float *pcc_costs, __global float *ref_sum_image, int ref_sum_pitch, __global float *ref_squared_sum_image, int ref_squared_sum_pitch, image2d_array_t ref_image, image2d_array_t src_images, image2d_array_t poses_images, __constant float *ref_inv_K)
  const int col = get_global_id(0);
  if (col >= width)
    return;
  const int src_image_idx = get_global_id(1);
  const int num_src_images = get_global_size(1);
  const int hypothesis = get_global_id(2);
  const int num_hypotheses = get_global_size(2);
  float depth = hypotheses_workspace[hypothesis*workspaceWidth + col];
  float normal[3];
  for (int i=0; i<3; ++i)
    normal[i] = hypotheses_workspace[(i+1)*workspaceWidth*num_hypotheses + hypothesis*workspaceWidth + col];
  float tform[9];
  ComposeHomography(poses_images, src_image_idx, row, col, depth, normal, tform, ref_inv_K);
  float x_step[3], y_step[3];
  for (int i=0; i<3; ++i) {
      x_step[i] = KWINDOWSTEP * tform[i*3];
      y_step[i] = KWINDOWSTEP * tform[i*3+1];
    
  const int row_start = row - KWINDOWRADIUS;
  const int col_start = col - KWINDOWRADIUS;
  float col_src = tform[0] * col_start + tform[1] * row_start + tform[2];
  float row_src = tform[3] * col_start + tform[4] * row_start + tform[5];
  float z = tform[6] * col_start + tform[7] * row_start + tform[8];
  float base_col_src = col_src;
  float base_row_src = row_src;
  float base_z = z;
  const float ref_center_color =
      read_imagef(ref_image, RefImageSampler, (int4)(col, row, 0, 0)).x;
  const float ref_color_sum = *((__global float*)((__global char*)ref_sum_image + ref_sum_pitch * row) + col);
  const float ref_color_squared_sum = *((__global float*)((__global char*)ref_squared_sum_image + ref_squared_sum_pitch * row) + col);
  float src_color_sum = 0.0f;
  float src_color_squared_sum = 0.0f;
  float src_ref_color_sum = 0.0f;
  float bilateral_weight_sum = 0.0f;
  for (int r = -KWINDOWRADIUS; r <= KWINDOWRADIUS; r += KWINDOWSTEP) {
    for (int c = -KWINDOWRADIUS; c <= KWINDOWRADIUS; c += KWINDOWSTEP) {
      const float inv_z = 1.0f / z;
      const float norm_col_src = inv_z * col_src + 0.5f;
      const float norm_row_src = inv_z * row_src + 0.5f;
      const float ref_color = read_imagef(ref_image, RefImageSampler, (int4)(col+c, row+r, 0, 0)).x;
      const float src_color = read_imagef(src_images, SrcImageSampler, (float4)(norm_col_src, norm_row_src, src_image_idx, 0)).x;
      const float spatial_dist_squared = r*r + c*c;
      const float color_dist = ref_center_color - ref_color;
      const float bilateral_weight = exp(-spatial_dist_squared * spatial_normalization_ -
                                       color_dist * color_dist * color_normalization_);
      const float bilateral_weight_src = bilateral_weight * src_color;
      src_color_sum += bilateral_weight_src;
      src_color_squared_sum += bilateral_weight_src * src_color;
      src_ref_color_sum += bilateral_weight_src * ref_color;
      bilateral_weight_sum += bilateral_weight;
      col_src += x_step[0];
      row_src += x_step[1];
      z += x_step[2];
    }
    base_col_src += y_step[0];
    base_row_src += y_step[1];
    base_z += y_step[2];
    col_src = base_col_src;
    row_src = base_row_src;
    z = base_z;
  const float inv_bilateral_weight_sum = 1.0f / bilateral_weight_sum;
  src_color_sum *= inv_bilateral_weight_sum;
  src_color_squared_sum *= inv_bilateral_weight_sum;
  src_ref_color_sum *= inv_bilateral_weight_sum;
  const float ref_color_var =
      ref_color_squared_sum - ref_color_sum * ref_color_sum;
  const float src_color_var =
      src_color_squared_sum - src_color_sum * src_color_sum;
  const float kMinVar = 1e-5f;
  const float kMaxCost = 2.0f;
  float retVal;
  if (ref_color_var < kMinVar || src_color_var < kMinVar) {
    retVal = kMaxCost;
  } else {
    const float src_ref_color_covar =
        src_ref_color_sum - ref_color_sum * src_color_sum;
    const float src_ref_color_var = sqrt(ref_color_var * src_color_var);
    retVal = max(0.0f, min(kMaxCost, 1.0f - src_ref_color_covar / src_ref_color_var));
  pcc_costs[workspaceWidth*num_src_images*hypothesis + workspaceWidth*src_image_idx + col] = retVal;
 #else
 #if KWINDOWRADIUS > LOCAL_MEM_WORK_GROUP_SIZE/2
  #define LOCAL_MEM_WORK_GROUP_SHIFT LOCAL_MEM_WORK_GROUP_SIZE
 #else
  #define LOCAL_MEM_WORK_GROUP_SHIFT LOCAL_MEM_WORK_GROUP_SIZE/2
 #endif
 __kernel void PccComputeKernel(float spatial_normalization_, float color_normalization_, int row, int width, int workspaceWidth, __global float *hypotheses_workspace, __global float *pcc_costs, __global float *ref_sum_image, int ref_sum_pitch, __global float *ref_squared_sum_image, int ref_squared_sum_pitch, image2d_array_t ref_image, image2d_array_t src_images, image2d_array_t poses_images, __constant float *ref_inv_K, __local float *local_ref_image)
 {
  const int col = get_global_id(0);
  const int thread = get_local_id(0);
  const bool valid = col < width;
  const int src_image_idx = get_global_id(1);
  const int num_src_images = get_global_size(1);
  const int hypothesis = get_global_id(2);
  const int num_hypotheses = get_global_size(2);
  float col_src, row_src, z;
  float base_col_src, base_row_src, base_z;
  float x_step[3], y_step[3];
  float ref_center_color, ref_color_sum, ref_color_squared_sum;
  float src_color_sum = 0.0f;
  float src_color_squared_sum = 0.0f;
  float src_ref_color_sum = 0.0f;
  float bilateral_weight_sum = 0.0f;
  if (valid) {
    float depth = hypotheses_workspace[hypothesis*workspaceWidth + col];
    float normal[3];
    for (int i=0; i<3; ++i)
      normal[i] = hypotheses_workspace[(i+1)*workspaceWidth*num_hypotheses + hypothesis*workspaceWidth + col];
    float tform[9];
    ComposeHomography(poses_images, src_image_idx, row, col, depth, normal, tform, ref_inv_K);
    for (int i=0; i<3; ++i) {
        x_step[i] = KWINDOWSTEP * tform[i*3];
        y_step[i] = KWINDOWSTEP * tform[i*3+1];
    }
    const int row_start = row - KWINDOWRADIUS;
    const int col_start = col - KWINDOWRADIUS;
    col_src = tform[0] * col_start + tform[1] * row_start + tform[2];
    row_src = tform[3] * col_start + tform[4] * row_start + tform[5];
    z = tform[6] * col_start + tform[7] * row_start + tform[8];
    base_col_src = col_src;
    base_row_src = row_src;
    base_z = z;
    ref_center_color = read_imagef(ref_image, RefImageSampler, (int4)(col, row, 0, 0)).x;
    ref_color_sum = *((__global float*)((__global char*)ref_sum_image + ref_sum_pitch * row) + col);
    ref_color_squared_sum = *((__global float*)((__global char*)ref_squared_sum_image + ref_squared_sum_pitch * row) + col);
  }
  for (int r = -KWINDOWRADIUS; r <= KWINDOWRADIUS; r += KWINDOWSTEP) {
    barrier(CLK_LOCAL_MEM_FENCE);
    local_ref_image[thread] =
      read_imagef(ref_image, RefImageSampler, (int4)(col-LOCAL_MEM_WORK_GROUP_SHIFT, row+r, 0, 0)).x;
    local_ref_image[thread+2*LOCAL_MEM_WORK_GROUP_SHIFT] =
      read_imagef(ref_image, RefImageSampler, (int4)(col+LOCAL_MEM_WORK_GROUP_SHIFT, row+r, 0, 0)).x;
 #if KWINDOWRADIUS > LOCAL_MEM_WORK_GROUP_SIZE/2
    local_ref_image[thread+LOCAL_MEM_WORK_GROUP_SIZE] =
      read_imagef(ref_image, RefImageSampler, (int4)(col, row+r, 0, 0)).x;
 #endif
    barrier(CLK_LOCAL_MEM_FENCE);
    if (valid) {
      for (int c = -KWINDOWRADIUS; c <= KWINDOWRADIUS; c += KWINDOWSTEP) {
        const float inv_z = 1.0f / z;
        const float norm_col_src = inv_z * col_src + 0.5f;
        const float norm_row_src = inv_z * row_src + 0.5f;
        const float ref_color = local_ref_image[thread+LOCAL_MEM_WORK_GROUP_SHIFT+c];
        const float src_color = read_imagef(src_images, SrcImageSampler, (float4)(norm_col_src, norm_row_src, src_image_idx, 0)).x;
        const float spatial_dist_squared = r*r + c*c;
        const float color_dist = ref_center_color - ref_color;
        const float bilateral_weight = exp(-spatial_dist_squared * spatial_normalization_ - color_dist * color_dist * color_normalization_);
        const float bilateral_weight_src = bilateral_weight * src_color;
        src_color_sum += bilateral_weight_src;
        src_color_squared_sum += bilateral_weight_src * src_color;
        src_ref_color_sum += bilateral_weight_src * ref_color;
        bilateral_weight_sum += bilateral_weight;
        col_src += x_step[0];
        row_src += x_step[1];
        z += x_step[2];
      }
      base_col_src += y_step[0];
      base_row_src += y_step[1];
      base_z += y_step[2];
      col_src = base_col_src;
      row_src = base_row_src;
      z = base_z;
    }
  }
  if (!valid)
    return;
  const float inv_bilateral_weight_sum = 1.0f / bilateral_weight_sum;
  src_color_sum *= inv_bilateral_weight_sum;
  src_color_squared_sum *= inv_bilateral_weight_sum;
  src_ref_color_sum *= inv_bilateral_weight_sum;
  const float ref_color_var =
      ref_color_squared_sum - ref_color_sum * ref_color_sum;
  const float src_color_var =
      src_color_squared_sum - src_color_sum * src_color_sum;
  const float kMinVar = 1e-5f;
  const float kMaxCost = 2.0f;
  float retVal;
  if (ref_color_var < kMinVar || src_color_var < kMinVar) {
    retVal = kMaxCost;
  } else {
    const float src_ref_color_covar =
        src_ref_color_sum - ref_color_sum * src_color_sum;
    const float src_ref_color_var = sqrt(ref_color_var * src_color_var);
    retVal = max(0.0f, min(kMaxCost, 1.0f - src_ref_color_covar / src_ref_color_var));
  }
  pcc_costs[workspaceWidth*num_src_images*hypothesis + workspaceWidth*src_image_idx + col] = retVal;
 }
 #endif
 __kernel void MinimumCostKernel(int row, int width, int num_src_images, int num_hypotheses, int kGeomConsistencyTerm, __global float *global_workspace, int workspaceWidth, __global float *pcc_costs, __global float *hypotheses_workspace, __global float *cost_map, int cost_map_height, int cost_map_pitch, __global float *depth_map, int depth_map_pitch, __global float *normal_map, int normal_map_height, int normal_map_pitch, __global float *sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, __global float *prev_sel_prob_map, int prev_sel_prob_map_height, int prev_sel_prob_map_pitch, float prev_sel_prob_weight, float ncc_sigma, float geom_consistency_regularizer, float geom_consistency_max_cost, image2d_array_t src_depth_map_images, image2d_array_t poses_images, __constant float *ref_inv_K,
 #ifdef HYPOTHESES_STATISTICS
                                __constant float *ref_K, __global unsigned int *hypotheses_stats)
 #else
                                __constant float *ref_K)
 #endif
 {
  const int col = get_global_id(0);
  if (col >= width)
    return;
  float likelihood_computer_inv_ncc_sigma_square_ = -0.5f / (ncc_sigma * ncc_sigma);
  float likelihood_computer_ncc_norm_factor_ = likelihood_computer_ComputeNCCCostNormFactor(ncc_sigma);
  __global float* forward_message = &global_workspace[col * num_src_images];
  __global float* sampling_probs = &global_workspace[workspaceWidth * num_src_images + col * num_src_images];
  float best_cost = 0.0f;
  int min_cost_idx = 0;
  for (int a=0; a<num_src_images; ++a) {
    const float sampling_weight = sampling_probs[a];
    best_cost += *((__global float*)((__global char*)cost_map + cost_map_pitch * (a * cost_map_height + row)) + col) * sampling_weight;
    if (kGeomConsistencyTerm) {
      float currDepth = *((__global float*)((__global char*)depth_map + depth_map_pitch * row) + col);
      best_cost += geom_consistency_regularizer *
                  ComputeGeomConsistencyCost(row, col, currDepth, a, geom_consistency_max_cost, poses_images, src_depth_map_images, ref_inv_K, ref_K) * sampling_weight;
    }
  }
  for (int i=1; i<num_hypotheses; ++i) {
    float hyp_cost = 0.0f;
    for (int a=0; a<num_src_images; ++a) {
      const float sampling_weight = sampling_probs[a];
      hyp_cost += pcc_costs[workspaceWidth*num_src_images*(i-1) + workspaceWidth*a + col] * sampling_weight;
      if (kGeomConsistencyTerm) {
        float hypDepth = hypotheses_workspace[(i-1)*workspaceWidth + col];
        hyp_cost += geom_consistency_regularizer *
                    ComputeGeomConsistencyCost(row, col, hypDepth, a, geom_consistency_max_cost, poses_images, src_depth_map_images, ref_inv_K, ref_K) * sampling_weight;
      }
    }
    if (hyp_cost <= best_cost) {
       best_cost = hyp_cost;
       min_cost_idx = i;
    }
  }
 #ifdef HYPOTHESES_STATISTICS
  hypotheses_stats[workspaceWidth*min_cost_idx + col] += 1;
 #endif
  if (min_cost_idx > 0) {
    *((__global float*)((__global char*)depth_map + depth_map_pitch * row) + col) = hypotheses_workspace[(min_cost_idx-1)*workspaceWidth + col];
    for (size_t slice = 0; slice < 3; ++slice) {
      *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col) =
           hypotheses_workspace[(slice+1)*workspaceWidth*(num_hypotheses-1) + (min_cost_idx-1)*workspaceWidth + col];
    }
  }
  for (int image_idx = 0; image_idx < num_src_images; ++image_idx)
  {
    float cost;
    if (min_cost_idx == 0) {
      cost = *((__global float*)((__global char*)cost_map + cost_map_pitch * (image_idx * cost_map_height + row)) + col);
    }
    else {
      cost = pcc_costs[workspaceWidth*num_src_images*(min_cost_idx-1) + workspaceWidth*image_idx + col];
      *((__global float*)((__global char*)cost_map + cost_map_pitch * (image_idx * cost_map_height + row)) + col) = cost;
    }
    const float alpha = likelihood_computer_ComputeForwardMessage(cost, forward_message[image_idx], likelihood_computer_inv_ncc_sigma_square_, likelihood_computer_ncc_norm_factor_);
    const float beta = *((__global float*)((__global char*)sel_prob_map + sel_prob_map_pitch * (image_idx * sel_prob_map_height + row)) + col);
    const float prev_prob = *((__global float*)((__global char*)prev_sel_prob_map + prev_sel_prob_map_pitch * (image_idx * prev_sel_prob_map_height + row)) + col);
    const float prob = likelihood_computer_ComputeSelProb(alpha, beta, prev_prob, prev_sel_prob_weight);
    forward_message[image_idx] = alpha;
    *((__global float*)((__global char*)sel_prob_map + sel_prob_map_pitch * (image_idx * sel_prob_map_height + row)) + col) = prob;
  }
 }
 __kernel void ConsistencyFilterKernel(int width, __global float *depth_map, __global float *normal_map, __global float *sel_prob_map, int map_pitch, int map_height, int num_src_images, int kFilterConsistencyMode, __global uchar *consistency_mask, int consistency_mask_pitch, image2d_array_t src_depth_map_images, image2d_array_t poses_images, __constant float *ref_inv_K, __constant float *ref_K, float ncc_sigma, float filter_min_ncc, float filter_min_triangulation_angle, float geom_consistency_max_cost, float filter_geom_consistency_max_cost, int filter_min_num_consistent)
 {
  const int col = get_global_id(0);
  const int row = get_global_id(1);
  if (col >= width)
    return;
  int num_consistent = 0;
  const float best_depth = *((__global float*)((__global char*)depth_map + map_pitch * row) + col);
  float best_normal[3];
  for (int i=0; i<3; ++i)
    best_normal[i] = *((__global float*)((__global char*)normal_map + map_pitch * (i * map_height + row)) + col);
  float best_point[3];
  ComputePointAtDepth(row, col, best_depth, best_point, ref_inv_K);
  const float likelihood_computer_inv_ncc_sigma_square_ = -0.5f / (ncc_sigma * ncc_sigma);
  const float likelihood_computer_ncc_norm_factor_ = likelihood_computer_ComputeNCCCostNormFactor(ncc_sigma);
  const float min_ncc_prob = 
      likelihood_computer_ComputeNCCProb(1.0f - filter_min_ncc, likelihood_computer_inv_ncc_sigma_square_, likelihood_computer_ncc_norm_factor_);
  const float cos_min_triangulation_angle =
      cos(filter_min_triangulation_angle);
  for (int image_idx = 0; image_idx < num_src_images; ++image_idx) {
    float cos_triangulation_angle;
    float cos_incident_angle;
    ComputeViewingAngles(best_point, best_normal, image_idx, cos_triangulation_angle, &cos_incident_angle, poses_images);
    if (cos_triangulation_angle > cos_min_triangulation_angle ||
        cos_incident_angle <= 0.0f) {
      continue;
    }
    if (!(kFilterConsistencyMode & 0x02)) {
      if (*((__global float*)((__global char*)sel_prob_map + map_pitch * (image_idx * map_height + row)) + col) >= min_ncc_prob) {
         *(consistency_mask + consistency_mask_pitch * (image_idx * map_height + row) + col) = 1;
         num_consistent += 1;
      }
    } else if (!(kFilterConsistencyMode & 0x01)) {
      if (ComputeGeomConsistencyCost(row, col, best_depth, image_idx, geom_consistency_max_cost, poses_images, src_depth_map_images, ref_inv_K, ref_K) <= filter_geom_consistency_max_cost) {
        *(consistency_mask + consistency_mask_pitch * (image_idx * map_height + row) + col) = 1;
        num_consistent += 1;
      }
    } else {
      if (*((__global float*)((__global char*)sel_prob_map + map_pitch * (image_idx * map_height + row)) + col) >= min_ncc_prob &&
          ComputeGeomConsistencyCost(row, col, best_depth, image_idx, geom_consistency_max_cost, poses_images, src_depth_map_images, ref_inv_K, ref_K) <= filter_geom_consistency_max_cost) {
        *(consistency_mask + consistency_mask_pitch * (image_idx * map_height + row) + col) = 1;
        num_consistent += 1;
      }
    }
  }
  if (num_consistent < filter_min_num_consistent) {
    const float kFilterValue = 0.0f;
    *((__global float*)((__global char*)depth_map + map_pitch * row) + col) = kFilterValue;
    *((__global float*)((__global char*)normal_map + map_pitch * (0 * map_height + row)) + col) = kFilterValue;
    *((__global float*)((__global char*)normal_map + map_pitch * (1 * map_height + row)) + col) = kFilterValue;
    *((__global float*)((__global char*)normal_map + map_pitch * (2 * map_height + row)) + col) = kFilterValue;
    for (int image_idx = 0; image_idx < num_src_images; ++image_idx) {
      *(consistency_mask + consistency_mask_pitch * (image_idx * map_height + row) + col) = 0;
    }
  }
 }
 #ifdef NO_LOCAL_MEMORY
 __kernel void RotateFloatKernel(__global float *input_data, int input_offset, __global float *output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch)
 {
  int input_x = get_global_id(0);
  int input_y = get_global_id(1);
  if (input_x >= width || input_y >= height) {
    return;
  int output_x = input_y;
  int output_y = width - 1 - input_x;
  *((__global float *)((__global uchar *)output_data + output_offset + output_y * output_pitch) + output_x) =
      *((__global float *)((__global uchar *)input_data + input_offset + input_y * input_pitch) + input_x);
 }
 __kernel void RotateUint8Kernel(__global uchar *input_data, int input_offset, __global uchar *output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch)
 {
  int input_x = get_global_id(0);
  int input_y = get_global_id(1);
  if (input_x < width && input_y < height) {
     int output_x = input_y;
     int output_y = width - 1 - input_x;
     *(output_data + output_offset + output_y * output_pitch + output_x) =
             *(input_data + input_offset + input_y * input_pitch + input_x);
  }
 }
 #else
 #define ROTATE_TILE_DIM 16
 #define ROTATE_WORKGROUP_ROWS 4
 __kernel void RotateFloatKernel(__global float * input_data, int input_offset, __global float * output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch)
 {
  __local float tile[ROTATE_TILE_DIM][ROTATE_TILE_DIM+1];
  int input_x = get_global_id(0);
  int x = get_local_id(0);
  int y = get_local_id(1);
  int yp = get_group_id(1) * ROTATE_TILE_DIM + y;
  for (int j=0; j < ROTATE_TILE_DIM; j += ROTATE_WORKGROUP_ROWS)
    if (input_x < width && yp+j < height)
      tile[y+j][x] = *((__global float *)((__global uchar *)input_data + input_offset + (yp + j) * input_pitch) + input_x);
  barrier(CLK_LOCAL_MEM_FENCE);
  int output_y = width - 1 - (input_x/ROTATE_TILE_DIM * ROTATE_TILE_DIM) - y;
  for (int j=0; j < ROTATE_TILE_DIM; j += ROTATE_WORKGROUP_ROWS)
    int output_x = ((yp+j)/ROTATE_TILE_DIM)*ROTATE_TILE_DIM + x;
    if (output_x < height && output_y-j >= 0)
      *((__global float *)((__global uchar *)output_data + output_offset + (output_y - j) * output_pitch) + output_x) = tile[x][y+j];
 }
 __kernel void RotateUint8Kernel(__global uchar *input_data, int input_offset, __global uchar *output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch)
 {
  __local uchar tile[ROTATE_TILE_DIM][ROTATE_TILE_DIM+1];
  int input_x = get_global_id(0);
  int x = get_local_id(0);
  int y = get_local_id(1);
  int yp = get_group_id(1) * ROTATE_TILE_DIM + y;
  for (int j=0; j < ROTATE_TILE_DIM; j += ROTATE_WORKGROUP_ROWS)
    if (input_x < width && yp+j < height)
      tile[y+j][x] = *(input_data + input_offset + (yp + j) * input_pitch + input_x);
  barrier(CLK_LOCAL_MEM_FENCE);
  int output_y = width - 1 - (input_x/ROTATE_TILE_DIM * ROTATE_TILE_DIM) - y;
  for (int j=0; j < ROTATE_TILE_DIM; j += ROTATE_WORKGROUP_ROWS)
  {
    int output_x = ((yp+j)/ROTATE_TILE_DIM)*ROTATE_TILE_DIM + x;
    if (output_x < height && output_y-j >= 0)
      *(output_data + output_offset + (output_y - j) * output_pitch + output_x) = tile[x][y+j];
  }
 }
 #endif
 __kernel void FillWithRandomNumbersKernel(__global float* output, __global clrandState * rand_state_map, float min_value, float max_value, int random_state_width, int random_state_height, int random_state_pitch, int output_height, int output_depth, int output_pitch) {
  const uint row = get_global_id(1);
  const uint col = get_global_id(0);
  if (col < random_state_width && row < random_state_height) {
     clrandState rand_state = *((__global clrandState*)((__global char*)rand_state_map + random_state_pitch * row) + col);
     for (size_t slice = 0; slice < output_depth; ++slice) {
        float random_value = clrand_uniform(&rand_state) * (max_value - min_value) + min_value;
        *((__global float*)((__global char*)output + output_pitch * (slice * output_height + row)) + col) = random_value;
     }
     *((__global clrandState*)((__global char*)rand_state_map + random_state_pitch * row) + col) = rand_state;
  }
 }
 __kernel void InitNormalMapKernel(__global float* normal_map, __global clrandState * rand_state_map, int normal_map_width, int normal_map_height, int normal_map_depth, int normal_map_pitch, int rand_state_map_pitch, __constant float * ref_inv_K) {
  const uint row = get_global_id(1);
  const uint col = get_global_id(0);
  if (col < normal_map_width && row < normal_map_height) {
    clrandState rand_state = *((__global clrandState*)((__global char*)rand_state_map + rand_state_map_pitch * row) + col);
    float normal[3];
    GenerateRandomNormal(row, col, &rand_state, normal, ref_inv_K);
    for (size_t slice = 0; slice < normal_map_depth; ++slice) {
      *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col) = normal[slice];
    }
    *((__global clrandState*)((__global char*)rand_state_map + rand_state_map_pitch * row) + col) = rand_state;
 }
 __constant sampler_t FilterSampler = CLK_NORMALIZED_COORDS_FALSE |
                                     CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
 __kernel void FilterKernel(image2d_array_t image_array, __global uchar *image, int pitch_image, __global float *sum_image, int pitch_sum_image, __global float *squared_sum_image, int pitch_squared_sum_image, int window_radius, int window_step, float sigma_spatial, float sigma_color) {
  const int row = get_global_id(1);
  const int col = get_global_id(0);
  const float spatial_normalization_ = (1.0f / (2.0f * sigma_spatial * sigma_spatial));
  const float color_normalization_ = (1.0f / (2.0f * sigma_color * sigma_color));
  const float center_color = read_imagef(image_array, FilterSampler, int4)(col, row, 0, 0)).x;
  float color_sum = 0.0f;
  float color_squared_sum = 0.0f;
  float bilateral_weight_sum = 0.0f;
  for (int window_row = -window_radius; window_row <= window_radius;
       window_row += window_step) {
    for (int window_col = -window_radius; window_col <= window_radius;
         window_col += window_step) {
      const float color = read_imagef(image_array, FilterSampler, int4)(col + window_col, row + window_row, 0, 0)).x;
      const float spatial_dist_squared = window_row * window_row + window_col * window_col;
      const float color_dist = center_color - color;
      const float bilateral_weight = exp(-spatial_dist_squared * spatial_normalization_ -
                                         color_dist * color_dist * color_normalization_);
      color_sum += bilateral_weight * color;
      color_squared_sum += bilateral_weight * color * color;
      bilateral_weight_sum += bilateral_weight;
    }
  }
  color_sum /= bilateral_weight_sum;
  color_squared_sum /= bilateral_weight_sum;
  *(image + pitch_image * row + col) = (uchar)(255.0f * center_color);
  *((__global float*)((__global char*)sum_image + pitch_sum_image * row) + col) = color_sum;
  *((__global float*)((__global char*)squared_sum_image + pitch_squared_sum_image * row) + col) = color_squared_sum;
 }
 __kernel void RotateNormalMapKernel(__global float * normal_map, int normal_map_width, int normal_map_height, int normal_map_depth, int normal_map_pitch) {
  const int row = get_global_id(1);
  const int col = get_global_id(0);
  if (col < normal_map_width && row < normal_map_height) {
    float normal[3];
    for (size_t slice = 0; slice < normal_map_depth; ++slice) {
      normal[slice] = *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col);
    }
    float rotated_normal[3];
    rotated_normal[0] = normal[1];
    rotated_normal[1] = -normal[0];
    rotated_normal[2] = normal[2];
    for (size_t slice = 0; slice < normal_map_depth; ++slice) {
      *((__global float*)((__global char*)normal_map + normal_map_pitch * (slice * normal_map_height + row)) + col) = rotated_normal[slice];
    }
  }
 }
 __kernel void CopyFromGpuMatCLKernel(__global uchar * input_array, __write_only image2d_array_t output_array, int pitch, int width, int height) {
  const int row = get_global_id(1);
  const int col = get_global_id(0);
  int4 pos = (int4)(col, row, 0, 0);
  float4 pix = (float4)((*(input_array + pitch * row + col))/255.0f);
  if (col < width && row < height) {
    write_imagef(output_array, pos, pix);
  }
 }
diff --git a/kernels.h b/kernels.h
 __kernel void FindBestMatchOneWay_Kernel(__global int *result, __global VECTOR_TYPE *des1, __global VECTOR_TYPE *des2, int num1, int num2, float distmax, float ratiomax);
 __kernel void FindGuidedBestMatchOneWay_Kernel(__global int *result, __global VECTOR_TYPE *des1, __global VECTOR_TYPE *des2, __global float *loc1, __global float *loc2, int num1, int num2, int guided,    // 0 = Fundamental matrix, first pass;
 __kernel void ComputeInitialCostKernel(__global float *cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float *depth_map, int depth_map_pitch, __global float *normal_map, int normal_map_height, int normal_map_depth, int normal_map_pitch, __global float *ref_sum_image, int ref_sum_pitch, __global float *ref_squared_sum_image, int ref_squared_sum_pitch, float sigma_spatial, float sigma_color, image2d_array_t ref_image, ;
 __kernel void InitRandomStateKernel(__global clrandState *output, int width, int height, uint pitch);
 __kernel void ComputeBackwardMessagesKernel(__global float *global_workspace, int global_workspace_height, __global float *cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float *sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, float ncc_sigma);
 __kernel void HypothesesAndProbsKernel(int row, __global float *global_workspace, int global_workspace_width, int global_workspace_height, __global float *hypotheses_workspace, int hypotheses_workspace_height, __global clrandState *rand_state_map, __global float *cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float *depth_map, int depth_map_pitch, __global float *normal_map, int normal_map_height, int normal_map_pitch, __global float *sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, __global float *prev_sel_prob_map, int prev_sel_prob_map_height, int prev_sel_prob_map_pitch, float min_triangulation_angle, float incident_angle_sigma, float ncc_sigma, float perturbation, float prev_sel_prob_weight, image2d_array_t poses_images, ;
 __kernel void PccComputeKernel(float spatial_normalization_, float color_normalization_, int row, int width, int workspaceWidth, __global float *hypotheses_workspace, __global float *pcc_costs, __global float *ref_sum_image, int ref_sum_pitch, __global float *ref_squared_sum_image, int ref_squared_sum_pitch, image2d_array_t ref_image, image2d_array_t src_images, ;
 __kernel void PccComputeKernel(float spatial_normalization_, float color_normalization_, int row, int width, int workspaceWidth, __global float *hypotheses_workspace, __global float *pcc_costs, __global float *ref_sum_image, int ref_sum_pitch, __global float *ref_squared_sum_image, int ref_squared_sum_pitch, image2d_array_t ref_image, image2d_array_t src_images, ;
 __kernel void MinimumCostKernel(int row, int width, int num_src_images, int num_hypotheses, int kGeomConsistencyTerm, __global float *global_workspace, int workspaceWidth, __global float *pcc_costs, __global float *hypotheses_workspace, __global float *cost_map, int cost_map_height, int cost_map_pitch, __global float *depth_map, int depth_map_pitch, __global float *normal_map, int normal_map_height, int normal_map_pitch, __global float *sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, __global float *prev_sel_prob_map, int prev_sel_prob_map_height, int prev_sel_prob_map_pitch, float prev_sel_prob_weight, float ncc_sigma, float geom_consistency_regularizer, float geom_consistency_max_cost, image2d_array_t src_depth_map_images, image2d_array_t poses_images, __constant float *ref_inv_K, ifdef HYPOTHESES_STATISTICS;
 __kernel void ConsistencyFilterKernel(int width, __global float *depth_map, __global float *normal_map, __global float *sel_prob_map, int map_pitch, int map_height, int num_src_images, int kFilterConsistencyMode, __global uchar *consistency_mask, int consistency_mask_pitch, image2d_array_t src_depth_map_images, image2d_array_t poses_images, __constant float *ref_inv_K, __constant float *ref_K, float ncc_sigma, float filter_min_ncc, float filter_min_triangulation_angle, float geom_consistency_max_cost, float filter_geom_consistency_max_cost, int filter_min_num_consistent);
 __kernel void RotateFloatKernel(__global float *input_data, int input_offset, __global float *output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
 __kernel void RotateUint8Kernel(__global uchar *input_data, int input_offset, __global uchar *output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
 __kernel void RotateFloatKernel(__global float * input_data, int input_offset, __global float * output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
 __kernel void RotateUint8Kernel(__global uchar *input_data, int input_offset, __global uchar *output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
 __kernel void FillWithRandomNumbersKernel(__global float* output, __global clrandState * rand_state_map, float min_value, float max_value, int random_state_width, int random_state_height, int random_state_pitch, int output_height, int output_depth, int output_pitch) {;
 __kernel void InitNormalMapKernel(__global float* normal_map, __global clrandState * rand_state_map, int normal_map_width, int normal_map_height, ;
 __kernel void FilterKernel(image2d_array_t image_array, __global uchar *image, int pitch_image, __global float *sum_image, int pitch_sum_image, __global float *squared_sum_image, int pitch_squared_sum_image, int window_radius, int window_step, float sigma_spatial, float sigma_color) {;
 __kernel void RotateNormalMapKernel(__global float * normal_map, int normal_map_width, int normal_map_height, int normal_map_depth, int normal_map_pitch) {;
 __kernel void CopyFromGpuMatCLKernel(__global uchar * input_array, __write_only image2d_array_t output_array, int pitch, int width, int height) {;
diff --git a/zz - fragments.glsl b/zz - fragments.glsl
 #version 150
 uniform mat4 u_pmv_matrix;
 in vec3 a_pos;
 in vec4 a_color;
 out vec4 v_pos;
 out vec4 v_color;
 void main(void) {
  v_pos = u_pmv_matrix * vec4(a_pos, 1);
  v_color = a_color;
 [#version 150
 in vec4 v_color;
 out vec4 f_color;
 void main(void) {
  f_color = v_color;
 [#version 150
 in vec4 v_color;
 out vec4 f_color;
 void main(void) {
  f_color = v_color;
 #version 150
 layout(lines) in;
 layout(triangle_strip, max_vertices = 4) out;
 uniform vec2 u_inv_viewport;
 uniform float u_line_width;
 in vec4 v_pos[2];
 in vec4 v_color[2];
 out vec4 g_color;
 void main() {
  vec2 dir = normalize(v_pos[1].xy / v_pos[1].w - v_pos[0].xy / v_pos[0].w);
  vec2 normal_dir = vec2(-dir.y, dir.x);
  vec2 offset = (vec2(u_line_width) * u_inv_viewport) * normal_dir;
  gl_Position = vec4(v_pos[0].xy + offset * v_pos[0].w, v_pos[0].z, v_pos[0].w);
  g_color = v_color[0];
  EmitVertex();
  gl_Position = vec4(v_pos[1].xy + offset * v_pos[1].w, v_pos[1].z, v_pos[1].w);
  g_color = v_color[1];
  EmitVertex();
  gl_Position = vec4(v_pos[0].xy - offset * v_pos[0].w, v_pos[0].z, v_pos[0].w);
  g_color = v_color[0];
  EmitVertex();
  gl_Position = vec4(v_pos[1].xy - offset * v_pos[1].w, v_pos[1].z, v_pos[1].w);
  g_color = v_color[1];
  EmitVertex();
  EndPrimitive();
 #version 150
 uniform float u_point_size;
 uniform mat4 u_pmv_matrix;
 in vec3 a_position;
 in vec4 a_color;
 out vec4 v_color;
 void main(void) {
  gl_Position = u_pmv_matrix * vec4(a_position, 1);
  gl_PointSize = u_point_size;
  v_color = a_color;
 #version 150
 uniform mat4 u_pmv_matrix;
 in vec3 a_position;
 in vec4 a_color;
 out vec4 v_color;
 void main(void) {
  gl_Position = u_pmv_matrix * vec4(a_position, 1);
  v_color = a_color;
 [#version 150
 in vec4 g_color;
 out vec4 f_color;
 void main(void) {
  f_color = g_color;





 #pragma optionNV(ifcvt none)
 #pragma optionNV(unroll all)
   vec4 val = vec4(0.0, 0.0, 0.0, 0.0), data1, buf;
   vec2 index = gl_FragCoord.yx; 
   vec2 stripe_size = size.xy * SIFT_PER_STRIPE;
 vec2 temp_div1 = index / stripe_size;
   vec2 stripe_index = floor(temp_div1);
   index = floor(stripe_size * (temp_div1 - stripe_index));
 vec2 temp_div2 = index * vec2(1.0 / float(SIFT_PER_STRIPE));
 vec2 temp_floor2 = floor(temp_div2);
   vec2 index_v = temp_floor2 + vec2(0.5);
    vec2 index_h = vec2(SIFT_PER_STRIPE)* (temp_div2 - temp_floor2);
   vec2 tx = (index_h + stripe_index * vec2(SIFT_PER_STRIPE))* vec2(PIXEL_PER_SIFT) + 0.5;
   vec2 tpos1, tpos2; 
 vec4 tpos = vec4(tx, index_v);
   for(int i = 0; i < PIXEL_PER_SIFT; ++i){
 buf = texture2DRect(tex2, tpos.yw);
 data1 = texture2DRect(tex1, tpos.xz);
 val += (data1 * buf);
 tpos.xy = tpos.xy + vec2(1.0, 1.0);
 const float factor = 0.248050689697265625; 
 gl_FragColor =vec4(dot(val, vec4(factor)), index,  0);
 uniform sampler2DRect tex1, tex2; uniform vec2
 size;
 void main()
 #define PIXEL_PER_SIFT 
 #define SIFT_PER_STRIPE 
 tex1
 tex2
   vec4 val = vec4(0.0, 0.0, 0.0, 0.0), data1, buf;
   vec2 index = gl_FragCoord.yx; 
   vec2 stripe_size = size.xy * SIFT_PER_STRIPE;
 vec2 temp_div1 = index / stripe_size;
   vec2 stripe_index = floor(temp_div1);
   index = floor(stripe_size * (temp_div1 - stripe_index));
 vec2 temp_div2 = index  * vec2(1.0/ float(SIFT_PER_STRIPE));
 vec2 temp_floor2 = floor(temp_div2);
   vec2 index_v = temp_floor2 + vec2(0.5);
    vec2 index_h = vec2(SIFT_PER_STRIPE)* (temp_div2 - temp_floor2);
   vec4 tlpos = vec4((index_h + stripe_index * vec2(SIFT_PER_STRIPE)) + 0.5, index_v);
   vec3 loc1 = vec3(texture2DRect(texL1, tlpos.xz).xw, 1.0);
   vec3 loc2 = vec3(texture2DRect(texL2, tlpos.yw).xw, 1.0);
   vec3 hxloc1 = H* loc1;
   vec2 diff = loc2.xy- (hxloc1.xy/hxloc1.z);
   float disth = diff.x * diff.x + diff.y * diff.y;
   if(disth > size.z ) {gl_FragColor = vec4(0.0, index, 0.0); return;}
   vec3 fx1 = (F * loc1), ftx2 = (loc2 * F);
   float x2tfx1 = dot(loc2, fx1);
   vec4 temp = vec4(fx1.xy, ftx2.xy); 
   float sampson_error = (x2tfx1 * x2tfx1) / dot(temp, temp);
   if(sampson_error > size.w) {gl_FragColor = vec4(0.0, index, 0.0); return;}
   vec2 tx = (index_h + stripe_index * SIFT_PER_STRIPE)* vec2(PIXEL_PER_SIFT) + 0.5;
   vec2 tpos1, tpos2; 
 vec4 tpos = vec4(tx, index_v);
   for(int i = 0; i < PIXEL_PER_SIFT; ++i){
 buf = texture2DRect(tex2, tpos.yw);
 data1 = texture2DRect(tex1, tpos.xz);
 val += data1 * buf;
 tpos.xy = tpos.xy + vec2(1.0, 1.0);
 const float factor = 0.248050689697265625; 
 gl_FragColor =vec4(dot(val, vec4(factor)), index,  0.0);
 uniform sampler2DRect tex1, tex2;
 uniform sampler2DRect texL1;
 uniform sampler2DRect texL2; 
 uniform mat3 H; 
 uniform mat3 F; 
 uniform vec4
 size; 
 void main()
 texL1
 texL2
 #define BLOCK_WIDTH 16.0
 uniform sampler2DRect tex;
 uniform vec3 param;
 void main ()
 float index = gl_FragCoord.x + floor(gl_FragCoord.y) * BLOCK_WIDTH; 
 vec2 bestv = vec2(-1.0); float imax = -1.0;
 for(float i = 0.0; i < param.x; i ++){
 float v = texture2DRect(tex, vec2(i + 0.5, index)).r; 
 imax = v > bestv.r ? i : imax; 
 bestv  = v > bestv.r? vec2(v, bestv.r) : max(bestv, vec2(v));
 bestv = acos(min(bestv, 1.0));
 if(bestv.x >= param.y || bestv.x >= param.z * bestv.y) imax = -1.0;
 gl_FragColor = vec4(imax, bestv, index);
 param
 #define BLOCK_WIDTH 16.0
 uniform sampler2DRect tex; uniform vec3 param;
 void main ()
 float index = gl_FragCoord.x + floor(gl_FragCoord.y) * BLOCK_WIDTH; 
 vec2 bestv = vec2(-1.0); float imax = -1.0;
 for(float i = 0.0; i < param.x; i ++){
 float v = texture2DRect(tex, vec2(index, i + 0.5)).r; 
 imax = (v > bestv.r)? i : imax; 
 bestv  = v > bestv.r? vec2(v, bestv.r) : max(bestv, vec2(v));
 bestv = acos(min(bestv, 1.0));
 if(bestv.x >= param.y || bestv.x >= param.z * bestv.y) imax = -1.0;
 gl_FragColor = vec4(imax, bestv, index);
 Project for VS2005+ or set siftgpu_enable_cuda to 1 in makefile



 uniform sampler2DRect tex;
 void main(void){ float intensity = 0.0 ;  vec2 pos;
 float or = texture2DRect(tex, gl_TexCoord[0].st).r;
 intensity+= or * 
 ) , 0);
 pos = gl_TexCoord[0].st + vec2(float(texture2DRect(tex, pos).r;
 intensity+= 
 gl_FragColor.r = or;
 gl_FragColor.b  = intensity;}
 void main(void){ float intensity = 0.0;vec2 pos; 
 vec2 orb = texture2DRect(tex, gl_TexCoord[0].st).rb;
 intensity+= orb.y * 
 ) );
 pos = gl_TexCoord[0].st + vec2(0, float(intensity+= texture2DRect(tex, pos).b * 
 gl_FragColor.b = orb.y;
 gl_FragColor.g = intensity - orb.x;
 gl_FragColor.r = intensity;}
 void main(void){ vec4 result = vec4(0, 0, 0, 0);
 vec4 pc; vec2 coord; 
 ),0);
 coord = gl_TexCoord[0].xy + vec2(float(pc=texture2DRect(tex, coord);
 if(coord.x < 0.0) pc = pc.rrbb;
 )*pc.grab;
 result += vec4()*pc.rrbb;
 )*pc.ggaa;
 gl_FragColor = result;}
 vec4 pc; vec2 coord;
 coord = gl_TexCoord[0].xy + vec2(0, float(if(coord.y < 0.0) pc = pc.rgrg;
 )*pc.barg;
 )*pc.rgrg;
 )*pc.baba;
 uniform sampler2DRect tex; void main(void){
 float intensity = dot(vec3(0.299, 0.587, 0.114), texture2DRect(tex, gl_TexCoord[0].st ).rgb);
 gl_FragColor = vec4(intensity, intensity, intensity, 1.0);}
 void main(void){gl_FragColor.rg =  gl_TexCoord[0].st;}
 uniform sampler2DRect tex; void main(void){gl_FragColor.rg= texture2DRect(tex, gl_TexCoord[0].st).rg;}
 uniform sampler2DRect tex; void main ()
 vec4 v1, v2, gg;
 vec4 cc  = texture2DRect(tex, gl_TexCoord[0].xy);
 gg.x = texture2DRect(tex, gl_TexCoord[1].xy).r;
 gg.y = texture2DRect(tex, gl_TexCoord[2].xy).r;
 gg.z = texture2DRect(tex, gl_TexCoord[3].xy).r;
 gg.w = texture2DRect(tex, gl_TexCoord[4].xy).r;
 vec2 dxdy = (gg.yw - gg.xz); 
 float grad = 0.5*length(dxdy);
 float theta = grad==0.0? 0.0: atan(dxdy.y, dxdy.x);
 gl_FragData[0] = vec4(cc.rg, grad, theta);
 uniform sampler2DRect tex; uniform vec2 truncate;
 void main(){ gl_FragColor = texture2DRect(tex, min(gl_TexCoord[0].xy, truncate)); }
 uniform sampler2DRect tex; uniform sampler2DRect oTex;
 uniform float size; void main(){
 vec4 cc = texture2DRect(tex, gl_TexCoord[0].st);
 vec4 oo = texture2DRect(oTex, cc.rg);
 gl_FragColor.rg = cc.rg;
 gl_FragColor.b = oo.a;
 gl_FragColor.a = size;}
 oTex
 Orientation simplified on this hardware
 Descriptor ignored on this hardware
 void main(){gl_FragColor = vec4(0.0);}
 uniform sampler2DRect tex; void main(){
 gl_FragColor.rg= texture2DRect(tex, gl_TexCoord[0].st).rg; gl_FragColor.ba = vec2(0.0,1.0);
 uniform vec4 sizes; uniform sampler2DRect tex;
 void main(void){
 float fwidth = sizes.y; float twidth = sizes.z; float rwidth = sizes.w; 
 float index = 0.1*(fwidth*floor(gl_TexCoord[0].y) + gl_TexCoord[0].x);
 float px = mod(index, twidth);
 vec2 tpos= floor(vec2(px, index*rwidth))+0.5;
 vec4 cc = texture2DRect(tex, tpos );
 float size = 3.0 * cc.a; //sizes.x;// 
 gl_FragColor.zw = vec2(0.0, 1.0);
 if(any(lessThan(cc.xy,vec2(0.0))))  {gl_FragColor.xy = cc.xy; }
 else {float type = fract(px);
 vec2 dxy = vec2(0); 
 dxy.x = type < 0.1 ? 0.0 : (((type <0.5) || (type > 0.9))? size : -size);
 dxy.y = type < 0.2 ? 0.0 : (((type < 0.3) || (type > 0.7) )? -size :size); 
 float s = sin(cc.b); float c = cos(cc.b); 
 gl_FragColor.x = cc.x + c*dxy.x-s*dxy.y;
 gl_FragColor.y = cc.y + c*dxy.y+s*dxy.x;}
 sizes
 uniform sampler2DRect tex; void main(void){float r = texture2DRect(tex, gl_TexCoord[0].st).r;
 gl_FragColor = vec4(r, r, r, 1);}
 uniform sampler2DRect tex; void main(void){float g = 0.5+(20.0*texture2DRect(tex, gl_TexCoord[0].st).g);
 gl_FragColor = vec4(g, g, g, 0.0);}
 uniform sampler2DRect tex; void main(void){
 vec4 cc = texture2DRect(tex, gl_TexCoord[0].st);gl_FragColor = vec4(5.0* cc.bbb, 1.0);}
 uniform sampler2DRect tex; void main(void){
 vec4 cc = texture2DRect(tex, gl_TexCoord[0].st);
 if(cc.r ==0.0) discard; gl_FragColor =  (cc.r==1.0? vec4(1.0, 0.0, 0,1.0):vec4(0.0,1.0,0.0,1.0));}
 * min(2.0 * cc.r + 0.1, 1.0))
 #define THRESHOLD2 
 * min(2.0 * cc.r + 0.1, 1.0))
 #define THRESHOLD1 (define THRESHOLD0 (define THRESHOLD2 
 #define THRESHOLD1 
 #define THRESHOLD0 
 float dog = 0.0; 
 gl_FragData[1] = vec4(0, 0, 0, 0); 
 dog = cc.g > float(THRESHOLD0) && all(greaterThan(cc.gggg, max(v1, v2)))?1.0: 0.0;
 dog = cc.g < float(-THRESHOLD0) && all(lessThan(cc.gggg, min(v1, v2)))?0.5: dog;
 if(dog == 0.0) return;
 uniform sampler2DRect tex, texU, texD; void main ()
 vec4 v1, v2, gg, temp;
 vec2 TexRU = vec2(gl_TexCoord[2].x, gl_TexCoord[4].y); 
 vec4 cc  = texture2DRect(tex, gl_TexCoord[0].xy);
 temp =  texture2DRect(tex, gl_TexCoord[1].xy);
 v1.x =  temp.g;
 gg.x = temp.r;
 temp = texture2DRect(tex, gl_TexCoord[2].xy) ;
 v1.y = temp.g;
 gg.y = temp.r;
 temp = texture2DRect(tex, gl_TexCoord[3].xy) ;
 v1.z = temp.g;
 gg.z = temp.r;
 temp = texture2DRect(tex, gl_TexCoord[4].xy) ;
 v1.w = temp.g;
 gg.w = temp.r;
 v2.x = texture2DRect(tex, gl_TexCoord[5].xy).g;
 v2.y = texture2DRect(tex, gl_TexCoord[6].xy).g;
 v2.z = texture2DRect(tex, gl_TexCoord[7].xy).g;
 v2.w = texture2DRect(tex, TexRU.xy).g;
 vec2 dxdy = (gg.yw - gg.xz); 
 float grad = 0.5*length(dxdy);
 float theta = grad==0.0? 0.0: atan(dxdy.y, dxdy.x);
 gl_FragData[0] = vec4(cc.rg, grad, theta);
 float fxx, fyy, fxy; 
 vec4 D2 = v1.xyzw - cc.gggg;
 vec2 D4 = v2.xw - v2.yz;
 fxx = D2.x + D2.y;
 fyy = D2.z + D2.w;
 fxy = 0.25*(D4.x + D4.y);
 float fxx_plus_fyy = fxx + fyy;
 float score_up = fxx_plus_fyy*fxx_plus_fyy; 
 float score_down = (fxx*fyy - fxy*fxy);
 if( score_down <= 0.0 || score_up > THRESHOLD2 * score_down)return;
 else{
 if(cc.g > v3.x || any(greaterThan(cc.gggg, v4)) ||any(greaterThan(cc.gggg, v6)))return; 
 v3.y = texture2DRect(texD, gl_TexCoord[0].xy).g;
 v5.x = texture2DRect(texD, gl_TexCoord[1].xy).g;
 v5.y = texture2DRect(texD, gl_TexCoord[2].xy).g;
 v5.z = texture2DRect(texD, gl_TexCoord[3].xy).g;
 v5.w = texture2DRect(texD, gl_TexCoord[4].xy).g;
 v6.x = texture2DRect(texD, gl_TexCoord[5].xy).g;
 v6.y = texture2DRect(texD, gl_TexCoord[6].xy).g;
 v6.z = texture2DRect(texD, gl_TexCoord[7].xy).g;
 v6.w = texture2DRect(texD, TexRU.xy).g;
 if(cc.g > v3.y || any(greaterThan(cc.gggg, v5)) ||any(greaterThan(cc.gggg, v6)))return; 
 if(dog == 1.0)
 if(cc.g < v3.x || any(lessThan(cc.gggg, v4)) ||any(lessThan(cc.gggg, v6)))return; 
 v3.y = texture2DRect(texD, gl_TexCoord[0].xy).g;
 v5.x = texture2DRect(texD, gl_TexCoord[1].xy).g;
 v5.y = texture2DRect(texD, gl_TexCoord[2].xy).g;
 v5.z = texture2DRect(texD, gl_TexCoord[3].xy).g;
 v5.w = texture2DRect(texD, gl_TexCoord[4].xy).g;
 v6.x = texture2DRect(texD, gl_TexCoord[5].xy).g;
 v6.y = texture2DRect(texD, gl_TexCoord[6].xy).g;
 v6.z = texture2DRect(texD, gl_TexCoord[7].xy).g;
 v6.w = texture2DRect(texD, TexRU.xy).g;
 if(cc.g < v3.y || any(lessThan(cc.gggg, v5)) ||any(lessThan(cc.gggg, v6)))return; 
 v3.x = texture2DRect(texU, gl_TexCoord[0].xy).g;
 v4.x = texture2DRect(texU, gl_TexCoord[1].xy).g;
 v4.y = texture2DRect(texU, gl_TexCoord[2].xy).g;
 v4.z = texture2DRect(texU, gl_TexCoord[3].xy).g;
 v4.w = texture2DRect(texU, gl_TexCoord[4].xy).g;
 v6.x = texture2DRect(texU, gl_TexCoord[5].xy).g;
 v6.y = texture2DRect(texU, gl_TexCoord[6].xy).g;
 v6.z = texture2DRect(texU, gl_TexCoord[7].xy).g;
 v6.w = texture2DRect(texU, TexRU.xy).g;
 vec2 D5 = 0.5*(v1.yw-v1.xz); 
 float fx = D5.x, fy = D5.y ; 
 float fs, fss , fxs, fys ; 
 vec2 v3; vec4 v4, v5, v6;
 gl_FragData[1] = vec4( dog, dxys); 
 bool dog_test = (abs(cc.g + 0.5*dot(vec3(fx, fy, fs), dxys ))<= float(THRESHOLD1)) ;
 if(dog_test || any(greaterThan(abs(dxys), vec3(1.0)))) dog = 0.0;
 dxys.z = A2.w /A2.z;
    
 dxys.y = A1.w - dxys.z*A1.z;
    
 dxys.x = A0.w - dxys.z*A0.z - dxys.y*A0.y;
 vec3 dxys = vec3(0.0);
 vec4 A0, A1, A2 ;
 A0 = vec4(fxx, fxy, fxs, -fx);
 A1 = vec4(fxy, fyy, fys, -fy);
 A2 = vec4(fxs, fys, fss, -fs);
 vec3 x3 = abs(vec3(fxx, fxy, fxs));
 float maxa = max(max(x3.x, x3.y), x3.z);
 if(maxa >= 1e-10 ) {
 if(x3.y ==maxa )
 vec4 TEMP = A1; A1 = A0; A0 = TEMP;
 }else if( x3.z == maxa )
 vec4 TEMP = A2; A2 = A0; A0 = TEMP;
 A0 /= A0.x;
 A1 -= A1.x * A0;
 A2 -= A2.x * A0;
 vec2 x2 = abs(vec2(A1.y, A2.y));
 if( x2.y > x2.x )
 vec3 TEMP = A2.yzw;
 A2.yzw = A1.yzw;
 A1.yzw = TEMP;
 x2.x = x2.y;
 if(x2.x >= 1e-10) {
 A1.yzw /= A1.y;
 A2.yzw -= A2.y * A1.yzw;
 if(abs(A2.z) >= 1e-10) {
 fs = 0.5*( v3.x - v3.y );  
 fss = v3.x + v3.y - cc.g - cc.g;
 fxs = 0.25 * ( v4.y + v5.x - v4.x - v5.y);
 fys = 0.25 * ( v4.w + v5.z - v4.z - v5.w);
 gl_FragData[1] =  vec4( dog, 0.0, 0.0, 0.0) ;
 gl_FragData[1] =  vec4(dog, 0.0, 0.0, 0.0) ;
 Detection simplified on this hardware
 texU
 texD
 uniform sampler2DRect tex; void main (void){
 vec4 helper = vec4( texture2DRect(tex, gl_TexCoord[0].xy).r,  texture2DRect(tex, gl_TexCoord[1].xy).r, texture2DRect(tex, gl_TexCoord[2].xy).r, texture2DRect(tex, gl_TexCoord[3].xy).r);
 gl_FragColor = vec4(greaterThan(helper, vec4(0.0,0.0,0.0,0.0)));
 uniform sampler2DRect tex;uniform vec2 bbox;
 void main (void ){
 vec4 helper = vec4( texture2DRect(tex, gl_TexCoord[0].xy).r,  texture2DRect(tex, gl_TexCoord[1].xy).r, texture2DRect(tex, gl_TexCoord[2].xy).r, texture2DRect(tex, gl_TexCoord[3].xy).r);
 bvec4 helper2 = bvec4( 
 all(lessThan(gl_TexCoord[0].xy , bbox)) && helper.x >0.0, all(lessThan(gl_TexCoord[1].xy , bbox)) && helper.y >0.0, all(lessThan(gl_TexCoord[2].xy , bbox)) && helper.z >0.0, all(lessThan(gl_TexCoord[3].xy , bbox)) && helper.w >0.0);
 gl_FragColor = vec4(helper2);
 bbox
 uniform sampler2DRect tex; void main (void){
 vec4 helper; vec4 helper2; 
 helper = texture2DRect(tex, gl_TexCoord[0].xy); helper2.xy = helper.xy + helper.zw; 
 helper = texture2DRect(tex, gl_TexCoord[1].xy); helper2.zw = helper.xy + helper.zw; 
 gl_FragColor.rg = helper2.xz + helper2.yw;
 helper = texture2DRect(tex, gl_TexCoord[2].xy); helper2.xy = helper.xy + helper.zw; 
 helper = texture2DRect(tex, gl_TexCoord[3].xy); helper2.zw = helper.xy + helper.zw; 
 gl_FragColor.ba= helper2.xz+helper2.yw;
 tex0
 uniform sampler2DRect tex
 uniform float width;
 void main(void){
 float  index = floor(gl_TexCoord[0].y) * width + floor(gl_TexCoord[0].x);
 vec2 pos = vec2(0.5, 0.5);
 uniform sampler2DRect tex;
 vec4 tc = texture2DRect( tex, gl_TexCoord[0].xy);
 vec2 pos = tc.rg; float index = tc.b;
 vec2 sum; 
 vec4 cc;
 vec2 cpos = vec2(-0.5, 0.5);
 vec2 opos;
 , pos);
 cc = texture2DRect(tex
 sum.x = cc.r + cc.g; sum.y = sum.x + cc.b;  
 if (index <cc.r){ opos = cpos.xx;}
 else if(index < sum.x ) {opos = cpos.yx; index -= cc.r;}
 else if(index < sum.y ) {opos = cpos.xy; index -= sum.x;}
 else {opos = cpos.yy; index -= sum.y;}
 pos = (pos + pos + opos);
 gl_FragColor = vec4(pos, index, 1.0);
 void main()
 vec4 bins[10];
 bins[0] = vec4(0.0);bins[1] = vec4(0.0);bins[2] = vec4(0.0);
 bins[3] = vec4(0.0);bins[4] = vec4(0.0);bins[5] = vec4(0.0);
 bins[6] = vec4(0.0);bins[7] = vec4(0.0);bins[8] = vec4(0.0);
 vec4 loc = texture2DRect(tex, gl_TexCoord[0].xy);
 vec2 pos = loc.xy;
 bool orientation_mode = (size.z != 0.0);
 float sigma = orientation_mode? abs(size.z) : loc.w; 
 uniform sampler2DRect texS;
 uniform sampler2DRect tex;
 uniform sampler2DRect gradTex;
 uniform vec4 size;
 #define ORIENTATION_THRESHOLD 
 #define SAMPLE_WF float(define GAUSSIAN_WF float(if(offset.x < 0.6) sigma = -sigma; 
 #endif
 if(orientation_mode){
 vec4 offset = texture2DRect(texS, pos);
 pos.xy = pos.xy + offset.yz; 
 sigma = sigma * pow(size.w, offset.w);
 #if 
 //bool fixed_orientation = (size.z < 0.0);
 if(size.z < 0.0) {gl_FragData[0] = vec4(pos, 0.0, sigma); return;}
 float gsigma = sigma * GAUSSIAN_WF;
 vec2 win = abs(vec2(sigma * (SAMPLE_WF * GAUSSIAN_WF))) ;
 vec2 dim = size.xy;
 float dist_threshold = win.x*win.x+0.5;
 float factor = -0.5/(gsigma*gsigma);
 vec4 sz;
 vec2 spos;
 //if(any(pos.xy <= 1)) discard;
 sz.xy = max( pos - win, vec2(1,1));
 sz.zw = min( pos + win, dim-vec2(2, 2));
 sz = floor(sz)+0.5;
 for(spos.y = sz.y; spos.y <= sz.w;
 spos.y+=1.0)
 for(spos.x = sz.x; spos.x <= sz.z;
 spos.x+=1.0)
 vec2 offset = spos - pos;
 float sq_dist = dot(offset,offset);
 if( sq_dist < dist_threshold){
 vec4 cc = texture2DRect(gradTex, spos);
 float grad = cc.b;
 float theta = cc.a;
 float idx = floor(degrees(theta)*0.1);
 if(idx < 0.0 ) idx += 36.0;
 float weight = grad*exp(sq_dist * factor);
 float vidx = fract(idx * 0.25) * 4.0;//mod(idx, 4.0) ;
 vec4 inc = weight*vec4(equal(vec4(vidx), vec4(0.0,1.0,2.0,3.0)));
 int iidx = int((idx*0.25));
 bins[iidx]+=inc;
 if(idx < 16.0)
 if(idx < 8.0)
 if(idx < 4.0)
 bins[0]+=inc;}
 else
 bins[1]+=inc;}
 }else
 if(idx < 12.0){
 bins[2]+=inc;}
 else
 bins[3]+=inc;}
 }else if(idx < 32.0)
 if(idx < 24.0)
 if(idx <20.0)
 bins[4]+=inc;}
 else
 bins[5]+=inc;}
 }else
 if(idx < 28.0){
 bins[6]+=inc;}
 else
 bins[7]+=inc;}
 }else 
 bins[8]+=inc;
 gradTex
 texS
 //mat3 m1 = mat3(1, 0, 0, 3, 1, 0, 6, 3, 1)/27.0;  
 mat3 m1 = mat3(1, 3, 6, 0, 1, 3,0, 0, 1)/27.0;  
 mat4 m2 = mat4(7, 6, 3, 1, 6, 7, 6, 3, 3, 6, 7, 6, 1, 3, 6, 7)/27.0;
 #define FILTER_CODE(i) {
 vec4 newb
 (bins[i]* m2);
 newb.xyz
 ( prev.yzw * m1);
 prev = bins[i];
 newb.wzy
 ( bins[i+1].zyx *m1);
 bins[i] = newb;}
 for (int j=0; j<2; j++)
 vec4 prev  = bins[8];
 bins[9]
 = bins[0];
 for (int i=0; i<9; i++)
 FILTER_CODE(i);
   FILTER_CODE(0);
   FILTER_CODE(1);
   FILTER_CODE(2);
   FILTER_CODE(3);
   FILTER_CODE(4);
   FILTER_CODE(5);
   FILTER_CODE(6);
   FILTER_CODE(7);
   FILTER_CODE(8);
 vec4 maxh; vec2 maxh2; 
 vec4 maxh4 = max(max(max(max(max(max(max(max(bins[0], bins[1]), bins[2]), bins[3]), bins[4]), bins[5]), bins[6]), bins[7]), bins[8]);
 maxh2 = max(maxh4.xy, maxh4.zw); maxh = vec4(max(maxh2.x, maxh2.y));
 vec4 Orientations = vec4(0.0, 0.0, 0.0, 0.0);
 vec4 weights = vec4(0.0,0.0,0.0,0.0);
 {test = greaterThan(bins[i], hh);
 if(weight <=weights.g){}\
 else if(weight >weights.r)\
 {weights.rg = vec2(weight, weights.r); Orientations.rg = vec2(th, Orientations.r);}\
 else {weights.g = weight; Orientations.g = th;}
 if(weight <=weights.b){}\
 else if(weight >weights.r)\
 {weights.rgb = vec3(weight, weights.rg); Orientations.rgb = vec3(th, Orientations.rg);}\
 else if(weight >weights.g)\
 {weights.gb = vec2(weight, weights.g); Orientations.gb = vec2(th, Orientations.g);}\
 else {weights.b = weight; Orientations.b = th;}
 if(weight <=weights.a){}\
 else if(weight >weights.r)\
 {weights = vec4(weight, weights.rgb); Orientations = vec4(th, Orientations.rgb);}\
 else if(weight >weights.g)\
 {weights.gba = vec3(weight, weights.gb); Orientations.gba = vec3(th, Orientations.gb);}\
 else if(weight >weights.b)\
 {weights.ba = vec2(weight, weights.b); Orientations.ba = vec2(th, Orientations.b);}\
 else {weights.a = weight; Orientations.a = th;}
 float Orientation;
 if(npeaks<=0.0){\
 test = equal(bins[i], maxh)
 npeaks++;
 Orientation = th;
 prevb = bins[i].w;
 else if(test.a && bins[i].w > bins[i].z && bins[i].w > bins[i+1].x )
    float
 di = -0.5 * (bins[i+1].x-bins[i].z) / (bins[i+1].x+bins[i].z-bins[i].w - bins[i].w) ; \
    float
 th = (k+di+3.5);
 float weight = bins[i].w;
 if(test.b && all( greaterThan( bins[i].zz , bins[i].yw)) )
    float
 di = -0.5 * (bins[i].w-bins[i].y) / (bins[i].w+bins[i].y-bins[i].z- bins[i].z) ; \
    float
 th = (k+di+2.5);
 float weight = bins[i].z;
 else if(test.g && all( greaterThan(bins[i].yy , bins[i].xz)) )
    float
 di = -0.5 * (bins[i].z-bins[i].x) / (bins[i].z+bins[i].x-bins[i].y- bins[i].y) ; \
    float
 th = (k+di+1.5);
 float weight = bins[i].y;
 if( any ( test) )
 if(test.r && bins[i].x > prevb && bins[i].x > bins[i].y )
    float
 di = -0.5 * (bins[i].y-prevb) / (bins[i].y+prevb-bins[i].x - bins[i].x) ; \
    float
 th = (k+di+0.5);
 float weight = bins[i].x;
 #define FINDPEAK(i, k)
 vec4 hh = maxh * ORIENTATION_THRESHOLD;
 bvec4 test;
 bins[9] = bins[0];
 float npeaks = 0.0, k = 0.0;
 float prevb
 = bins[8].w;
 for (int i = 0; i < 9; i++)
 FINDPEAK(i, k);
 k = k + 4.0;
 vec4 hh = maxh * ORIENTATION_THRESHOLD; bvec4 test;
 bins[9] = bins[0];
 float npeaks = 0.0;
 float prevb
 = bins[8].w;
 FINDPEAK(0, 0.0);
 FINDPEAK(1, 4.0);
 FINDPEAK(2, 8.0);
 FINDPEAK(3, 12.0);
 FINDPEAK(4, 16.0);
 FINDPEAK(5, 20.0);
 FINDPEAK(6, 24.0);
 FINDPEAK(7, 28.0);
 FINDPEAK(8, 32.0);
 ), vec4(greaterThan(weights, hh)));
 gl_FragData[0] = vec4(pos, npeaks, sigma);
 gl_FragData[1] = radians((Orientations )*10.0);
 }else{
 gl_FragData[0] = vec4(pos, radians((Orientations.x)*10.0), sigma);
 if(orientation_mode){
 npeaks = dot(vec4(1,1, gl_FragData[0] = vec4(pos, radians((Orientation)*10.0), sigma);
 #define M_PI 3.14159265358979323846
 #define TWO_PI (2.0*M_PI)
 #define RPI 1.2732395447351626861510701069801
 #define WF  size.z
 uniform sampler2DRect tex;
 uniform sampler2DRect gradTex;
 uniform vec4 dsize;
 uniform vec3 size;
 void main()
 vec2 dim
 = size.xy;
 //image size
 float index = dsize.x*floor(gl_TexCoord[0].y * 0.5) + gl_TexCoord[0].x;
 float idx = 8.0 * fract(index * 0.125) + 8.0 * floor(2.0 * fract(gl_TexCoord[0].y * 0.5));
 index = floor(index*0.125) + 0.49;  
 vec2 coord = floor( vec2( mod(index, dsize.z), index*dsize.w)) + 0.5 ;
 vec2 pos = texture2DRect(tex, coord).xy;
 if(any(lessThanEqual(pos.xy,  vec2(1.0))) || any(greaterThanEqual(pos.xy, dim-1.0)))// discard;
 { gl_FragData[0] = gl_FragData[1] = vec4(0.0); return; }
 float  anglef = texture2DRect(tex, coord).z;
 if(anglef > M_PI) anglef -= TWO_PI;
 float sigma = texture2DRect(tex, coord).w; 
 float spt  = abs(sigma * WF);
 //default to be 3*sigma
 vec4 cscs, rots;
 cscs.y = sin(anglef);
 cscs.x = cos(anglef);
 cscs.zw = - cscs.xy;
 rots = cscs /spt;
 cscs *= spt; 
 vec4 temp; vec2 pt, offsetpt;
 /*the fraction part of idx is .5*/
 offsetpt.x = 4.0* fract(idx*0.25) - 2.0;
 offsetpt.y = floor(idx*0.25) - 1.5;
 temp = cscs.xwyx*offsetpt.xyxy;
 pt = pos + temp.xz + temp.yw;
 vec2 bwin = abs(cscs.xy);
 float bsz = bwin.x + bwin.y;
 vec4 sz;
 sz.xy = max(pt - vec2(bsz), vec2(1,1));
 sz.zw = min(pt + vec2(bsz), dim - vec2(2, 2));
 sz = floor(sz)+0.5;
 vec4 DA, DB; vec2 spos;
 DA = DB  = vec4(0.0, 0.0, 0.0, 0.0);
 for(spos.y = sz.y; spos.y <= sz.w;
 spos.y+=1.0)
 for(spos.x = sz.x; spos.x <= sz.z;
 spos.x+=1.0)
 vec2 diff = spos - pt;
 temp = rots.xywx * diff.xyxy;
 vec2 nxy = (temp.xz + temp.yw); 
 vec2 nxyn = abs(nxy);
 if(all( lessThan(nxyn, vec2(1.0)) ))
 vec4 cc = texture2DRect(gradTex, spos);
 float mod = cc.b;
 float angle = cc.a;
 float theta0 = RPI * (anglef - angle);
 float theta = theta0 < 0.0? theta0 + 8.0 : theta0;;
 diff = nxy + offsetpt.xy;
 float ww = exp(-0.125*dot(diff, diff));
 vec2 weights = vec2(1) - nxyn;
 float weight = weights.x * weights.y *mod*ww; 
 float theta1 = floor(theta); 
 float weight2 = (theta - theta1) * weight;
 float weight1 = weight - weight2;
 DA += vec4(equal(vec4(theta1),  vec4(0, 1, 2, 3)))*weight1;
 DA += vec4(equal(vec4(theta1),  vec4(7, 0, 1, 2)))*weight2; 
 DB += vec4(equal(vec4(theta1),  vec4(4, 5, 6, 7)))*weight1;
 DB += vec4(equal(vec4(theta1),  vec4(3, 4, 5, 6)))*weight2; 
 gl_FragData[0] = DA; gl_FragData[1] = DB;
 dsize
 uniform sampler2DRect tex; void main(){
 float intensity = dot(vec3(0.299, 0.587, 0.114), texture2DRect(tex,gl_TexCoord[0].xy ).rgb);
 gl_FragColor= vec4(intensity, intensity, intensity, 1.0);}
 uniform sampler2DRect tex; void main(){
 gl_FragColor= vec4(texture2DRect(tex,gl_TexCoord[0].st ).r,texture2DRect(tex,gl_TexCoord[1].st ).r, texture2DRect(tex,gl_TexCoord[2].st ).r,texture2DRect(tex,gl_TexCoord[3].st ).r);}
 uniform sampler2DRect tex;  uniform vec4 truncate; void main(){
 vec4 cc = texture2DRect(tex, min(gl_TexCoord[0].xy, truncate.xy)); 
 bvec2 ob = lessThan(gl_TexCoord[0].xy, truncate.xy);
 if(ob.y) { gl_FragColor = (truncate.z ==0.0 ? cc.rrbb : cc.ggaa); } 
 else if(ob.x) {gl_FragColor = (truncate.w <1.5 ? cc.rgrg : cc.baba);} 
 else {
 vec4 weights = vec4(vec4(0.0, 1.0, 2.0, 3.0) == truncate.wwww);
 float v = dot(weights, cc); gl_FragColor = vec4(v);}}
 uniform sampler2DRect tex; uniform sampler2DRect texp; void main ()
 vec4 v1, v2, gg;
 vec4 cc = texture2DRect(tex, gl_TexCoord[0].xy);
 vec4 cp = texture2DRect(texp, gl_TexCoord[0].xy);
 gl_FragData[0] = cc - cp; 
 vec4 cl = texture2DRect(tex, gl_TexCoord[1].xy); vec4 cr = texture2DRect(tex, gl_TexCoord[2].xy);
 vec4 cd = texture2DRect(tex, gl_TexCoord[3].xy); vec4 cu = texture2DRect(tex, gl_TexCoord[4].xy);
 vec4 dx = (vec4(cr.rb, cc.ga) - vec4(cc.rb, cl.ga)).zxwy;
 vec4 dy = (vec4(cu.rg, cc.ba) - vec4(cc.rg, cd.ba)).zwxy;
 vec4 grad = 0.5 * sqrt(dx*dx + dy * dy);
 gl_FragData[1] = grad;
 vec4 invalid = vec4(equal(grad, vec4(0.0)));
 vec4 ov = atan(dy, dx + invalid);
 gl_FragData[2] = ov; 
 texp
 uniform sampler2DRect tex; uniform sampler2DRect oTex; uniform vec2 size; void main(){
 vec4 cc = texture2DRect(tex, gl_TexCoord[0].xy);
 vec2 co = cc.xy * 0.5; 
 vec4 oo = texture2DRect(oTex, co);
 bvec2 bo = lessThan(fract(co), vec2(0.5)); 
 float o = bo.y? (bo.x? oo.r : oo.g) : (bo.x? oo.b : oo.a); 
 gl_FragColor = vec4(cc.rg, o, size.x * pow(size.y, cc.a));}
 uniform sampler2DRect tex;void main(){
 gl_FragColor= vec4(texture2DRect(tex, gl_TexCoord[0].xy).rg, 0,1);}
 uniform sampler2DRect tex; uniform vec4 sizes; void main(){
 float fwidth = sizes.y; 
 float twidth = sizes.z; 
 float rwidth = sizes.w; 
 float index = 0.1*(fwidth*floor(gl_TexCoord[0].y) + gl_TexCoord[0].x);
 float px = mod(index, twidth);
 vec2 tpos= floor(vec2(px, index*rwidth))+0.5;
 vec4 cc = texture2DRect(tex, tpos );
 float size = 3.0 * cc.a; 
 gl_FragColor.zw = vec2(0.0, 1.0);
 if(any(lessThan(cc.xy,vec2(0.0)))) {gl_FragColor.xy = cc.xy;}else 
 float type = fract(px);
 vec2 dxy; float s, c;
 dxy.x = type < 0.1 ? 0.0 : (((type <0.5) || (type > 0.9))? size : -size);
 dxy.y = type < 0.2 ? 0.0 : (((type < 0.3) || (type > 0.7) )? -size :size); 
 s = sin(cc.b); c = cos(cc.b); 
 gl_FragColor.x = cc.x + c*dxy.x-s*dxy.y;
 gl_FragColor.y = cc.y + c*dxy.y+s*dxy.x;}
 uniform sampler2DRect tex; void main(){
 vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy);
 bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));
 float v = ff.y?(ff.x? pc.r : pc.g):(ff.x?pc.b:pc.a); gl_FragColor = vec4(vec3(v), 1.0);}
 uniform sampler2DRect tex; void main(){
 vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy); bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));
 float v = ff.y ?(ff.x ? pc.r : pc.g):(ff.x ? pc.b : pc.a);float g = (0.5+20.0*v);
 gl_FragColor = vec4(g, g, g, 1.0);}
 uniform sampler2DRect tex; void main(){
 vec4 pc = texture2DRect(tex, gl_TexCoord[0].xy); bvec2 ff = lessThan(fract(gl_TexCoord[0].xy), vec2(0.5));
 float v = ff.y ?(ff.x ? pc.r : pc.g):(ff.x ? pc.b : pc.a); gl_FragColor = vec4(5.0 *vec3(v), 1.0); }
 uniform sampler2DRect tex; void main(){
 vec4 oc = texture2DRect(tex, gl_TexCoord[0].xy); 
 vec4 cc = vec4(equal(abs(oc.rrrr), vec4(1.0, 2.0, 3.0, 4.0))); 
 bvec2 ff = lessThan(fract(gl_TexCoord[0].xy) , vec2(0.5));
 float v = ff.y ?(ff.x ? cc.r : cc.g):(ff.x ? cc.b : cc.a);
 if(v == 0.0) discard;
 else if(oc.r > 0.0) gl_FragColor = vec4(1.0, 0.0, 0,1.0); 
 else gl_FragColor = vec4(0.0,1.0,0.0,1.0);
 uniform sampler2DRect tex;
 uniform sampler2DRect gtex;
 uniform sampler2DRect otex; uniform vec4 size;
 void main()
 vec4 bins[10];
 bins[0] = vec4(0.0);bins[1] = vec4(0.0);bins[2] = vec4(0.0);
 bins[3] = vec4(0.0);bins[4] = vec4(0.0);bins[5] = vec4(0.0);
 bins[6] = vec4(0.0);bins[7] = vec4(0.0);bins[8] = vec4(0.0);
 vec4 sift = texture2DRect(tex, gl_TexCoord[0].xy);
 vec2 pos = sift.xy; 
 bool orientation_mode = (size.z != 0.0);
 float sigma = orientation_mode? (abs(size.z) * pow(size.w, sift.w) * sift.z) : (sift.w); 
 //bool fixed_orientation = (size.z < 0.0);
 if(size.z < 0.0) {gl_FragData[0] = vec4(pos, 0.0, sigma); return;}
 float gsigma = sigma * GAUSSIAN_WF;
 vec2 win = abs(vec2(sigma * (SAMPLE_WF * GAUSSIAN_WF)));
 vec2 dim = size.xy;
 vec4 dist_threshold = vec4(win.x*win.x+0.5);
 float factor = -0.5/(gsigma*gsigma);
 vec4 sz;
 vec2 spos;
 //if(any(pos.xy <= float(1))) discard;
 sz.xy = max( pos - win, vec2(2.0,2.0));
 sz.zw = min( pos + win, dim-vec2(3.0));
 sz = floor(sz*0.5) + 0.5; 
 for(spos.y = sz.y; spos.y <= sz.w;
 spos.y+=1.0)
 for(spos.x = sz.x; spos.x <= sz.z;
 spos.x+=1.0)
 vec2 offset = 2.0 * spos - pos - vec2(0.5);
 vec4 off = vec4(offset, offset + vec2(1));
 vec4 distsq = off.xzxz * off.xzxz + off.yyww * off.yyww;
 bvec4 inside = lessThan(distsq, dist_threshold);
 if(any(inside))
 vec4 gg = texture2DRect(gtex, spos);
 vec4 oo = texture2DRect(otex, spos);
 vec4 weight = gg * exp(distsq * factor);
 vec4 idxv  = floor(degrees(oo)*0.1); 
 idxv+= (vec4(lessThan(idxv, vec4(0.0)))*36.0); 
 vec4 vidx = fract(idxv * 0.25) * 4.0;//mod(idxv, 4.0);
 for(int i = 0 ; i < 4; i++)
 if(inside[i])
 float idx = idxv[i];
 vec4 inc = weight[i] * vec4(equal(vec4(vidx[i]), vec4(0.0,1.0,2.0,3.0)));
 int iidx = int(floor(idx*0.25));
 bins[iidx]+=inc;
 for(int i = 0 ; i < 4; i++)
 if(inside[i])
 float idx = idxv[i]; 
 vec4 inc = weight[i] * vec4(equal(vec4(vidx[i]), vec4(0,1,2,3)));
 if(idx < 16.0)
 if(idx < 8.0)
 if(idx < 4.0)
 bins[0]+=inc;}
 else
 bins[1]+=inc;}
 }else
 if(idx < 12.0){
 bins[2]+=inc;}
 else
 bins[3]+=inc;}
 }else if(idx < 32.0)
 if(idx < 24.0)
 if(idx <20.0)
 bins[4]+=inc;}
 else
 bins[5]+=inc;}
 }else
 if(idx < 28.0){
 bins[6]+=inc;}
 else
 bins[7]+=inc;}
 }else 
 bins[8]+=inc;
 gtex
 otex
 uniform sampler2DRect tex; void main ()
 vec4 key = vec4(texture2DRect(tex, gl_TexCoord[0].xy).r, texture2DRect(tex, gl_TexCoord[1].xy).r, texture2DRect(tex, gl_TexCoord[2].xy).r, texture2DRect(tex, gl_TexCoord[3].xy).r); 
 gl_FragColor = vec4(notEqual(key, vec4(0.0))); 
 uniform sampler2DRect tex; uniform vec4 bbox; void main ()
 vec4 helper1 = vec4(equal(vec4(abs(texture2DRect(tex, gl_TexCoord[0].xy).r)), vec4(1.0, 2.0, 3.0, 4.0)));
 vec4 helper2 = vec4(equal(vec4(abs(texture2DRect(tex, gl_TexCoord[1].xy).r)), vec4(1.0, 2.0, 3.0, 4.0)));
 vec4 helper3 = vec4(equal(vec4(abs(texture2DRect(tex, gl_TexCoord[2].xy).r)), vec4(1.0, 2.0, 3.0, 4.0)));
 vec4 helper4 = vec4(equal(vec4(abs(texture2DRect(tex, gl_TexCoord[3].xy).r)), vec4(1.0, 2.0, 3.0, 4.0)));
 vec4 bx1 = vec4(lessThan(gl_TexCoord[0].xxyy, bbox)); 
 vec4 bx4 = vec4(lessThan(gl_TexCoord[3].xxyy, bbox)); 
 vec4 bx2 = vec4(bx4.xy, bx1.zw); 
 vec4 bx3 = vec4(bx1.xy, bx4.zw);
 helper1 = min(min(bx1.xyxy, bx1.zzww), helper1);
 helper2 = min(min(bx2.xyxy, bx2.zzww), helper2);
 helper3 = min(min(bx3.xyxy, bx3.zzww), helper3);
 helper4 = min(min(bx4.xyxy, bx4.zzww), helper4);
 gl_FragColor.r = float(any(greaterThan(max(helper1.xy, helper1.zw), vec2(0.0))));
 gl_FragColor.g = float(any(greaterThan(max(helper2.xy, helper2.zw), vec2(0.0))));
 gl_FragColor.b = float(any(greaterThan(max(helper3.xy, helper3.zw), vec2(0.0))));
 gl_FragColor.a = float(any(greaterThan(max(helper4.xy, helper4.zw), vec2(0.0))));
 uniform sampler2DRect tex; uniform sampler2DRect ktex; void main()
 vec4 tc = texture2DRect( tex, gl_TexCoord[0].xy);
 vec2 pos = tc.rg; float index = tc.b;
 vec4 tk = texture2DRect( ktex, pos); 
 vec4 keys = vec4(equal(abs(tk.rrrr), vec4(1.0, 2.0, 3.0, 4.0))); 
 vec2 opos; 
 opos.x = dot(keys, vec4(-0.5, 0.5, -0.5, 0.5));
 opos.y = dot(keys, vec4(-0.5, -0.5, 0.5, 0.5));
 gl_FragColor = vec4(opos + pos * 2.0 + tk.yz, 1.0, tk.w);
 uniform sampler2DRect tex; uniform sampler2DRect ktex; void main()
 vec4 tc = texture2DRect( tex, gl_TexCoord[0].xy);
 vec2 pos = tc.rg; float index = tc.b;
 vec4 tk = texture2DRect( ktex, pos); 
 vec4 keys = vec4(equal(abs(tk.rrrr), vec4(1.0, 2.0, 3.0, 4.0))) 
 vec2 opos; 
 opos.x = dot(keys, vec4(-0.5, 0.5, -0.5, 0.5));
 opos.y = dot(keys, vec4(-0.5, -0.5, 0.5, 0.5));
 gl_FragColor = vec4(opos + pos * 2.0 + tk.yz, sign(tk.r), tk.w);
 ktex
 uniform sampler2DRect tex; void main ()
 vec4 helper; vec4 helper2; 
 helper = texture2DRect(tex, gl_TexCoord[0].xy); helper2.xy = helper.xy + helper.zw; 
 helper = texture2DRect(tex, gl_TexCoord[1].xy); helper2.zw = helper.xy + helper.zw; 
 gl_FragColor.rg = helper2.xz + helper2.yw;
 helper = texture2DRect(tex, gl_TexCoord[2].xy); helper2.xy = helper.xy + helper.zw; 
 helper = texture2DRect(tex, gl_TexCoord[3].xy); helper2.zw = helper.xy + helper.zw; 
 gl_FragColor.ba= helper2.xz+helper2.yw;
 #define REPEAT4(FUNCTION)\
 for(int i = 0; i < 4; ++i)\
 FUNCTION(i);\
 #define REPEAT4(FUNCTION)\
 FUNCTION(0);\
 FUNCTION(1);\
 FUNCTION(2);\
 FUNCTION(3);
 #define DEFINE_EXTRA() vec4 ii = texture2DRect(texI, gl_TexCoord[0].xy); ii = min(2.0 * ii + 0.1, 1.0) 
 #define MOVE_EXTRA(idx)
 ii[0] = ii[idx]
 * ii[0])
 #define THRESHOLD2 
 * ii[i])
 #define THRESHOLD1 (define THRESHOLD0(i) (uniform sampler2DRect texI;
 #define DEFINE_EXTRA()
 #define MOVE_EXTRA(idx) 
 #define THRESHOLD0(i) 
 uniform sampler2DRect tex; uniform sampler2DRect texU;
 uniform sampler2DRect texD; void main ()
 vec2 TexRU = vec2(gl_TexCoord[2].x, gl_TexCoord[4].y); 
 vec4 ccc = texture2DRect(tex, gl_TexCoord[0].xy);
 vec4 clc = texture2DRect(tex, gl_TexCoord[1].xy);
 vec4 crc = texture2DRect(tex, gl_TexCoord[2].xy);
 vec4 ccd = texture2DRect(tex, gl_TexCoord[3].xy);
 vec4 ccu = texture2DRect(tex, gl_TexCoord[4].xy);
 vec4 cld = texture2DRect(tex, gl_TexCoord[5].xy);
 vec4 clu = texture2DRect(tex, gl_TexCoord[6].xy);
 vec4 crd = texture2DRect(tex, gl_TexCoord[7].xy);
 vec4 cru = texture2DRect(tex, TexRU.xy);
 vec4  cc = ccc;
 vec4  v1[4], v2[4];
 v1[0] = vec4(clc.g, ccc.g, ccd.b, ccc.b);
 v1[1] = vec4(ccc.r, crc.r, ccd.a, ccc.a);
 v1[2] = vec4(clc.a, ccc.a, ccc.r, ccu.r);
 v1[3] = vec4(ccc.b, crc.b, ccc.g, ccu.g);
 v2[0] = vec4(cld.a, clc.a, ccd.a, ccc.a);
 v2[1] = vec4(ccd.b, ccc.b, crd.b, crc.b);
 v2[2] = vec4(clc.g, clu.g, ccc.g, ccu.g);
 v2[3] = vec4(ccc.r, ccu.r, crc.r, cru.r);
 DEFINE_EXTRA();
 vec4 key = vec4(0.0); 
 #define KEYTEST_STEP0(i) \
 bvec4 test1 = greaterThan(vec4(cc[i]), max(v1[i], v2[i])), test2 = lessThan(vec4(cc[i]), min(v1[i], v2[i]));\
 key[i] = cc[i] > float(THRESHOLD0(i)) && all(test1)?1.0: 0.0;\
 key[i] = cc[i] < float(-THRESHOLD0(i)) && all(test2)? -1.0: key[i];\
 REPEAT4(KEYTEST_STEP0);
 if(gl_TexCoord[0].x < 1.0) {key.rb = vec2(0.0);}
 if(gl_TexCoord[0].y < 1.0) {key.rg = vec2(0.0);}
 gl_FragColor = vec4(0.0);
 if(any(notEqual(key, vec4(0.0)))) {
 float fxx[4], fyy[4], fxy[4], fx[4], fy[4];
 #define EDGE_SUPPRESION(i) \
 if(key[i] != 0.0)\
 vec4 D2 = v1[i].xyzw - cc[i];\
 vec2 D4 = v2[i].xw - v2[i].yz;\
 vec2 D5 = 0.5*(v1[i].yw-v1[i].xz); \
 fx[i] = D5.x;
 fy[i] = D5.y ;\
 fxx[i] = D2.x + D2.y;\
 fyy[i] = D2.z + D2.w;\
 fxy[i] = 0.25*(D4.x + D4.y);\
 float fxx_plus_fyy = fxx[i] + fyy[i];\
 float score_up = fxx_plus_fyy*fxx_plus_fyy; \
 float score_down = (fxx[i]*fyy[i] - fxy[i]*fxy[i]);\
 if( score_down <= 0.0 || score_up > THRESHOLD2 * score_down)key[i] = 0.0;\
 REPEAT4(EDGE_SUPPRESION);
 if(any(notEqual(key, vec4(0.0)))) {
 #define KEYTEST_STEP2(i)\
 if(key[i] == 1.0)\
 bvec4 test = lessThan(vec4(cc[i]), max(v5[i], v6[i]));\
 if(cc[i] < cd[i] || any(test))key[i] = 0.0; \
 }else if(key[i] == -1.0)\
 bvec4 test = greaterThan(vec4(cc[i]), min(v5[i], v6[i]));\
 if(cc[i] > cd[i] || any(test))key[i] = 0.0; \
 REPEAT4(KEYTEST_STEP2);
 float keysum = dot(abs(key), vec4(1, 1, 1, 1)) ;
 //assume there is only one keypoint in the four. 
 if(keysum==1.0) {
 ccc = texture2DRect(texD, gl_TexCoord[0].xy);
 clc = texture2DRect(texD, gl_TexCoord[1].xy);
 crc = texture2DRect(texD, gl_TexCoord[2].xy);
 ccd = texture2DRect(texD, gl_TexCoord[3].xy);
 ccu = texture2DRect(texD, gl_TexCoord[4].xy);
 cld = texture2DRect(texD, gl_TexCoord[5].xy);
 clu = texture2DRect(texD, gl_TexCoord[6].xy);
 crd = texture2DRect(texD, gl_TexCoord[7].xy);
 cru = texture2DRect(texD, TexRU.xy);
 vec4 cd = ccc;
 v5[0] = vec4(clc.g, ccc.g, ccd.b, ccc.b);
 v5[1] = vec4(ccc.r, crc.r, ccd.a, ccc.a);
 v5[2] = vec4(clc.a, ccc.a, ccc.r, ccu.r);
 v5[3] = vec4(ccc.b, crc.b, ccc.g, ccu.g);
 v6[0] = vec4(cld.a, clc.a, ccd.a, ccc.a);
 v6[1] = vec4(ccd.b, ccc.b, crd.b, crc.b);
 v6[2] = vec4(clc.g, clu.g, ccc.g, ccu.g);
 v6[3] = vec4(ccc.r, ccu.r, crc.r, cru.r);
 #define KEYTEST_STEP1(i)\
 if(key[i] == 1.0)\
 bvec4 test = lessThan(vec4(cc[i]), max(v4[i], v6[i])); \
 if(cc[i] < cu[i] || any(test))key[i] = 0.0; \
 }else if(key[i] == -1.0)\
 bvec4 test = greaterThan(vec4(cc[i]), min(v4[i], v6[i])); \
 if(cc[i] > cu[i] || any(test) )key[i] = 0.0; \
 REPEAT4(KEYTEST_STEP1);
 if(any(notEqual(key, vec4(0.0)))) { 
 vec4  v4[4], v5[4], v6[4];
 ccc = texture2DRect(texU, gl_TexCoord[0].xy);
 clc = texture2DRect(texU, gl_TexCoord[1].xy);
 crc = texture2DRect(texU, gl_TexCoord[2].xy);
 ccd = texture2DRect(texU, gl_TexCoord[3].xy);
 ccu = texture2DRect(texU, gl_TexCoord[4].xy);
 cld = texture2DRect(texU, gl_TexCoord[5].xy);
 clu = texture2DRect(texU, gl_TexCoord[6].xy);
 crd = texture2DRect(texU, gl_TexCoord[7].xy);
 cru = texture2DRect(texU, TexRU.xy);
 vec4 cu = ccc;
 v4[0] = vec4(clc.g, ccc.g, ccd.b, ccc.b);
 v4[1] = vec4(ccc.r, crc.r, ccd.a, ccc.a);
 v4[2] = vec4(clc.a, ccc.a, ccc.r, ccu.r);
 v4[3] = vec4(ccc.b, crc.b, ccc.g, ccu.g);
 v6[0] = vec4(cld.a, clc.a, ccd.a, ccc.a);
 v6[1] = vec4(ccd.b, ccc.b, crd.b, crc.b);
 v6[2] = vec4(clc.g, clu.g, ccc.g, ccu.g);
 v6[3] = vec4(ccc.r, ccu.r, crc.r, cru.r);
 float keyv = dot(key, vec4(1.0, 2.0, 3.0, 4.0));
 gl_FragColor = vec4(keyv,  offset);
 }}}}
 float fs = 0.5*( cu[0] - cd[0] );
 float fss = cu[0] + cd[0] - cc[0] - cc[0];
 float fxs = 0.25 * (v4[0].y + v5[0].x - v4[0].x - v5[0].y);
 float fys = 0.25 * (v4[0].w + v5[0].z - v4[0].z - v5[0].w);
 vec4 A0, A1, A2 ;
 A0 = vec4(fxx[0], fxy[0], fxs, -fx[0]);
 A1 = vec4(fxy[0], fyy[0], fys, -fy[0]);
 A2 = vec4(fxs, fys, fss, -fs);
 vec3 x3 = abs(vec3(fxx[0], fxy[0], fxs));
 float maxa = max(max(x3.x, x3.y), x3.z);
 if(maxa >= 1e-10 ) 
 if(x3.y ==maxa )
 vec4 TEMP = A1; A1 = A0; A0 = TEMP;
 }else if( x3.z == maxa )
 vec4 TEMP = A2; A2 = A0; A0 = TEMP;
 A0 /= A0.x;
 A1 -= A1.x * A0;
 A2 -= A2.x * A0;
 vec2 x2 = abs(vec2(A1.y, A2.y));
 if( x2.y > x2.x )
 vec3 TEMP = A2.yzw;
 A2.yzw = A1.yzw;
 A1.yzw = TEMP;
 x2.x = x2.y;
 if(x2.x >= 1e-10) {
 A1.yzw /= A1.y;
 A2.yzw -= A2.y * A1.yzw;
 if(abs(A2.z) >= 1e-10) {
 offset.z = A2.w /A2.z;
    
 offset.y = A1.w - offset.z*A1.z;
    
 offset.x = A0.w - offset.z*A0.z - offset.y*A0.y;
 bool test = (abs(cc[0] + 0.5*dot(vec3(fx[0], fy[0], fs), offset ))>float(THRESHOLD1)) ;
 if(!test || any( greaterThan(abs(offset), vec3(1.0)))) key = vec4(0.0);
 vec3 offset = vec3(0.0, 0.0, 0.0); 
 #define TESTMOVE_KEYPOINT(idx) \
 if(key[idx] != 0.0) \
 cu[0] = cu[idx];
 cd[0] = cd[idx];
 cc[0] = cc[idx];
 v4[0] = v4[idx];
 v5[0] = v5[idx];
 fxy[0] = fxy[idx];
 fxx[0] = fxx[idx];
 fyy[0] = fyy[idx];
 fx[0] = fx[idx];
 fy[0] = fy[idx];
 MOVE_EXTRA(idx);  \
 TESTMOVE_KEYPOINT(1);
 TESTMOVE_KEYPOINT(2);
 TESTMOVE_KEYPOINT(3);
 float keyv = dot(key, vec4(1.0, 2.0, 3.0, 4.0));
 gl_FragColor =  vec4(keyv, 0.0, 0.0, 0.0);
 }}}}
 texI
 #define M_PI 3.14159265358979323846
 #define TWO_PI (2.0*M_PI)
 #define RPI 1.2732395447351626861510701069801
 #define WF size.z
 uniform sampler2DRect tex;
 uniform sampler2DRect gtex;
 uniform sampler2DRect otex;
 uniform vec4
 dsize;
 uniform vec3
 size;
 void main()
 vec2 dim
 = size.xy;
 //image size
 float index = dsize.x*floor(gl_TexCoord[0].y * 0.5) + gl_TexCoord[0].x;
 float idx = 8.0* fract(index * 0.125) + 8.0 * floor(2.0* fract(gl_TexCoord[0].y * 0.5));
 index = floor(index*0.125)+ 0.49;  
 vec2 coord = floor( vec2( mod(index, dsize.z), index*dsize.w)) + 0.5 ;
 vec2 pos = texture2DRect(tex, coord).xy;
 vec2 wsz = texture2DRect(tex, coord).zw;
   float aspect_ratio = wsz.y / wsz.x;
   float aspect_sq = aspect_ratio * aspect_ratio; 
 vec2 spt  = wsz * 0.25; vec2 ispt = 1.0 / spt; 
 vec4 temp; vec2 pt;
 pt.x = pos.x + fract(idx*0.25) * wsz.x;
 pt.y = pos.y + (floor(idx*0.25) + 0.5) * spt.y;
 vec4 sz;
 sz.xy = max(pt - spt, vec2(2,2));
 sz.zw = min(pt + spt, dim - vec2(3));
 sz = floor(sz * 0.5)+0.5;
 vec4 DA, DB;   vec2 spos;
 DA = DB  = vec4(0.0, 0.0, 0.0, 0.0);
 vec4 nox = vec4(0.0, 1.0, 0.0, 1.0);
 vec4 noy = vec4(0.0, 0.0, 1.0, 1.0);
 for(spos.y = sz.y; spos.y <= sz.w;
 spos.y+=1.0)
 for(spos.x = sz.x; spos.x <= sz.z;
 spos.x+=1.0)
 vec2 tpt = spos * 2.0 - pt - 0.5;
 vec4 nx = (tpt.x + nox) * ispt.x;
 vec4 ny = (tpt.y + noy) * ispt.y;
 vec4 nxn = abs(nx), nyn = abs(ny);
 bvec4 inside = lessThan(max(nxn, nyn) , vec4(1.0));
 if(any(inside))
 vec4 gg = texture2DRect(gtex, spos);
 vec4 oo = texture2DRect(otex, spos);
 vec4 theta0 = (- oo)*RPI;
 vec4 theta = 8.0 * fract(1.0 + 0.125 * theta0);
 vec4 theta1 = floor(theta);
 vec4 weight = (vec4(1) - nxn) * (vec4(1) - nyn) * gg; 
 vec4 weight2 = (theta - theta1) * weight;
 vec4 weight1 = weight - weight2;
 #define ADD_DESCRIPTOR(i) \
 if(inside[i])\
 DA += vec4(equal(vec4(theta1[i]), vec4(0, 1, 2, 3)))*weight1[i]; \
 DA += vec4(equal(vec4(theta1[i]), vec4(7, 0, 1, 2)))*weight2[i]; \
 DB += vec4(equal(vec4(theta1[i]), vec4(4, 5, 6, 7)))*weight1[i]; \
 DB += vec4(equal(vec4(theta1[i]), vec4(3, 4, 5, 6)))*weight2[i]; \
 REPEAT4(ADD_DESCRIPTOR);
 #define M_PI 3.14159265358979323846
 #define TWO_PI (2.0*M_PI)
 #define RPI 1.2732395447351626861510701069801
 #define WF size.z
 uniform sampler2DRect tex;
 uniform sampler2DRect gtex;
 uniform sampler2DRect otex;
 uniform vec4
 dsize;
 uniform vec3
 size;
 void main()
 vec2 dim
 = size.xy;
 //image size
 float index = dsize.x*floor(gl_TexCoord[0].y * 0.5) + gl_TexCoord[0].x;
 float idx = 8.0* fract(index * 0.125) + 8.0 * floor(2.0* fract(gl_TexCoord[0].y * 0.5));
 index = floor(index*0.125)+ 0.49;  
 vec2 coord = floor( vec2( mod(index, dsize.z), index*dsize.w)) + 0.5 ;
 vec2 pos = texture2DRect(tex, coord).xy;
 if(any(lessThan(pos.xy, vec2(1.0))) || any(greaterThan(pos.xy, dim-1.0))) 
 //discard;
 { gl_FragData[0] = gl_FragData[1] = vec4(0.0); return; }
 float anglef = texture2DRect(tex, coord).z;
 if(anglef > M_PI) anglef -= TWO_PI;
 float sigma = texture2DRect(tex, coord).w; 
 float spt  = abs(sigma * WF);
 //default to be 3*sigma
 vec4 cscs, rots;
 cscs.x = cos(anglef); cscs.y = sin(anglef);
 cscs.zw = - cscs.xy;
 rots = cscs /spt;
 cscs *= spt; 
 vec4 temp; vec2 pt, offsetpt;
 /*the fraction part of idx is .5*/
 offsetpt.x = 4.0* fract(idx*0.25) - 2.0;
 offsetpt.y = floor(idx*0.25) - 1.5;
 temp = cscs.xwyx*offsetpt.xyxy;
 pt = pos + temp.xz + temp.yw;
 vec2 bwin = abs(cscs.xy);
 float bsz = bwin.x + bwin.y;
 vec4 sz;
 sz.xy = max(pt - vec2(bsz), vec2(2,2));
 sz.zw = min(pt + vec2(bsz), dim - vec2(3));
 sz = floor(sz * 0.5)+0.5;
 vec4 DA, DB;   vec2 spos;
 DA = DB  = vec4(0.0, 0.0, 0.0, 0.0);
 vec4 nox = vec4(0.0, rots.xy, rots.x + rots.y);
 vec4 noy = vec4(0.0, rots.wx, rots.w + rots.x);
 for(spos.y = sz.y; spos.y <= sz.w;
 spos.y+=1.0)
 for(spos.x = sz.x; spos.x <= sz.z;
 spos.x+=1.0)
 vec2 tpt = spos * 2.0 - pt - 0.5;
 vec4 temp = rots.xywx * tpt.xyxy;
 vec2 temp2 = temp.xz + temp.yw;
 vec4 nx = temp2.x + nox;
 vec4 ny = temp2.y + noy;
 vec4 nxn = abs(nx), nyn = abs(ny);
 bvec4 inside = lessThan(max(nxn, nyn) , vec4(1.0));
 if(any(inside))
 vec4 gg = texture2DRect(gtex, spos);
 vec4 oo = texture2DRect(otex, spos);
 vec4 theta0 = (anglef - oo)*RPI;
 vec4 theta = 8.0 * fract(1.0 + 0.125 * theta0);
 vec4 theta1 = floor(theta);
 vec4 diffx = nx + offsetpt.x, diffy = ny + offsetpt.y;
 vec4 ww = exp(-0.125 * (diffx * diffx + diffy * diffy ));
 vec4 weight = (vec4(1) - nxn) * (vec4(1) - nyn) * gg * ww; 
 vec4 weight2 = (theta - theta1) * weight;
 vec4 weight1 = weight - weight2;
 #define ADD_DESCRIPTOR(i) \
 if(inside[i])\
 DA += vec4(equal(vec4(theta1[i]), vec4(0, 1, 2, 3)))*weight1[i]; \
 DA += vec4(equal(vec4(theta1[i]), vec4(7, 0, 1, 2)))*weight2[i]; \
 DB += vec4(equal(vec4(theta1[i]), vec4(4, 5, 6, 7)))*weight1[i]; \
 DB += vec4(equal(vec4(theta1[i]), vec4(3, 4, 5, 6)))*weight2[i]; \
 REPEAT4(ADD_DESCRIPTOR);
	__kernel void FindBestMatchOneWay_Kernel(__global int result, __global VECTOR_TYPE des1, __global VECTOR_TYPE *des2, int num1, int num2, float distmax, float ratiomax);
	__kernel void FindGuidedBestMatchOneWay_Kernel(__global int result, __global VECTOR_TYPE des1, __global VECTOR_TYPE des2, __global float loc1, __global float *loc2, int num1, int num2, int guided, // 0 = Fundamental matrix, first pass;
	__kernel void ComputeInitialCostKernel(__global float cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float depth_map, int depth_map_pitch, __global float normal_map, int normal_map_height, int normal_map_depth, int normal_map_pitch, __global float ref_sum_image, int ref_sum_pitch, __global float *ref_squared_sum_image, int ref_squared_sum_pitch, float sigma_spatial, float sigma_color, image2d_array_t ref_image, ;
	__kernel void InitRandomStateKernel(__global clrandState *output, int width, int height, uint pitch);
	__kernel void ComputeBackwardMessagesKernel(__global float global_workspace, int global_workspace_height, __global float cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float *sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, float ncc_sigma);
	__kernel void HypothesesAndProbsKernel(int row, __global float global_workspace, int global_workspace_width, int global_workspace_height, __global float hypotheses_workspace, int hypotheses_workspace_height, __global clrandState rand_state_map, __global float cost_map, int cost_map_width, int cost_map_height, int cost_map_depth, int cost_map_pitch, __global float depth_map, int depth_map_pitch, __global float normal_map, int normal_map_height, int normal_map_pitch, __global float sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, __global float prev_sel_prob_map, int prev_sel_prob_map_height, int prev_sel_prob_map_pitch, float min_triangulation_angle, float incident_angle_sigma, float ncc_sigma, float perturbation, float prev_sel_prob_weight, image2d_array_t poses_images, ;
	__kernel void PccComputeKernel(float spatial_normalization_, float color_normalization_, int row, int width, int workspaceWidth, __global float hypotheses_workspace, __global float pcc_costs, __global float ref_sum_image, int ref_sum_pitch, __global float ref_squared_sum_image, int ref_squared_sum_pitch, image2d_array_t ref_image, image2d_array_t src_images, ;
	__kernel void PccComputeKernel(float spatial_normalization_, float color_normalization_, int row, int width, int workspaceWidth, __global float hypotheses_workspace, __global float pcc_costs, __global float ref_sum_image, int ref_sum_pitch, __global float ref_squared_sum_image, int ref_squared_sum_pitch, image2d_array_t ref_image, image2d_array_t src_images, ;
	__kernel void MinimumCostKernel(int row, int width, int num_src_images, int num_hypotheses, int kGeomConsistencyTerm, __global float global_workspace, int workspaceWidth, __global float pcc_costs, __global float hypotheses_workspace, __global float cost_map, int cost_map_height, int cost_map_pitch, __global float depth_map, int depth_map_pitch, __global float normal_map, int normal_map_height, int normal_map_pitch, __global float sel_prob_map, int sel_prob_map_height, int sel_prob_map_pitch, __global float prev_sel_prob_map, int prev_sel_prob_map_height, int prev_sel_prob_map_pitch, float prev_sel_prob_weight, float ncc_sigma, float geom_consistency_regularizer, float geom_consistency_max_cost, image2d_array_t src_depth_map_images, image2d_array_t poses_images, __constant float *ref_inv_K, ifdef HYPOTHESES_STATISTICS;
	__kernel void ConsistencyFilterKernel(int width, __global float depth_map, __global float normal_map, __global float sel_prob_map, int map_pitch, int map_height, int num_src_images, int kFilterConsistencyMode, __global uchar consistency_mask, int consistency_mask_pitch, image2d_array_t src_depth_map_images, image2d_array_t poses_images, __constant float ref_inv_K, __constant float ref_K, float ncc_sigma, float filter_min_ncc, float filter_min_triangulation_angle, float geom_consistency_max_cost, float filter_geom_consistency_max_cost, int filter_min_num_consistent);
	__kernel void RotateFloatKernel(__global float input_data, int input_offset, __global float output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
	__kernel void RotateUint8Kernel(__global uchar input_data, int input_offset, __global uchar output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
	__kernel void RotateFloatKernel(__global float * input_data, int input_offset, __global float * output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
	__kernel void RotateUint8Kernel(__global uchar input_data, int input_offset, __global uchar output_data, int output_offset, const int width, const int height, const int input_pitch, const int output_pitch);
	__kernel void FillWithRandomNumbersKernel(__global float* output, __global clrandState * rand_state_map, float min_value, float max_value, int random_state_width, int random_state_height, int random_state_pitch, int output_height, int output_depth, int output_pitch) {;
	__kernel void InitNormalMapKernel(__global float* normal_map, __global clrandState * rand_state_map, int normal_map_width, int normal_map_height, ;
	__kernel void FilterKernel(image2d_array_t image_array, __global uchar image, int pitch_image, __global float sum_image, int pitch_sum_image, __global float *squared_sum_image, int pitch_squared_sum_image, int window_radius, int window_step, float sigma_spatial, float sigma_color) {;
	__kernel void RotateNormalMapKernel(__global float * normal_map, int normal_map_width, int normal_map_height, int normal_map_depth, int normal_map_pitch) {;
	__kernel void CopyFromGpuMatCLKernel(__global uchar * input_array, __write_only image2d_array_t output_array, int pitch, int width, int height) {;
No results found