Artem-B · September 20, 2021 23:51
diff --git a/__clang_cuda_texture_intrinsics.h b/__clang_cuda_texture_intrinsics.h

 namespace {

 // Tag type keyed by an integer. Only declared here; it is used purely as a
 // template argument (e.g. __Tag<__nvvm_texture_op("...")>) to select a
 // __tex_fetch_v4 specialisation.
 template <int __N> struct __Tag;
 # 54 "__clang_cuda_texture_intrinsics.h" 3
 // Type-traits template mapping an element type to its companion vector type.
 // Declared first so the explicit specialisations can precede the generic
 // definition.
 template <class __ElemT> struct __FT;

 // Traits for float: the base type is float and the paired 4-component
 // vector type is float4.
 template <>
 struct __FT<float> {
  typedef float __bt;
  typedef float4 __ft;
 };
 // Traits for plain char: base type char, paired with the signed vector int4.
 template <>
 struct __FT<char> {
  typedef char __bt;
  typedef int4 __ft;
 };
 // Traits for signed char: base type signed char, paired with int4.
 template <>
 struct __FT<signed char> {
  typedef signed char __bt;
  typedef int4 __ft;
 };
 // Traits for unsigned char: base type unsigned char, paired with uint4.
 template <>
 struct __FT<unsigned char> {
  typedef unsigned char __bt;
  typedef uint4 __ft;
 };
 // Traits for short: base type short, paired with int4.
 template <>
 struct __FT<short> {
  typedef short __bt;
  typedef int4 __ft;
 };
 // Traits for unsigned short: base type unsigned short, paired with uint4.
 // The type is spelled out instead of using the `ushort` typedef: `ushort` is
 // a non-standard libc typedef that is not available on every host toolchain,
 // and where it exists it names this same type, so the specialisation is
 // unchanged for existing users.
 template <>
 struct __FT<unsigned short> {
  using __bt = unsigned short;
  using __ft = uint4;
 };
 // Traits for int: base type int, paired with int4.
 template <>
 struct __FT<int> {
  typedef int __bt;
  typedef int4 __ft;
 };
 // Traits for unsigned int: base type unsigned int, paired with uint4.
 // The type is spelled out instead of using the `uint` typedef: `uint` is a
 // non-standard libc typedef that is not available on every host toolchain,
 // and where it exists it names this same type, so the specialisation is
 // unchanged for existing users.
 template <>
 struct __FT<unsigned int> {
  using __bt = unsigned int;
  using __ft = uint4;
 };

 // Generic case, used for any type without an explicit specialisation. The
 // type must have a member named `x`: the base type is the type of that
 // member, and the vector type is whatever __FT reports for that base type.
 template <class __T>
 struct __FT {
  typedef decltype(__T::x) __bt;
  typedef typename __FT<__bt>::__ft __ft;
 };

 // Primary template for the per-operation fetch implementations. It is only
 // declared; every usable operation is provided as an explicit specialisation
 // keyed on a __Tag<...> type.
 template <class __OpTag> struct __tex_fetch_v4;
 // Declared-but-undefined specialisation for tag value -1, so any attempt to
 // use it is an incomplete-type error.
 // NOTE(review): presumably -1 is what __nvvm_texture_op yields for an
 // unrecognised operation name - confirm.
 template <> struct __tex_fetch_v4<__Tag<-1>>;
 # 192 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementation for the "__tex1D_v2" operation.
 // __run<T>(obj, x) is declared for every T but defined only for int4, uint4
 // and float4. Each definition issues one `tex.1d.v4.<s32|u32|f32>.f32`
 // instruction on the texture object and float coordinate and returns the
 // four result registers as a vector.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1D_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj,
                                           float __x) {
    int4 __out;
    asm("tex.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x) {
    uint4 __out;
    asm("tex.1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x) {
    float4 __out;
    asm("tex.1d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "f"(__x));
    return __out;
  }
 };
 // Fetch implementation for the "__tex1D_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4) only
 // selects the .s32 or .u32 form of the `tex.1d.v4` instruction. The float4
 // members are bound directly to "r" output registers.
 // NOTE(review): "rmnf" presumably stands for read-mode normalized float,
 // i.e. integer texels delivered as floats - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1D_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x) {
    float4 __out;
    asm("tex.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x) {
    float4 __out;
    asm("tex.1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x));
    return __out;
  }
 };

 // Fetch implementation for the "__tex1Dfetch_v2" operation.
 // Same shape as the 1D fetch but the coordinate is an int, so the
 // instruction is `tex.1d.v4.<s32|u32|f32>.s32` and the coordinate is passed
 // in an "r" register. Defined for int4, uint4 and float4 results.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1Dfetch_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               int __x);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, int __x) {
    int4 __out;
    asm("tex.1d.v4.s32.s32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__x));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             int __x) {
    uint4 __out;
    asm("tex.1d.v4.u32.s32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__x));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               int __x) {
    float4 __out;
    asm("tex.1d.v4.f32.s32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "r"(__x));
    return __out;
  }
 };
 // Fetch implementation for the "__tex1Dfetch_rmnf_v2" operation.
 // __run takes an int coordinate and always returns float4; the template
 // argument (int4 or uint4) selects the .s32 or .u32 instruction form. The
 // float4 members are bound directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1Dfetch_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              int __x);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             int __x) {
    float4 __out;
    asm("tex.1d.v4.s32.s32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__x));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              int __x) {
    float4 __out;
    asm("tex.1d.v4.u32.s32\t{%0, %1, %2, %3}, [%4, {%5}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__x));
    return __out;
  }
 };

 // "__itex1D" adds nothing of its own: it is an empty struct that publicly
 // inherits the "__tex1D_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex1D")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1D_v2")>> {
 };
 // "__itex1Dfetch" adds nothing of its own: it is an empty struct that
 // publicly inherits the "__tex1Dfetch_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex1Dfetch")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1Dfetch_v2")>> {
 };

 // Fetch implementation for the "__tex1DGrad_v2" operation.
 // __run<T>(obj, x, dPdx, dPdy) is defined for int4, uint4 and float4. Each
 // definition issues one `tex.grad.1d.v4.<s32|u32|f32>.f32` instruction with
 // the coordinate and the two scalar gradients as float operands.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DGrad_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, float __dPdx,
                                               float __dPdy);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __dPdx, float __dPdy) {
    int4 __out;
    asm("tex.grad.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __dPdx,
                                             float __dPdy) {
    uint4 __out;
    asm("tex.grad.1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __dPdx,
                                               float __dPdy) {
    float4 __out;
    asm("tex.grad.1d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
 };
 // Fetch implementation for the "__tex1DGrad_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4)
 // selects the .s32 or .u32 form of `tex.grad.1d.v4`. The float4 members are
 // bound directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DGrad_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __dPdx,
                                              float __dPdy);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __dPdx,
                                             float __dPdy) {
    float4 __out;
    asm("tex.grad.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __dPdx,
                                              float __dPdy) {
    float4 __out;
    asm("tex.grad.1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
 };

 // "__itex1DGrad" adds nothing of its own: it is an empty struct that
 // publicly inherits the "__tex1DGrad_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex1DGrad")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DGrad_v2")>> {
 };

 // Fetch implementation for the "__tex1DLayered_v2" operation.
 // __run<T>(obj, x, layer) is defined for int4, uint4 and float4 and issues
 // one `tex.a1d.v4.<s32|u32|f32>.f32` instruction. Note the operand order in
 // the asm: the integer layer is operand %5 and the float coordinate is %6,
 // the reverse of the C++ parameter order.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayered_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, int __layer);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           int __layer) {
    int4 __out;
    asm("tex.a1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, int __layer) {
    uint4 __out;
    asm("tex.a1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, int __layer) {
    float4 __out;
    asm("tex.a1d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x));
    return __out;
  }
 };
 // Fetch implementation for the "__tex1DLayered_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4)
 // selects the .s32 or .u32 form of `tex.a1d.v4`. The layer is passed before
 // the coordinate in the asm operand list, and the float4 members are bound
 // directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayered_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, int __layer);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, int __layer) {
    float4 __out;
    asm("tex.a1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, int __layer) {
    float4 __out;
    asm("tex.a1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x));
    return __out;
  }
 };

 // "__itex1DLayered" adds nothing of its own: it is an empty struct that
 // publicly inherits the "__tex1DLayered_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex1DLayered")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayered_v2")>> {
 };

 // Fetch implementation for the "__tex1DLayeredGrad_v2" operation.
 // __run<T>(obj, x, layer, dPdx, dPdy) is defined for int4, uint4 and float4
 // and issues one `tex.grad.a1d.v4.<s32|u32|f32>.f32` instruction. In the asm
 // operand list the integer layer (%5) precedes the float coordinate (%6);
 // the two scalar gradients follow as %7 and %8.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayeredGrad_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, int __layer,
                                               float __dPdx, float __dPdy);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           int __layer, float __dPdx,
                                           float __dPdy) {
    int4 __out;
    asm("tex.grad.a1d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, int __layer,
                                             float __dPdx, float __dPdy) {
    uint4 __out;
    asm("tex.grad.a1d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, int __layer,
                                               float __dPdx, float __dPdy) {
    float4 __out;
    asm("tex.grad.a1d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
 };
 // Fetch implementation for the "__tex1DLayeredGrad_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4)
 // selects the .s32 or .u32 form of `tex.grad.a1d.v4`. The layer precedes the
 // coordinate in the asm operand list, and the float4 members are bound
 // directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayeredGrad_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, int __layer,
                                              float __dPdx, float __dPdy);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, int __layer,
                                             float __dPdx, float __dPdy) {
    float4 __out;
    asm("tex.grad.a1d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, int __layer,
                                              float __dPdx, float __dPdy) {
    float4 __out;
    asm("tex.grad.a1d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy));
    return __out;
  }
 };

 // "__itex1DLayeredGrad" adds nothing of its own: it is an empty struct that
 // publicly inherits the "__tex1DLayeredGrad_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex1DLayeredGrad")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayeredGrad_v2")>> {
 };

 // Fetch implementation for the "__tex1DLayeredLod_v2" operation.
 // __run<T>(obj, x, layer, level) is defined for int4, uint4 and float4 and
 // issues one `tex.level.a1d.v4.<s32|u32|f32>.f32` instruction. In the asm
 // operand list the integer layer (%5) precedes the float coordinate (%6);
 // the float level is the trailing scalar operand %7.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayeredLod_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, int __layer,
                                               float __level);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           int __layer, float __level) {
    int4 __out;
    asm("tex.level.a1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__level));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, int __layer,
                                             float __level) {
    uint4 __out;
    asm("tex.level.a1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__level));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, int __layer,
                                               float __level) {
    float4 __out;
    asm("tex.level.a1d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__level));
    return __out;
  }
 };
 // Fetch implementation for the "__tex1DLayeredLod_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4)
 // selects the .s32 or .u32 form of `tex.level.a1d.v4`. The layer precedes
 // the coordinate in the asm operand list, and the float4 members are bound
 // directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayeredLod_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, int __layer,
                                              float __level);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, int __layer,
                                             float __level) {
    float4 __out;
    asm("tex.level.a1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__level));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, int __layer,
                                              float __level) {
    float4 __out;
    asm("tex.level.a1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__level));
    return __out;
  }
 };

 // "__itex1DLayeredLod" adds nothing of its own: it is an empty struct that
 // publicly inherits the "__tex1DLayeredLod_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex1DLayeredLod")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLayeredLod_v2")>> {
 };

 // Fetch implementation for the "__tex1DLod_v2" operation.
 // __run<T>(obj, x, level) is defined for int4, uint4 and float4 and issues
 // one `tex.level.1d.v4.<s32|u32|f32>.f32` instruction; the float level is
 // the trailing scalar operand %6.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLod_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, float __level);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __level) {
    int4 __out;
    asm("tex.level.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}], %6;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__level));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __level) {
    uint4 __out;
    asm("tex.level.1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5}], %6;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__level));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __level) {
    float4 __out;
    asm("tex.level.1d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5}], %6;"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__level));
    return __out;
  }
 };
 // Fetch implementation for the "__tex1DLod_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4)
 // selects the .s32 or .u32 form of `tex.level.1d.v4`. The float4 members are
 // bound directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLod_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __level);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __level) {
    float4 __out;
    asm("tex.level.1d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5}], %6;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__level));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __level) {
    float4 __out;
    asm("tex.level.1d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5}], %6;"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__level));
    return __out;
  }
 };

 // "__itex1DLod" adds nothing of its own: it is an empty struct that publicly
 // inherits the "__tex1DLod_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex1DLod")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex1DLod_v2")>> {
 };

 // Fetch implementation for the "__tex2D_v2" operation.
 // __run<T>(obj, x, y) is defined for int4, uint4 and float4 and issues one
 // `tex.2d.v4.<s32|u32|f32>.f32` instruction with both coordinates passed as
 // float operands.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2D_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, float __y);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y) {
    int4 __out;
    asm("tex.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y) {
    uint4 __out;
    asm("tex.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y) {
    float4 __out;
    asm("tex.2d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
 };
 // Fetch implementation for the "__tex2D_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4)
 // selects the .s32 or .u32 form of `tex.2d.v4`. The float4 members are bound
 // directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2D_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y) {
    float4 __out;
    asm("tex.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y) {
    float4 __out;
    asm("tex.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
 };

 // "__itex2D" adds nothing of its own: it is an empty struct that publicly
 // inherits the "__tex2D_v2" implementation.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2D")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2D_v2")>> {
 };
 // Fetch implementation for the "__itex2D_sparse" operation.
 // __run<T>(obj, x, y, ir) is defined for int4, uint4 and float4. The asm
 // block declares a scoped predicate register %p0, issues a `tex.2d.v4` fetch
 // whose destination is `{...}|%p0`, then uses `selp.u16` to turn the
 // predicate into 1 or 0. That value is output operand %4, written through
 // the __ir pointer from a 16-bit ("h") register; the texture object and
 // coordinates are therefore operands %5..%7.
 // NOTE(review): the predicate is presumably the residency flag of the sparse
 // fetch - confirm against the PTX ISA.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2D_sparse")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               unsigned char* __ir);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, unsigned char* __ir) {
    int4 __out;
    asm("{.reg .pred %%p0;\n\t"
        "tex.2d.v4.s32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        " selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y,
                                             unsigned char* __ir) {
    uint4 __out;
    asm("{.reg .pred %%p0;\n\t"
        "tex.2d.v4.u32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        " selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               unsigned char* __ir) {
    float4 __out;
    asm("{.reg .pred %%p0;\n\t"
        "tex.2d.v4.f32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        " selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __out;
  }
 };

 // Fetch implementation for the "__tex2DGrad_v2" operation.
 // __run<T>(obj, x, y, dPdx, dPdy) is defined for int4, uint4 and float4 and
 // issues one `tex.grad.2d.v4.<s32|u32|f32>.f32` instruction. The two
 // gradients arrive as pointers to float2; their x and y members are read
 // and passed as four separate float operands (%7..%10).
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DGrad_v2")>> {
  template <class __DataT>
  __attribute__((device)) static __DataT __run(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               const float2* __dPdx,
                                               const float2* __dPdy);
  // Signed 32-bit result.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, const float2* __dPdx,
                                           const float2* __dPdy) {
    int4 __out;
    asm("tex.grad.2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __out;
  }
  // Unsigned 32-bit result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y,
                                             const float2* __dPdx,
                                             const float2* __dPdy) {
    uint4 __out;
    asm("tex.grad.2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __out;
  }
  // 32-bit float result.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               const float2* __dPdx,
                                               const float2* __dPdy) {
    float4 __out;
    asm("tex.grad.2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};"
        : "=f"(__out.x), "=f"(__out.y), "=f"(__out.z), "=f"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __out;
  }
 };
 // Fetch implementation for the "__tex2DGrad_rmnf_v2" operation.
 // __run always returns float4; the template argument (int4 or uint4)
 // selects the .s32 or .u32 form of `tex.grad.2d.v4`. The float2 gradients
 // are read through their pointers and passed member by member, and the
 // float4 members are bound directly to "r" output registers.
 // NOTE(review): "rmnf" presumably means read-mode normalized float - confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DGrad_rmnf_v2")>> {
  template <class __DataT>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y,
                                              const float2* __dPdx,
                                              const float2* __dPdy);
  // Signed-integer texel data.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y,
                                             const float2* __dPdx,
                                             const float2* __dPdy) {
    float4 __out;
    asm("tex.grad.2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __out;
  }
  // Unsigned-integer texel data.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y,
                                              const float2* __dPdx,
                                              const float2* __dPdy) {
    float4 __out;
    asm("tex.grad.2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};"
        : "=r"(__out.x), "=r"(__out.y), "=r"(__out.z), "=r"(__out.w)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __out;
  }
 };

 // "__itex2DGrad_v2" adds nothing of its own: it inherits every __run
 // overload from the "__tex2DGrad_v2" specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DGrad_v2")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DGrad_v2")>> {};
 // Fetch implementations for the "__itex2DGrad_sparse" texture operation.
 //
 // Each body is one brace-scoped asm statement that declares a private
 // predicate %p0, lets tex.grad.2d.v4 write it next to the result vector, and
 // then uses selp.u16 to store 1 in *__ir when %p0 is set and 0 otherwise.
 // *__ir is bound with the 16-bit "h" constraint to match selp.u16.
 // NOTE(review): per the PTX ISA the predicate presumably reports whether the
 // addressed texels were resident -- confirm.
 //
 // Operand layout shared by all bodies:
 //   %0-%3    result components x, y, z, w
 //   %4       *__ir
 //   %5       texture object handle
 //   %6,%7    x and y
 //   %8,%9    the two floats read through __dPdx
 //   %10,%11  the two floats read through __dPdy
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DGrad_sparse")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir) {
    int4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir) {
    uint4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir) {
    float4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }
 };
 # 259 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementations for the "__tex2DLayered_v2" texture operation.
 //
 // Each body issues a single tex.a2d.v4 instruction with f32 coordinates and
 // returns its four result components. Operand layout shared by all bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     texture object handle
 //   %5     __layer (integer register)
 //   %6,%7  x and y; %7 is repeated to fill the fourth vector slot
 // NOTE(review): the repeated %7 looks like padding for the four-element
 // coordinate vector -- confirm against the PTX ISA.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLayered_v2")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer) {
    int4 __ret;
    asm("tex.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer) {
    uint4 __ret;
    asm("tex.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer) {
    float4 __ret;
    asm("tex.a2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }
 };
 // Fetch implementations for the "__tex2DLayered_rmnf_v2" texture operation.
 //
 // Every overload returns float4. The template argument only selects whether
 // the tex.a2d.v4 instruction is typed .s32 (int4) or .u32 (uint4); the four
 // 32-bit register outputs ("r") are stored straight into the float fields of
 // the result, with no conversion performed by this code.
 // NOTE(review): "rmnf" presumably means "read mode: normalized float", i.e.
 // the registers already hold float bit patterns -- confirm.
 //
 // Operand layout shared by both bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     texture object handle
 //   %5     __layer (integer register)
 //   %6,%7  x and y; %7 is repeated to fill the fourth vector slot
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLayered_rmnf_v2")>> {
  // Declaration only; the bodies are the explicit specializations below. The
  // template parameter carries a reserved-style name (__T), like the other
  // templates in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static float4 __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer);

  // Signed-integer texel data (.s32).
  template <>
  __attribute__((device)) float4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer) {
    float4 __ret;
    asm("tex.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }

  // Unsigned-integer texel data (.u32).
  template <>
  __attribute__((device)) float4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer) {
    float4 __ret;
    asm("tex.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }
 };

 // "__itex2DLayered" adds nothing of its own: it inherits every __run
 // overload from the "__tex2DLayered_v2" specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLayered")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLayered_v2")>> {};
 // Fetch implementations for the "__itex2DLayered_sparse" texture operation.
 //
 // Each body is one brace-scoped asm statement that declares a private
 // predicate %p0, lets tex.a2d.v4 write it next to the result vector, and
 // then uses selp.u16 to store 1 in *__ir when %p0 is set and 0 otherwise.
 // *__ir is bound with the 16-bit "h" constraint to match selp.u16.
 // NOTE(review): per the PTX ISA the predicate presumably reports whether the
 // addressed texels were resident -- confirm.
 //
 // Operand layout shared by all bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     *__ir
 //   %5     texture object handle
 //   %6     __layer (integer register)
 //   %7,%8  x and y; %8 is repeated to fill the fourth vector slot
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLayered_sparse")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      unsigned char* __ir);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      unsigned char* __ir) {
    int4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      unsigned char* __ir) {
    uint4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      unsigned char* __ir) {
    float4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.a2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y));
    return __ret;
  }
 };
 # 273 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementations for the "__tex2DLayeredGrad_v2" texture operation.
 //
 // Each body issues a single tex.grad.a2d.v4 instruction with f32
 // coordinates. Operand layout shared by all bodies:
 //   %0-%3    result components x, y, z, w
 //   %4       texture object handle
 //   %5       __layer (integer register)
 //   %6,%7    x and y; %7 is repeated to fill the fourth vector slot
 //   %8,%9    the two floats read through __dPdx
 //   %10,%11  the two floats read through __dPdy
 // NOTE(review): the repeated %7 looks like padding for the four-element
 // coordinate vector -- confirm against the PTX ISA.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLayeredGrad_v2")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy) {
    int4 __ret;
    asm("tex.grad.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy) {
    uint4 __ret;
    asm("tex.grad.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy) {
    float4 __ret;
    asm("tex.grad.a2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }
 };
 // Fetch implementations for the "__tex2DLayeredGrad_rmnf_v2" texture
 // operation.
 //
 // Every overload returns float4. The template argument only selects whether
 // the tex.grad.a2d.v4 instruction is typed .s32 (int4) or .u32 (uint4); the
 // four 32-bit register outputs ("r") are stored straight into the float
 // fields of the result, with no conversion performed by this code.
 // NOTE(review): "rmnf" presumably means "read mode: normalized float", i.e.
 // the registers already hold float bit patterns -- confirm.
 //
 // Operand layout shared by both bodies:
 //   %0-%3    result components x, y, z, w
 //   %4       texture object handle
 //   %5       __layer (integer register)
 //   %6,%7    x and y; %7 is repeated to fill the fourth vector slot
 //   %8,%9    the two floats read through __dPdx
 //   %10,%11  the two floats read through __dPdy
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLayeredGrad_rmnf_v2")>> {
  // Declaration only; the bodies are the explicit specializations below. The
  // template parameter carries a reserved-style name (__T), like the other
  // templates in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static float4 __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy);

  // Signed-integer texel data (.s32).
  template <>
  __attribute__((device)) float4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy) {
    float4 __ret;
    asm("tex.grad.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }

  // Unsigned-integer texel data (.u32).
  template <>
  __attribute__((device)) float4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy) {
    float4 __ret;
    asm("tex.grad.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }
 };

 // "__itex2DLayeredGrad_v2" adds nothing of its own: it inherits every __run
 // overload from the "__tex2DLayeredGrad_v2" specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLayeredGrad_v2")>>
     : public __tex_fetch_v4<
           __Tag<__nvvm_texture_op("__tex2DLayeredGrad_v2")>> {};
 // Fetch implementations for the "__itex2DLayeredGrad_sparse" texture
 // operation.
 //
 // Each body is one brace-scoped asm statement that declares a private
 // predicate %p0, lets tex.grad.a2d.v4 write it next to the result vector,
 // and then uses selp.u16 to store 1 in *__ir when %p0 is set and 0
 // otherwise. *__ir is bound with the 16-bit "h" constraint to match
 // selp.u16.
 // NOTE(review): per the PTX ISA the predicate presumably reports whether the
 // addressed texels were resident -- confirm.
 //
 // Operand layout shared by all bodies:
 //   %0-%3    result components x, y, z, w
 //   %4       *__ir
 //   %5       texture object handle
 //   %6       __layer (integer register)
 //   %7,%8    x and y; %8 is repeated to fill the fourth vector slot
 //   %9,%10   the two floats read through __dPdx
 //   %11,%12  the two floats read through __dPdy
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLayeredGrad_sparse")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir) {
    int4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, "
        "{%11, %12};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir) {
    uint4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, "
        "{%11, %12};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      const float2* __dPdx, const float2* __dPdy, unsigned char* __ir) {
    float4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.a2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, "
        "{%11, %12};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y),
          "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y));
    return __ret;
  }
 };
 # 293 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementations for the "__tex2DLayeredLod_v2" texture operation.
 //
 // Each body issues a single tex.level.a2d.v4 instruction with f32
 // coordinates. Operand layout shared by all bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     texture object handle
 //   %5     __layer (integer register)
 //   %6,%7  x and y; %7 is repeated to fill the fourth vector slot
 //   %8     __level
 // NOTE(review): the repeated %7 looks like padding for the four-element
 // coordinate vector -- confirm against the PTX ISA.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLayeredLod_v2")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level) {
    int4 __ret;
    asm("tex.level.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level) {
    uint4 __ret;
    asm("tex.level.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level) {
    float4 __ret;
    asm("tex.level.a2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }
 };
 // Fetch implementations for the "__tex2DLayeredLod_rmnf_v2" texture
 // operation.
 //
 // Every overload returns float4. The template argument only selects whether
 // the tex.level.a2d.v4 instruction is typed .s32 (int4) or .u32 (uint4); the
 // four 32-bit register outputs ("r") are stored straight into the float
 // fields of the result, with no conversion performed by this code.
 // NOTE(review): "rmnf" presumably means "read mode: normalized float", i.e.
 // the registers already hold float bit patterns -- confirm.
 //
 // Operand layout shared by both bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     texture object handle
 //   %5     __layer (integer register)
 //   %6,%7  x and y; %7 is repeated to fill the fourth vector slot
 //   %8     __level
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLayeredLod_rmnf_v2")>> {
  // Declaration only; the bodies are the explicit specializations below. The
  // template parameter carries a reserved-style name (__T), like the other
  // templates in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static float4 __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level);

  // Signed-integer texel data (.s32).
  template <>
  __attribute__((device)) float4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level) {
    float4 __ret;
    asm("tex.level.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // Unsigned-integer texel data (.u32).
  template <>
  __attribute__((device)) float4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level) {
    float4 __ret;
    asm("tex.level.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }
 };

 // "__itex2DLayeredLod" adds nothing of its own: it inherits every __run
 // overload from the "__tex2DLayeredLod_v2" specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLayeredLod")>>
     : public __tex_fetch_v4<
           __Tag<__nvvm_texture_op("__tex2DLayeredLod_v2")>> {};
 // Fetch implementations for the "__itex2DLayeredLod_sparse" texture
 // operation.
 //
 // Each body is one brace-scoped asm statement that declares a private
 // predicate %p0, lets tex.level.a2d.v4 write it next to the result vector,
 // and then uses selp.u16 to store 1 in *__ir when %p0 is set and 0
 // otherwise. *__ir is bound with the 16-bit "h" constraint to match
 // selp.u16.
 // NOTE(review): per the PTX ISA the predicate presumably reports whether the
 // addressed texels were resident -- confirm.
 //
 // Operand layout shared by all bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     *__ir
 //   %5     texture object handle
 //   %6     __layer (integer register)
 //   %7,%8  x and y; %8 is repeated to fill the fourth vector slot
 //   %9     __level
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLayeredLod_sparse")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level, unsigned char* __ir);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level, unsigned char* __ir) {
    int4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.a2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level, unsigned char* __ir) {
    uint4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.a2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, int __layer,
      float __level, unsigned char* __ir) {
    float4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.a2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }
 };
 # 308 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementations for the "__tex2DLod_v2" texture operation.
 //
 // Each body issues a single tex.level.2d.v4 instruction with f32
 // coordinates. Operand layout shared by all bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     texture object handle
 //   %5,%6  x and y
 //   %7     __level
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLod_v2")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, float __level);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level) {
    int4 __ret;
    asm("tex.level.2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level) {
    uint4 __ret;
    asm("tex.level.2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level) {
    float4 __ret;
    asm("tex.level.2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }
 };
 // Fetch implementations for the "__tex2DLod_rmnf_v2" texture operation.
 //
 // Every overload returns float4. The template argument only selects whether
 // the tex.level.2d.v4 instruction is typed .s32 (int4) or .u32 (uint4); the
 // four 32-bit register outputs ("r") are stored straight into the float
 // fields of the result, with no conversion performed by this code.
 // NOTE(review): "rmnf" presumably means "read mode: normalized float", i.e.
 // the registers already hold float bit patterns -- confirm.
 //
 // Operand layout shared by both bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     texture object handle
 //   %5,%6  x and y
 //   %7     __level
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLod_rmnf_v2")>> {
  // Declaration only; the bodies are the explicit specializations below. The
  // template parameter carries a reserved-style name (__T), like the other
  // templates in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static float4 __run(
      cudaTextureObject_t __obj, float __x, float __y, float __level);

  // Signed-integer texel data (.s32).
  template <>
  __attribute__((device)) float4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level) {
    float4 __ret;
    asm("tex.level.2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // Unsigned-integer texel data (.u32).
  template <>
  __attribute__((device)) float4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level) {
    float4 __ret;
    asm("tex.level.2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }
 };

 // "__itex2DLod" adds nothing of its own: it inherits every __run overload
 // from the "__tex2DLod_v2" specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLod")>>
     : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2DLod_v2")>> {};
 // Fetch implementations for the "__itex2DLod_sparse" texture operation.
 //
 // Each body is one brace-scoped asm statement that declares a private
 // predicate %p0, lets tex.level.2d.v4 write it next to the result vector,
 // and then uses selp.u16 to store 1 in *__ir when %p0 is set and 0
 // otherwise. *__ir is bound with the 16-bit "h" constraint to match
 // selp.u16.
 // NOTE(review): per the PTX ISA the predicate presumably reports whether the
 // addressed texels were resident -- confirm.
 //
 // Operand layout shared by all bodies:
 //   %0-%3  result components x, y, z, w
 //   %4     *__ir
 //   %5     texture object handle
 //   %6,%7  x and y
 //   %8     __level
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2DLod_sparse")>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, float __level,
      unsigned char* __ir);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level,
      unsigned char* __ir) {
    int4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level,
      unsigned char* __ir) {
    uint4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, float __level,
      unsigned char* __ir) {
    float4 __ret;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__level));
    return __ret;
  }
 };
 # 339 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementations for "__tex2Dgather_v2" under the tag value
 // 10000 + op * 100 + 0.
 //
 // Each body issues a single tld4.r.2d.v4 instruction with f32 coordinates.
 // The ".r" selector is fixed in the opcode; __comp is accepted but never
 // referenced by any body.
 // NOTE(review): the trailing "+ 0" in the tag presumably encodes the
 // component index that callers dispatch on -- confirm against the caller.
 //
 // Operand layout shared by all bodies:
 //   %0-%3  the four values written by tld4
 //   %4     texture object handle
 //   %5,%6  x and y
 template <>
 struct __tex_fetch_v4<
     __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 0>> {
  // Declaration only; the result type doubles as the template argument. The
  // parameter carries a reserved-style name (__T), like the other templates
  // in this header, so that no user-defined macro can rewrite it.
  template <class __T>
  __attribute__((device)) static __T __run(
      cudaTextureObject_t __obj, float __x, float __y, int __comp);

  // Signed 32-bit results.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, int __comp) {
    int4 __ret;
    asm("tld4.r.2d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __ret;
  }

  // Unsigned 32-bit results.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, int __comp) {
    uint4 __ret;
    asm("tld4.r.2d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__ret.x), "=r"(__ret.y), "=r"(__ret.z), "=r"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __ret;
  }

  // 32-bit float results (float registers, "f").
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, int __comp) {
    float4 __ret;
    asm("tld4.r.2d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __ret;
  }
 };
 // "__tex2Dgather_rmnf_v2", tag suffix 0 (`tld4.r.2d.v4`).
 // The return type is always float4; the template argument (int4 / uint4)
 // only picks the .s32 or .u32 instruction form. The results are bound to the
 // float fields of the float4 through the "r" register constraint.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 + 0>> {
  // Declaration only; specialized below for int4 and uint4.
  template <class __T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, int __comp);

  // Selected with int4: .s32 instruction form.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, int __comp) {
    float4 __res;
    asm("tld4.r.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // Selected with uint4: .u32 instruction form.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y,
                                              int __comp) {
    float4 __res;
    asm("tld4.r.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 // "__itex2Dgather" (suffix 0) defines nothing of its own: it inherits the
 // "__tex2Dgather_v2" (suffix 0) fetchers.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather") * 100 + 0>>
    : public __tex_fetch_v4<
          __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 0>> {
 };
 // "__itex2Dgather_sparse", tag suffix 0 (`tld4.r.2d.v4` with a predicate).
 // Each overload declares a scratch predicate %%p0 that the fetch writes, then
 // uses selp to turn it into 1 or 0 and stores that through __ir.
 // NOTE(review): the predicate presumably reports texel residency - confirm
 // against the PTX ISA. __comp is not an asm operand.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 + 0>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y,
                                           unsigned char* __ir, int __comp);

  // .s32 form -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, unsigned char* __ir,
                                           int __comp) {
    int4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.r.2d.v4.s32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y,
                                             unsigned char* __ir, int __comp) {
    uint4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.r.2d.v4.u32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .f32 form -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               unsigned char* __ir,
                                               int __comp) {
    float4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.r.2d.v4.f32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 ;
 // "__tex2Dgather_v2", tag suffix 1: every overload issues `tld4.g.2d.v4`.
 // __comp is part of the signature but is not an operand of the asm; the
 // component is fixed by the `.g` in the instruction name.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 1>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y, int __comp);

  // .s32 form, four 32-bit integer registers -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, int __comp) {
    int4 __res;
    asm("tld4.g.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, int __comp) {
    uint4 __res;
    asm("tld4.g.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .f32 form, four float registers -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               int __comp) {
    float4 __res;
    asm("tld4.g.2d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 // "__tex2Dgather_rmnf_v2", tag suffix 1 (`tld4.g.2d.v4`).
 // The return type is always float4; the template argument (int4 / uint4)
 // only picks the .s32 or .u32 instruction form. The results are bound to the
 // float fields of the float4 through the "r" register constraint.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 + 1>> {
  // Declaration only; specialized below for int4 and uint4.
  template <class __T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, int __comp);

  // Selected with int4: .s32 instruction form.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, int __comp) {
    float4 __res;
    asm("tld4.g.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // Selected with uint4: .u32 instruction form.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y,
                                              int __comp) {
    float4 __res;
    asm("tld4.g.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 // "__itex2Dgather" (suffix 1) defines nothing of its own: it inherits the
 // "__tex2Dgather_v2" (suffix 1) fetchers.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather") * 100 + 1>>
    : public __tex_fetch_v4<
          __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 1>> {
 };
 // "__itex2Dgather_sparse", tag suffix 1 (`tld4.g.2d.v4` with a predicate).
 // Each overload declares a scratch predicate %%p0 that the fetch writes, then
 // uses selp to turn it into 1 or 0 and stores that through __ir.
 // NOTE(review): the predicate presumably reports texel residency - confirm
 // against the PTX ISA. __comp is not an asm operand.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 + 1>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y,
                                           unsigned char* __ir, int __comp);

  // .s32 form -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, unsigned char* __ir,
                                           int __comp) {
    int4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.g.2d.v4.s32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y,
                                             unsigned char* __ir, int __comp) {
    uint4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.g.2d.v4.u32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .f32 form -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               unsigned char* __ir,
                                               int __comp) {
    float4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.g.2d.v4.f32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 ;
 // "__tex2Dgather_v2", tag suffix 2: every overload issues `tld4.b.2d.v4`.
 // __comp is part of the signature but is not an operand of the asm; the
 // component is fixed by the `.b` in the instruction name.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 2>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y, int __comp);

  // .s32 form, four 32-bit integer registers -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, int __comp) {
    int4 __res;
    asm("tld4.b.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, int __comp) {
    uint4 __res;
    asm("tld4.b.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .f32 form, four float registers -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               int __comp) {
    float4 __res;
    asm("tld4.b.2d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 // "__tex2Dgather_rmnf_v2", tag suffix 2 (`tld4.b.2d.v4`).
 // The return type is always float4; the template argument (int4 / uint4)
 // only picks the .s32 or .u32 instruction form. The results are bound to the
 // float fields of the float4 through the "r" register constraint.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 + 2>> {
  // Declaration only; specialized below for int4 and uint4.
  template <class __T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, int __comp);

  // Selected with int4: .s32 instruction form.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, int __comp) {
    float4 __res;
    asm("tld4.b.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // Selected with uint4: .u32 instruction form.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y,
                                              int __comp) {
    float4 __res;
    asm("tld4.b.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 // "__itex2Dgather" (suffix 2) defines nothing of its own: it inherits the
 // "__tex2Dgather_v2" (suffix 2) fetchers.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather") * 100 + 2>>
    : public __tex_fetch_v4<
          __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 2>> {
 };
 // "__itex2Dgather_sparse", tag suffix 2 (`tld4.b.2d.v4` with a predicate).
 // Each overload declares a scratch predicate %%p0 that the fetch writes, then
 // uses selp to turn it into 1 or 0 and stores that through __ir.
 // NOTE(review): the predicate presumably reports texel residency - confirm
 // against the PTX ISA. __comp is not an asm operand.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 + 2>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y,
                                           unsigned char* __ir, int __comp);

  // .s32 form -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, unsigned char* __ir,
                                           int __comp) {
    int4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.b.2d.v4.s32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y,
                                             unsigned char* __ir, int __comp) {
    uint4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.b.2d.v4.u32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .f32 form -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               unsigned char* __ir,
                                               int __comp) {
    float4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.b.2d.v4.f32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 ;
 // "__tex2Dgather_v2", tag suffix 3: every overload issues `tld4.a.2d.v4`.
 // __comp is part of the signature but is not an operand of the asm; the
 // component is fixed by the `.a` in the instruction name.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 3>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y, int __comp);

  // .s32 form, four 32-bit integer registers -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, int __comp) {
    int4 __res;
    asm("tld4.a.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, int __comp) {
    uint4 __res;
    asm("tld4.a.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .f32 form, four float registers -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               int __comp) {
    float4 __res;
    asm("tld4.a.2d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 // "__tex2Dgather_rmnf_v2", tag suffix 3 (`tld4.a.2d.v4`).
 // The return type is always float4; the template argument (int4 / uint4)
 // only picks the .s32 or .u32 instruction form. The results are bound to the
 // float fields of the float4 through the "r" register constraint.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 + 3>> {
  // Declaration only; specialized below for int4 and uint4.
  template <class __T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, int __comp);

  // Selected with int4: .s32 instruction form.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, int __comp) {
    float4 __res;
    asm("tld4.a.2d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // Selected with uint4: .u32 instruction form.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y,
                                              int __comp) {
    float4 __res;
    asm("tld4.a.2d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 // "__itex2Dgather" (suffix 3) defines nothing of its own: it inherits the
 // "__tex2Dgather_v2" (suffix 3) fetchers.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather") * 100 + 3>>
    : public __tex_fetch_v4<
          __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 + 3>> {
 };
 // "__itex2Dgather_sparse", tag suffix 3 (`tld4.a.2d.v4` with a predicate).
 // Each overload declares a scratch predicate %%p0 that the fetch writes, then
 // uses selp to turn it into 1 or 0 and stores that through __ir.
 // NOTE(review): the predicate presumably reports texel residency - confirm
 // against the PTX ISA. __comp is not an asm operand.
 template <>
 struct __tex_fetch_v4<
    __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 + 3>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y,
                                           unsigned char* __ir, int __comp);

  // .s32 form -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, unsigned char* __ir,
                                           int __comp) {
    int4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.a.2d.v4.s32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y,
                                             unsigned char* __ir, int __comp) {
    uint4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.a.2d.v4.u32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }

  // .f32 form -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               unsigned char* __ir,
                                               int __comp) {
    float4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tld4.a.2d.v4.f32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y));
    return __res;
  }
 };
 ;

 // Entry point for "__tex2Dgather_v2": picks the per-component fetcher
 // (tag suffix 0..3) that matches the run-time value of __comp.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2Dgather_v2")>> {
  // __T    - result vector type; forwarded as the template argument of the
  //          per-component __run.
  // __obj  - texture object handle.
  // __x/__y - coordinates, forwarded unchanged.
  // __comp - component index; must be 0, 1, 2 or 3.
  // Returns whatever the selected per-component fetcher returns.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj, float __x,
                                           float __y, int __comp) {
    switch (__comp) {
      case 0:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 +
                  0>>::__run<__T>(__obj, __x, __y, __comp);
      case 1:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 +
                  1>>::__run<__T>(__obj, __x, __y, __comp);
      case 2:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 +
                  2>>::__run<__T>(__obj, __x, __y, __comp);
      case 3:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_v2") * 100 +
                  3>>::__run<__T>(__obj, __x, __y, __comp);
      default:
        // No fetcher exists for any other index. Without this case control
        // would flow off the end of a value-returning function, which is
        // undefined behaviour; fail deterministically instead.
        __builtin_trap();
    }
  }
 };
 // "__itex2Dgather" reuses the "__tex2Dgather_v2" entry point unchanged.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2Dgather")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2Dgather_v2")>> {
 };
 // Entry point for "__tex2Dgather_rmnf_v2": picks the per-component fetcher
 // (tag suffix 0..3) that matches the run-time value of __comp.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex2Dgather_rmnf_v2")>> {
  // __T    - forwarded as the template argument of the per-component __run;
  //          the return type is float4 regardless.
  // __obj  - texture object handle.
  // __x/__y - coordinates, forwarded unchanged.
  // __comp - component index; must be 0, 1, 2 or 3.
  // Returns the float4 produced by the selected per-component fetcher.
  template <class __T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y,
                                              int __comp) {
    switch (__comp) {
      case 0:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 +
                  0>>::__run<__T>(__obj, __x, __y, __comp);
      case 1:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 +
                  1>>::__run<__T>(__obj, __x, __y, __comp);
      case 2:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 +
                  2>>::__run<__T>(__obj, __x, __y, __comp);
      case 3:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__tex2Dgather_rmnf_v2") * 100 +
                  3>>::__run<__T>(__obj, __x, __y, __comp);
      default:
        // No fetcher exists for any other index. Without this case control
        // would flow off the end of a value-returning function, which is
        // undefined behaviour; fail deterministically instead.
        __builtin_trap();
    }
  }
 };
 // Entry point for "__itex2Dgather_sparse": picks the per-component fetcher
 // (tag suffix 0..3) that matches the run-time value of __comp.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex2Dgather_sparse")>> {
  // __T    - result vector type; forwarded as the template argument of the
  //          per-component __run.
  // __obj  - texture object handle.
  // __x/__y - coordinates, forwarded unchanged.
  // __ir   - output pointer handed to the per-component fetcher, which
  //          stores a 1/0 flag through it.
  // __comp - component index; must be 0, 1, 2 or 3.
  // Returns whatever the selected per-component fetcher returns.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj, float __x,
                                           float __y, unsigned char* __ir,
                                           int __comp) {
    switch (__comp) {
      case 0:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 +
                  0>>::__run<__T>(__obj, __x, __y, __ir, __comp);
      case 1:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 +
                  1>>::__run<__T>(__obj, __x, __y, __ir, __comp);
      case 2:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 +
                  2>>::__run<__T>(__obj, __x, __y, __ir, __comp);
      case 3:
        return __tex_fetch_v4<
            __Tag<10000 + __nvvm_texture_op("__itex2Dgather_sparse") * 100 +
                  3>>::__run<__T>(__obj, __x, __y, __ir, __comp);
      default:
        // No fetcher exists for any other index. Without this case control
        // would flow off the end of a value-returning function, which is
        // undefined behaviour; fail deterministically instead. *__ir is
        // left untouched on this path.
        __builtin_trap();
    }
  }
 };

 // "__tex3D_v2": `tex.3d.v4` fetch. The coordinate vector in the asm has four
 // slots, so __z is supplied twice ({%5, %6, %7, %7}).
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3D_v2")>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y, float __z);

  // .s32 form -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z) {
    int4 __res;
    asm("tex.3d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z) {
    uint4 __res;
    asm("tex.3d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }

  // .f32 form -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               float __z) {
    float4 __res;
    asm("tex.3d.v4.f32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }
 };
 // "__tex3D_rmnf_v2": `tex.3d.v4` fetch that always returns float4. The
 // template argument (int4 / uint4) only picks the .s32 or .u32 instruction
 // form; results are bound to the float fields through the "r" constraint.
 // __z is supplied twice to fill the four-slot coordinate vector.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3D_rmnf_v2")>> {
  // Declaration only; specialized below for int4 and uint4.
  template <class __T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z);

  // Selected with int4: .s32 instruction form.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z) {
    float4 __res;
    asm("tex.3d.v4.s32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }

  // Selected with uint4: .u32 instruction form.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z) {
    float4 __res;
    asm("tex.3d.v4.u32.f32\t{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }
 };

 // "__itex3D" reuses the "__tex3D_v2" fetchers unchanged.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex3D")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3D_v2")>> {
 };
 // "__itex3D_sparse": `tex.3d.v4` fetch with a predicate result.
 // Each overload declares a scratch predicate %%p0 that the fetch writes, then
 // uses selp to turn it into 1 or 0 and stores that through __ir.
 // NOTE(review): the predicate presumably reports texel residency - confirm
 // against the PTX ISA. __z is supplied twice ({%6, %7, %8, %8}).
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex3D_sparse")>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y, float __z,
                                           unsigned char* __ir);

  // .s32 form -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z,
                                           unsigned char* __ir) {
    int4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tex.3d.v4.s32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             unsigned char* __ir) {
    uint4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tex.3d.v4.u32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }

  // .f32 form -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               unsigned char* __ir) {
    float4 __res;
    asm("{.reg .pred %%p0;\n\t"
        "tex.3d.v4.f32.f32\t{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __res;
  }
 };
 # 420 "__clang_cuda_texture_intrinsics.h" 3
 // "__tex3DGrad_v2": `tex.grad.3d.v4` fetch with explicit gradients.
 // Only the x, y and z fields of *__dPdx and *__dPdy are read; w is never
 // used. Every vector in the asm has four slots, so the third element is
 // repeated: {x, y, z, z} for the coordinates and for both gradients.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3DGrad_v2")>> {
  // Declaration only; the int4 / uint4 / float4 specializations follow.
  template <class __T>
  __attribute__((device)) static __T __run(cudaTextureObject_t __obj,
                                           float __x, float __y, float __z,
                                           const float4* __dPdx,
                                           const float4* __dPdy);

  // .s32 form -> int4.
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z,
                                           const float4* __dPdx,
                                           const float4* __dPdy) {
    int4 __res;
    asm("tex.grad.3d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __res;
  }

  // .u32 form -> uint4.
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             const float4* __dPdx,
                                             const float4* __dPdy) {
    uint4 __res;
    asm("tex.grad.3d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__res.x), "=r"(__res.y), "=r"(__res.z), "=r"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __res;
  }

  // .f32 form -> float4.
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               const float4* __dPdx,
                                               const float4* __dPdy) {
    float4 __res;
    asm("tex.grad.3d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=f"(__res.x), "=f"(__res.y), "=f"(__res.z), "=f"(__res.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __res;
  }
 };
 // Fetch implementation selected by the "__tex3DGrad_rmnf_v2" operation tag.
 // T (int4 or uint4) only chooses the .s32/.u32 instruction type of
 // `tex.grad.3d.v4`; the result is always handed back as float4, with the
 // four outputs bound to its members through "r" constraints.
 // NOTE(review): "rmnf" presumably stands for "read mode: normalized float" —
 // confirm against the header's macro definitions.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3DGrad_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              const float4* __dPdx,
                                              const float4* __dPdy);

  // T = int4: .s32 instruction type.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             const float4* __dPdx,
                                             const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.3d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = uint4: .u32 instruction type.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              const float4* __dPdx,
                                              const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.3d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }
 };

 // The "__itex3DGrad_v2" tag adds nothing of its own: it derives from the
 // "__tex3DGrad_v2" specialization and reuses its __run members as-is.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex3DGrad_v2")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3DGrad_v2")>> {};
 // Fetch implementation selected by the "__itex3DGrad_sparse" operation tag.
 // Same `tex.grad.3d.v4` fetch as the non-sparse form, wrapped in a PTX block
 // that declares a predicate register, receives it as the instruction's
 // second destination, and uses selp.u16 to store 1 or 0 through __ir.
 // Because *__ir is output operand %4, all input operand numbers start at %5.
 // NOTE(review): the flag is presumably the texel-residency result — confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex3DGrad_sparse")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z,
                                         const float4* __dPdx,
                                         const float4* __dPdy,
                                         unsigned char* __ir);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z,
                                           const float4* __dPdx,
                                           const float4* __dPdy,
                                           unsigned char* __ir) {
    int4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.3d.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             const float4* __dPdx,
                                             const float4* __dPdy,
                                             unsigned char* __ir) {
    uint4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.3d.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               const float4* __dPdx,
                                               const float4* __dPdy,
                                               unsigned char* __ir) {
    float4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.grad.3d.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }
 };
 # 441 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementation selected by the "__tex3DLod_v2" operation tag.
 // Every __run<T> specialization issues one `tex.level.3d.v4` instruction on
 // the (x, y, z) coordinate (z repeated in the fourth slot) with __level as
 // the trailing operand. T picks the result vector type together with the
 // matching .s32/.u32/.f32 instruction type.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3DLod_v2")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z, float __level);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z,
                                           float __level) {
    int4 __texel;
    asm("tex.level.3d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             float __level) {
    uint4 __texel;
    asm("tex.level.3d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               float __level) {
    float4 __texel;
    asm("tex.level.3d.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }
 };
 // Fetch implementation selected by the "__tex3DLod_rmnf_v2" operation tag.
 // T (int4 or uint4) only chooses the .s32/.u32 instruction type of
 // `tex.level.3d.v4`; the result is always handed back as float4, with the
 // four outputs bound to its members through "r" constraints.
 // NOTE(review): "rmnf" presumably stands for "read mode: normalized float" —
 // confirm against the header's macro definitions.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3DLod_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              float __level);

  // T = int4: .s32 instruction type.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             float __level) {
    float4 __texel;
    asm("tex.level.3d.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // T = uint4: .u32 instruction type.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              float __level) {
    float4 __texel;
    asm("tex.level.3d.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }
 };

 // The "__itex3DLod" tag adds nothing of its own: it derives from the
 // "__tex3DLod_v2" specialization and reuses its __run members as-is.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex3DLod")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__tex3DLod_v2")>> {};
 // Fetch implementation selected by the "__itex3DLod_sparse" operation tag.
 // Same `tex.level.3d.v4` fetch as the non-sparse form, wrapped in a PTX block
 // that declares a predicate register, receives it as the instruction's
 // second destination, and uses selp.u16 to store 1 or 0 through __ir.
 // Because *__ir is output operand %4, all input operand numbers start at %5.
 // NOTE(review): the flag is presumably the texel-residency result — confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itex3DLod_sparse")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z, float __level,
                                         unsigned char* __ir);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z, float __level,
                                           unsigned char* __ir) {
    int4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.3d.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             float __level,
                                             unsigned char* __ir) {
    uint4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.3d.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               float __level,
                                               unsigned char* __ir) {
    float4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.level.3d.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }
 };
 # 457 "__clang_cuda_texture_intrinsics.h" 3
 // Fetch implementation selected by the "__texCubemap_v2" operation tag.
 // Every __run<T> specialization issues one `tex.cube.v4` instruction on the
 // (x, y, z) coordinate, with z repeated in the fourth coordinate slot.
 // T picks the result vector type together with the matching
 // .s32/.u32/.f32 instruction type.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemap_v2")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z) {
    int4 __texel;
    asm("tex.cube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z) {
    uint4 __texel;
    asm("tex.cube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y,
                                               float __z) {
    float4 __texel;
    asm("tex.cube.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }
 };
 // Fetch implementation selected by the "__texCubemap_rmnf_v2" operation tag.
 // T (int4 or uint4) only chooses the .s32/.u32 instruction type of
 // `tex.cube.v4`; the result is always handed back as float4, with the four
 // outputs bound to its members through "r" constraints.
 // NOTE(review): "rmnf" presumably stands for "read mode: normalized float" —
 // confirm against the header's macro definitions.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemap_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z);

  // T = int4: .s32 instruction type.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z) {
    float4 __texel;
    asm("tex.cube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = uint4: .u32 instruction type.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z) {
    float4 __texel;
    asm("tex.cube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }
 };

 // The "__itexCubemap" tag adds nothing of its own: it derives from the
 // "__texCubemap_v2" specialization and reuses its __run members as-is.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itexCubemap")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemap_v2")>> {};
 // Fetch implementation selected by the "__itexCubemap_sparse" operation tag.
 // Same `tex.cube.v4` fetch as the non-sparse form, wrapped in a PTX block
 // that declares a predicate register, receives it as the instruction's
 // second destination, and uses selp.u16 to store 1 or 0 through __ir.
 // Because *__ir is output operand %4, all input operand numbers start at %5.
 // NOTE(review): the flag is presumably the texel-residency result — confirm.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itexCubemap_sparse")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z,
                                         unsigned char* __ir);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z,
                                           unsigned char* __ir) {
    int4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.cube.v4.s32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             unsigned char* __ir) {
    uint4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.cube.v4.u32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               unsigned char* __ir) {
    float4 __texel;
    asm("{.reg .pred %%p0;\n\t"
        "tex.cube.v4.f32.f32\t"
        "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
        "selp.u16 %4, 1, 0, %%p0; }"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w),
          "=h"(*__ir)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }
 };

 // Fetch implementation selected by the "__texCubemapGrad_v2" operation tag.
 // Every __run<T> specialization issues one `tex.grad.cube.v4` instruction
 // whose inputs are the texture object, the (x, y, z) coordinate and the
 // x/y/z members of *__dPdx and *__dPdy; the fourth slot of each operand
 // vector repeats its third operand. T picks the result vector type together
 // with the matching .s32/.u32/.f32 instruction type.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapGrad_v2")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z,
                                         const float4* __dPdx,
                                         const float4* __dPdy);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z,
                                           const float4* __dPdx,
                                           const float4* __dPdy) {
    int4 __texel;
    asm("tex.grad.cube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             const float4* __dPdx,
                                             const float4* __dPdy) {
    uint4 __texel;
    asm("tex.grad.cube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               const float4* __dPdx,
                                               const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.cube.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }
 };
 // Fetch implementation selected by the "__texCubemapGrad_rmnf_v2" tag.
 // T (int4 or uint4) only chooses the .s32/.u32 instruction type of
 // `tex.grad.cube.v4`; the result is always handed back as float4, with the
 // four outputs bound to its members through "r" constraints.
 // NOTE(review): "rmnf" presumably stands for "read mode: normalized float" —
 // confirm against the header's macro definitions.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapGrad_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              const float4* __dPdx,
                                              const float4* __dPdy);

  // T = int4: .s32 instruction type.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             const float4* __dPdx,
                                             const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.cube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = uint4: .u32 instruction type.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              const float4* __dPdx,
                                              const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.cube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
        "{%8, %9, %10, %10}, {%11, %12, %13, %13};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }
 };

 // The "__itexCubemapGrad_v2" tag adds nothing of its own: it derives from the
 // "__texCubemapGrad_v2" specialization and reuses its __run members as-is.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itexCubemapGrad_v2")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapGrad_v2")>> {};
 // Fetch implementation selected by the "__texCubemapLayered_v2" tag.
 // Every __run<T> specialization issues one `tex.acube.v4` instruction whose
 // coordinate vector is {__layer, __x, __y, __z}: the integer layer is the
 // first input after the texture object, ahead of the float coordinates.
 // T picks the result vector type together with the matching
 // .s32/.u32/.f32 instruction type.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLayered_v2")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z, int __layer);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z, int __layer) {
    int4 __texel;
    asm("tex.acube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             int __layer) {
    uint4 __texel;
    asm("tex.acube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               int __layer) {
    float4 __texel;
    asm("tex.acube.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }
 };
 // Fetch implementation selected by the "__texCubemapLayered_rmnf_v2" tag.
 // T (int4 or uint4) only chooses the .s32/.u32 instruction type of
 // `tex.acube.v4`; the result is always handed back as float4, with the four
 // outputs bound to its members through "r" constraints. The coordinate
 // vector is {__layer, __x, __y, __z}.
 // NOTE(review): "rmnf" presumably stands for "read mode: normalized float" —
 // confirm against the header's macro definitions.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLayered_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              int __layer);

  // T = int4: .s32 instruction type.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             int __layer) {
    float4 __texel;
    asm("tex.acube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }

  // T = uint4: .u32 instruction type.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              int __layer) {
    float4 __texel;
    asm("tex.acube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z));
    return __texel;
  }
 };

 // The "__itexCubemapLayered" tag adds nothing of its own: it derives from the
 // "__texCubemapLayered_v2" specialization and reuses its __run members as-is.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itexCubemapLayered")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLayered_v2")>> {
 };
 // Fetch implementation selected by the "__texCubemapLayeredGrad_v2" tag.
 // Every __run<T> specialization issues one `tex.grad.acube.v4` instruction.
 // The coordinate vector is {__layer, __x, __y, __z}; the two trailing
 // vectors carry the x/y/z members of *__dPdx and *__dPdy, each with its
 // third operand repeated in the fourth slot. T picks the result vector type
 // together with the matching .s32/.u32/.f32 instruction type.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLayeredGrad_v2")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z, int __layer,
                                         const float4* __dPdx,
                                         const float4* __dPdy);

  // T = int4: .s32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) int4 __run<int4>(cudaTextureObject_t __obj, float __x,
                                           float __y, float __z, int __layer,
                                           const float4* __dPdx,
                                           const float4* __dPdy) {
    int4 __texel;
    asm("tex.grad.acube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = uint4: .u32 instruction type, outputs bound through "r".
  template <>
  __attribute__((device)) uint4 __run<uint4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             int __layer, const float4* __dPdx,
                                             const float4* __dPdy) {
    uint4 __texel;
    asm("tex.grad.acube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = float4: .f32 instruction type, outputs bound through "f".
  template <>
  __attribute__((device)) float4 __run<float4>(cudaTextureObject_t __obj,
                                               float __x, float __y, float __z,
                                               int __layer,
                                               const float4* __dPdx,
                                               const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.acube.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }
 };
 // Fetch implementation selected by the "__texCubemapLayeredGrad_rmnf_v2" tag.
 // T (int4 or uint4) only chooses the .s32/.u32 instruction type of
 // `tex.grad.acube.v4`; the result is always handed back as float4, with the
 // four outputs bound to its members through "r" constraints. The coordinate
 // vector is {__layer, __x, __y, __z}.
 // NOTE(review): "rmnf" presumably stands for "read mode: normalized float" —
 // confirm against the header's macro definitions.
 template <>
 struct __tex_fetch_v4<
    __Tag<__nvvm_texture_op("__texCubemapLayeredGrad_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              int __layer, const float4* __dPdx,
                                              const float4* __dPdy);

  // T = int4: .s32 instruction type.
  template <>
  __attribute__((device)) float4 __run<int4>(cudaTextureObject_t __obj,
                                             float __x, float __y, float __z,
                                             int __layer, const float4* __dPdx,
                                             const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.acube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }

  // T = uint4: .u32 instruction type.
  template <>
  __attribute__((device)) float4 __run<uint4>(cudaTextureObject_t __obj,
                                              float __x, float __y, float __z,
                                              int __layer, const float4* __dPdx,
                                              const float4* __dPdy) {
    float4 __texel;
    asm("tex.grad.acube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
        "{%9, %10, %11, %11}, {%12, %13, %14, %14};"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdx->z),
          "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z));
    return __texel;
  }
 };
 # 493 "__clang_cuda_texture_intrinsics.h" 3
 // The "__itexCubemapLayeredGrad_v2" op tag adds nothing of its own: it
 // inherits every __run variant from the "__texCubemapLayeredGrad_v2"
 // specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itexCubemapLayeredGrad_v2")>>
    : public __tex_fetch_v4<
          __Tag<__nvvm_texture_op("__texCubemapLayeredGrad_v2")>> {
 };
 // Fetch implementations selected by the "__texCubemapLayeredLod_v2" op tag.
 // Each specialization issues tex.level.acube with the result type matching
 // its return type (int4 -> .s32, uint4 -> .u32, float4 -> .f32).
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLayeredLod_v2")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z, int __layer,
                                         float __level);

  // Operand map used by all three specializations:
  //   %0-%3  result components x, y, z, w
  //   %4     texture object handle
  //   %5     __layer
  //   %6-%8  __x, __y, __z
  //   %9     __level

  // Signed-integer result.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z, int __layer,
      float __level) {
    int4 __texel;
    asm("tex.level.acube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__level));
    return __texel;
  }

  // Unsigned-integer result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z, int __layer,
      float __level) {
    uint4 __texel;
    asm("tex.level.acube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__level));
    return __texel;
  }

  // Floating-point result.
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z, int __layer,
      float __level) {
    float4 __texel;
    asm("tex.level.acube.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__level));
    return __texel;
  }
 };
 // Fetch implementations selected by the "__texCubemapLayeredLod_rmnf_v2" op
 // tag. Every variant returns float4; the template argument (int4 / uint4)
 // only selects the .s32 or .u32 form of tex.level.acube. The four "=r"
 // outputs are written directly into the float4 fields.
 // NOTE(review): "rmnf" presumably stands for "read mode normalized float" --
 // confirm against the callers that pick this tag.
 template <>
 struct __tex_fetch_v4<
    __Tag<__nvvm_texture_op("__texCubemapLayeredLod_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(
      cudaTextureObject_t __obj, float __x, float __y, float __z, int __layer,
      float __level);

  // Operand map used by both specializations:
  //   %0-%3  result components x, y, z, w
  //   %4     texture object handle
  //   %5     __layer
  //   %6-%8  __x, __y, __z
  //   %9     __level

  // Signed-integer (.s32) instruction variant.
  template <>
  __attribute__((device)) float4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z, int __layer,
      float __level) {
    float4 __texel;
    asm("tex.level.acube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__level));
    return __texel;
  }

  // Unsigned-integer (.u32) instruction variant.
  template <>
  __attribute__((device)) float4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z, int __layer,
      float __level) {
    float4 __texel;
    asm("tex.level.acube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "r"(__layer), "f"(__x), "f"(__y), "f"(__z),
          "f"(__level));
    return __texel;
  }
 };

 // The "__itexCubemapLayeredLod" op tag adds nothing of its own: it inherits
 // every __run variant from the "__texCubemapLayeredLod_v2" specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itexCubemapLayeredLod")>>
    : public __tex_fetch_v4<
          __Tag<__nvvm_texture_op("__texCubemapLayeredLod_v2")>> {
 };
 // Fetch implementations selected by the "__texCubemapLod_v2" op tag. Each
 // specialization issues tex.level.cube with the result type matching its
 // return type (int4 -> .s32, uint4 -> .u32, float4 -> .f32).
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLod_v2")>> {
  template <class T>
  __attribute__((device)) static T __run(cudaTextureObject_t __obj, float __x,
                                         float __y, float __z, float __level);

  // Operand map used by all three specializations:
  //   %0-%3  result components x, y, z, w
  //   %4     texture object handle
  //   %5-%7  __x, __y, __z   (%7 is repeated in the fourth coordinate slot)
  //   %8     __level

  // Signed-integer result.
  template <>
  __attribute__((device)) int4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z,
      float __level) {
    int4 __texel;
    asm("tex.level.cube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // Unsigned-integer result.
  template <>
  __attribute__((device)) uint4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z,
      float __level) {
    uint4 __texel;
    asm("tex.level.cube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // Floating-point result.
  template <>
  __attribute__((device)) float4 __run<float4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z,
      float __level) {
    float4 __texel;
    asm("tex.level.cube.v4.f32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=f"(__texel.x), "=f"(__texel.y), "=f"(__texel.z), "=f"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }
 };
 // Fetch implementations selected by the "__texCubemapLod_rmnf_v2" op tag.
 // Every variant returns float4; the template argument (int4 / uint4) only
 // selects the .s32 or .u32 form of tex.level.cube. The four "=r" outputs are
 // written directly into the float4 fields.
 // NOTE(review): "rmnf" presumably stands for "read mode normalized float" --
 // confirm against the callers that pick this tag.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLod_rmnf_v2")>> {
  template <class T>
  __attribute__((device)) static float4 __run(
      cudaTextureObject_t __obj, float __x, float __y, float __z,
      float __level);

  // Operand map used by both specializations:
  //   %0-%3  result components x, y, z, w
  //   %4     texture object handle
  //   %5-%7  __x, __y, __z   (%7 is repeated in the fourth coordinate slot)
  //   %8     __level

  // Signed-integer (.s32) instruction variant.
  template <>
  __attribute__((device)) float4 __run<int4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z,
      float __level) {
    float4 __texel;
    asm("tex.level.cube.v4.s32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }

  // Unsigned-integer (.u32) instruction variant.
  template <>
  __attribute__((device)) float4 __run<uint4>(
      cudaTextureObject_t __obj, float __x, float __y, float __z,
      float __level) {
    float4 __texel;
    asm("tex.level.cube.v4.u32.f32\t"
        "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;"
        : "=r"(__texel.x), "=r"(__texel.y), "=r"(__texel.z), "=r"(__texel.w)
        : "l"(__obj), "f"(__x), "f"(__y), "f"(__z), "f"(__level));
    return __texel;
  }
 };

 // The "__itexCubemapLod" op tag adds nothing of its own: it inherits every
 // __run variant from the "__texCubemapLod_v2" specialization.
 template <>
 struct __tex_fetch_v4<__Tag<__nvvm_texture_op("__itexCubemapLod")>>
    : public __tex_fetch_v4<__Tag<__nvvm_texture_op("__texCubemapLod_v2")>> {
 };

 // Converts a fetched value of type __SrcT into the caller-visible type
 // __DestT.
 //
 // __run() picks one of two strategies at compile time:
 //   * __SrcT is implicitly convertible to __DestT: the value is returned
 //     as-is and the implicit conversion does the work.
 //   * otherwise: __DestT is brace-initialized from the leading __N members
 //     (.x, .y, .z, .w) of the source, where __N is sizeof(__DestT) divided by
 //     the size of its base element type (__FT<__DestT>::__bt).
 // Only __N in 1..4 is covered for the non-convertible case; any other
 // combination falls back to the primary template, whose `return __v;` then
 // fails to compile.
 template <class __DestT, class __SrcT>
 struct __convert {
  // std::is_convertible takes <From, To>. The conversion performed by
  // `return __v;` goes from __SrcT to __DestT, so that is the direction that
  // has to be tested.
  template <bool __IsConvertible = std::is_convertible<__SrcT, __DestT>::value,
            int __N = sizeof(__DestT) / sizeof(typename __FT<__DestT>::__bt)>
  __attribute__((device)) static __DestT __run(__SrcT __v) {
    return __v;
  }
  // One-element destination: keep only .x.
  template <>
  __attribute__((device)) static __DestT __run<false, 1>(__SrcT __v) {
    return {__v.x};
  }
  // Two-element destination: keep .x and .y.
  template <>
  __attribute__((device)) static __DestT __run<false, 2>(__SrcT __v) {
    return {__v.x, __v.y};
  }
  // Three-element destination: keep .x, .y and .z.
  template <>
  __attribute__((device)) static __DestT __run<false, 3>(__SrcT __v) {
    return {__v.x, __v.y, __v.z};
  }
  // Four-element destination: member-wise copy of all four components.
  template <>
  __attribute__((device)) static __DestT __run<false, 4>(__SrcT __v) {
    return {__v.x, __v.y, __v.z, __v.w};
  }
 };

 template <class __op, class __T, class... __Args>
 __attribute__((device)) static void __tex_fetch(__T* __ptr,
                                                cudaTextureObject_t __handle,
                                                __Args... __args) {
  using __FT = typename __FT<__T>::__ft;
  *__ptr = __convert<__T, __FT>::__run(
      __tex_fetch_v4<__op>::template __run<__FT>(__handle, __args...));
 }

 // Produces a cudaTextureObject_t from an arbitrary handle value by passing
 // it through a PTX `mov.b64`; both operands use the "l" constraint.
 // NOTE(review): presumably used because the handle type has no C++-level
 // conversion to cudaTextureObject_t -- confirm.
 template <class __T>
 __attribute__((device)) cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __result;
  asm("mov.b64 %0, %1; " : "=l"(__result) : "l"(__handle));
  return __result;
 }

 template <class __op, class __T, class __HandleT, class... __Args>
 __attribute__((device)) static void __tex_fetch(__T* __ptr, __HandleT __handle,
                                                __Args... __args) {
  using __FT = typename __FT<__T>::__ft;
  *__ptr =
      __convert<__T, __FT>::__run(__tex_fetch_v4<__op>::template __run<__FT>(
          __tex_handle_to_obj(__handle), __args...));
 }

 template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
 __attribute__((device)) static void __tex_fetch(
    __DataT*, __RetT* __ptr,
    texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
    __Args... __args) {
  using __FT = typename __FT<__DataT>::__ft;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FT>(__tex_handle_to_obj(__handle),
                                                 __args...));
 }

 template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
 __attribute__((device)) static void __tex_fetch(
    __DataT*, __RetT* __ptr,
    texture<__DataT, __TexT, cudaReadModeElementType> __handle,
    __Args... __args) {
  using __FT = typename __FT<__DataT>::__ft;
  *__ptr =
      __convert<__RetT, __FT>::__run(__tex_fetch_v4<__op>::template __run<__FT>(
          __tex_handle_to_obj(__handle), __args...));
 }
 }  // namespace