inferrna · November 29, 2016 13:14
diff --git a/test-device.cl b/test-device.cl
 struct class_Eigen__array {
    int f0[1];
 };
 struct Eigen__internal__scalar_sqrt_op {
    char f0;
 };
 struct Eigen__DSizes {
    struct class_Eigen__array f0;
 };
 struct Eigen__TensorEvaluator_3 {
    global float* f0;
    struct Eigen__DSizes f1;
    global struct Eigen__GpuDevice* f2;
    global struct class_Eigen__TensorMap_4* f3;
 };
 struct class_Eigen__TensorMap_4 {
    global float* f0;
    struct Eigen__DSizes f1;
    char f2[4];
 };
 struct Eigen__TensorEvaluator_0 {
    global float* f0;
    struct Eigen__DSizes f1;
    global struct Eigen__GpuDevice* f2;
    global struct class_Eigen__TensorMap* f3;
 };
 struct Eigen__TensorEvaluator_2 {
    struct Eigen__internal__scalar_sqrt_op f0;
    struct Eigen__TensorEvaluator_3 f1;
 };
 struct Eigen__GpuDevice {
    global struct class_Eigen__StreamInterface* f0;
    int f1;
    char f2[4];
 };
 struct class_Eigen__TensorMap {
    global float* f0;
    struct Eigen__DSizes f1;
    char f2[4];
 };
 struct class_Eigen__StreamInterface {
 };
 struct Eigen__TensorEvaluator {
    struct Eigen__TensorEvaluator_0 f0;
    struct Eigen__TensorEvaluator_2 f1;
 };
 struct class_Eigen__array_nopointers {
    int f0[1];
 };
 struct Eigen__DSizes_nopointers {
    struct class_Eigen__array_nopointers f0;
 };
 struct Eigen__TensorEvaluator_0_nopointers {
    struct Eigen__DSizes_nopointers f0;
 };
 struct Eigen__internal__scalar_sqrt_op_nopointers {
    char f0;
 };
 struct Eigen__TensorEvaluator_3_nopointers {
    struct Eigen__DSizes_nopointers f0;
 };
 struct Eigen__TensorEvaluator_2_nopointers {
    struct Eigen__internal__scalar_sqrt_op_nopointers f0;
    struct Eigen__TensorEvaluator_3_nopointers f1;
 };
 struct Eigen__TensorEvaluator_nopointers {
    struct Eigen__TensorEvaluator_0_nopointers f0;
    struct Eigen__TensorEvaluator_2_nopointers f1;
 };

 float _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch);
 float _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch);
 float _ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf(struct Eigen__internal__scalar_sqrt_op* this, float* a, local int *scratch);
 float4 _ZN5Eigen8internal5psqrtI6float4EET_RKS3_(float4* a, local int *scratch);
 float4 _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch);
 float4 _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch);
 float4 _ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_(struct Eigen__internal__scalar_sqrt_op* this, float4* a, local int *scratch);
 global float* _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi(struct Eigen__TensorEvaluator_0* this, int index, local int *scratch);
 kernel void _ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_(global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, uint eval_ptr0_offset, global float* eval_ptr1, uint eval_ptr1_offset, int size, local int *scratch);
 void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch);
 void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch);
 void _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4(struct Eigen__TensorEvaluator_0* this, int index, float4* x, local int *scratch);
 void _ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0__gp(global float* to, float4* from, local int *scratch);

 kernel void _ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_(global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, uint eval_ptr0_offset, global float* eval_ptr1, uint eval_ptr1_offset, int size, local int *scratch) {
    eval_ptr1 += eval_ptr1_offset;
    eval_ptr0 += eval_ptr0_offset;
 struct Eigen__TensorEvaluator eval[1];
 eval[0].f0.f0 = 0;
 eval[0].f0.f1.f0.f0[0] = eval_nopointers[0].f0.f0.f0.f0[0];
 eval[0].f0.f2 = 0;
 eval[0].f0.f3 = 0;
 eval[0].f1.f0.f0 = eval_nopointers[0].f1.f0.f0;
 eval[0].f1.f1.f0 = 0;
 eval[0].f1.f1.f1.f0.f0[0] = eval_nopointers[0].f1.f1.f0.f0.f0[0];
 eval[0].f1.f1.f2 = 0;
 eval[0].f1.f1.f3 = 0;
 eval[0].f0.f0 = eval_ptr0;
 eval[0].f1.f1.f0 = eval_ptr1;

    char v20[1];
    int v10[1];
    int v11[1];
    int v12[1];
    int v13[1];
    int v14[1];
    int v15[1];
    int v16[1];
    int v17[1];
    int v18[1];
    int v19[1];
    int v22;
    int v23;
    int v25;
    int v26;
    int v28;
    int v29;
    int v30;
    int v34;
    int v35;
    int v36;
    int v43;
    int v45;
    int v47;
    int v48;
    int v50;
    int v51;
    int v53;
    int v54;
    int v57;
    int v59;
    int v60;
    int v61;
    int v63;
    int v64;
    int v65;
    int v67;
    int v68;
    int v71;
    int v73;
    int v74;
    int v75;
    int v9[1];
    struct Eigen__TensorEvaluator* v56;
    struct Eigen__TensorEvaluator* v70;
    struct Eigen__TensorEvaluator* v8[1];

 v1:;
    v17[0] = size;
    v22 = get_group_id(0);
    v23 = get_local_size(0);
    v25 = get_local_id(0);
    v26 = (v22 * v23) + v25;
    v18[0] = v26;
    v28 = get_local_size(0);
    v29 = get_num_groups(0);
    v30 = v28 * v29;
    v19[0] = v30;
    v20[0] = 1;
    v34 = v18[0];
    v35 = v17[0];
    v36 = v19[0];
    v8[0] = eval;
    v9[0] = v34;
    v10[0] = v35;
    v11[0] = v36;
    v12[0] = 4;
    v43 = v10[0];
    v45 = (v43 / 4) * 4;
    v13[0] = v45;
    v47 = v11[0];
    v48 = v47 * 4;
    v14[0] = v48;
    v50 = v9[0];
    v51 = v50 * 4;
    v15[0] = v51;
    goto v2;
 v2:;
    v53 = v15[0];
    v54 = v13[0];
    if (v53 < v54) {
        goto v3;
    } else {
        goto v4;
    }
 v3:;
    v56 = v8[0];
    v57 = v15[0];
    _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(v56, v57, scratch);
    v59 = v14[0];
    v60 = v15[0];
    v61 = v60 + v59;
    v15[0] = v61;
    goto v2;
 v4:;
    v63 = v13[0];
    v64 = v9[0];
    v65 = v63 + v64;
    v16[0] = v65;
    goto v5;
 v5:;
    v67 = v16[0];
    v68 = v10[0];
    if (v67 < v68) {
        goto v6;
    } else {
        goto v7;
    }
 v6:;
    v70 = v8[0];
    v71 = v16[0];
    _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(v70, v71, scratch);
    v73 = v11[0];
    v74 = v16[0];
    v75 = v74 + v73;
    v16[0] = v75;
    goto v5;
 v7:;
    return;
 }
 void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch) {
    float v20;
    float v23;
    float v26;
    float v29;
    float* v19;
    float* v22;
    float* v25;
    float* v28;
    float4 v18;
    float4 v6[1];
    int v15;
    int v17;
    int v3[1];
    int v4[1];
    int v5[1];
    struct Eigen__TensorEvaluator* v2[1];
    struct Eigen__TensorEvaluator* v9;

 v1:;
    v2[0] = this;
    v3[0] = i;
    v9 = v2[0];
    v4[0] = 16;
    v5[0] = 16;
    v15 = v3[0];
    v17 = v3[0];
    v18 = _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i((&(v9[0].f1)), v17, scratch);
    v19 = (&(((float*)&v6[0])[0]));
    v20 = ((float*)&v18)[0];
    v19[0] = v20;
    v22 = (&(((float*)&v6[0])[1]));
    v23 = ((float*)&v18)[1];
    v22[0] = v23;
    v25 = (&(((float*)&v6[0])[2]));
    v26 = ((float*)&v18)[2];
    v25[0] = v26;
    v28 = (&(((float*)&v6[0])[3]));
    v29 = ((float*)&v18)[3];
    v28[0] = v29;
    _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4((&(v9[0].f0)), v15, v6, scratch);
    return;
 }
 void _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4(struct Eigen__TensorEvaluator_0* this, int index, float4* x, local int *scratch) {
    float4* v17;
    float4* v21;
    float4* v3[1];
    float4* v6[1];
    global float* v13;
    global float* v16;
    global float* v20;
    global float* v2[1];
    int v14;
    int v5[1];
    struct Eigen__TensorEvaluator_0* v10;
    struct Eigen__TensorEvaluator_0* v4[1];

 v1:;
    v4[0] = this;
    v5[0] = index;
    v6[0] = x;
    v10 = v4[0];
    v13 = (&(v10[0].f0))[0];
    v14 = v5[0];
    v16 = (&(v13[v14]));
    v17 = v6[0];
    v2[0] = v16;
    v3[0] = v17;
    v20 = v2[0];
    v21 = v3[0];
    _ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0__gp(v20, v21, scratch);
    return;
 }
 global float* _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi(struct Eigen__TensorEvaluator_0* this, int index, local int *scratch) {
    global float* v11;
    int v3[1];
    int v7;
    struct Eigen__TensorEvaluator_0* v2[1];
    struct Eigen__TensorEvaluator_0* v6;

 v1:;
    v2[0] = this;
    v3[0] = index;
    v6 = v2[0];
    v7 = v3[0];
    v11 = (&(v6[0].f0))[0];
    return (&(v11[v7]));
 }
 float _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch) {
    float v12;
    float v14;
    float v4[1];
    int v11;
    int v3[1];
    struct Eigen__TensorEvaluator_2* v2[1];
    struct Eigen__TensorEvaluator_2* v7;

 v1:;
    v2[0] = this;
    v3[0] = index;
    v7 = v2[0];
    v11 = v3[0];
    v12 = _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi((&(v7[0].f1)), v11, scratch);
    v4[0] = v12;
    v14 = _ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf((&(v7[0].f0)), v4, scratch);
    return v14;
 }
 float4 _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch) {
    float v15;
    float v18;
    float v21;
    float v24;
    float v28;
    float v31;
    float v34;
    float v37;
    float* v14;
    float* v17;
    float* v20;
    float* v23;
    float* v27;
    float* v30;
    float* v33;
    float* v36;
    float4 v13;
    float4 v26;
    float4 v2[1];
    float4 v39;
    float4 v5[1];
    int v12;
    int v4[1];
    struct Eigen__TensorEvaluator_2* v3[1];
    struct Eigen__TensorEvaluator_2* v8;

 v1:;
    v3[0] = this;
    v4[0] = index;
    v8 = v3[0];
    v12 = v4[0];
    v13 = _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i((&(v8[0].f1)), v12, scratch);
    v14 = (&(((float*)&v5[0])[0]));
    v15 = ((float*)&v13)[0];
    v14[0] = v15;
    v17 = (&(((float*)&v5[0])[1]));
    v18 = ((float*)&v13)[1];
    v17[0] = v18;
    v20 = (&(((float*)&v5[0])[2]));
    v21 = ((float*)&v13)[2];
    v20[0] = v21;
    v23 = (&(((float*)&v5[0])[3]));
    v24 = ((float*)&v13)[3];
    v23[0] = v24;
    v26 = _ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_((&(v8[0].f0)), v5, scratch);
    v27 = (&(((float*)&v2[0])[0]));
    v28 = ((float*)&v26)[0];
    v27[0] = v28;
    v30 = (&(((float*)&v2[0])[1]));
    v31 = ((float*)&v26)[1];
    v30[0] = v31;
    v33 = (&(((float*)&v2[0])[2]));
    v34 = ((float*)&v26)[2];
    v33[0] = v34;
    v36 = (&(((float*)&v2[0])[3]));
    v37 = ((float*)&v26)[3];
    v36[0] = v37;
    v39 = v2[0];
    return v39;
 }
 void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch) {
    float v10;
    global float* v13;
    int v12;
    int v3[1];
    int v9;
    struct Eigen__TensorEvaluator* v2[1];
    struct Eigen__TensorEvaluator* v6;

 v1:;
    v2[0] = this;
    v3[0] = i;
    v6 = v2[0];
    v9 = v3[0];
    v10 = _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi((&(v6[0].f1)), v9, scratch);
    v12 = v3[0];
    v13 = _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi((&(v6[0].f0)), v12, scratch);
    v13[0] = v10;
    return;
 }
 void _ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0__gp(global float* to, float4* from, local int *scratch) {
    float4* v3[1];
    float4* v8;
    global float* v2[1];
    global float* v6;
    global float4* v7;

 v1:;
    v2[0] = to;
    v3[0] = from;
    v6 = v2[0];
    v7 = (global float4*)v6;
    v8 = v3[0];
    for(int __i=0; __i < 4; __i++) {;
        ((global int *)((global char*)v7))[__i] = (( int *)((char*)v8))[__i];
    }
 ;
    return;
 }
 float _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch) {
    float v16;
    global float* v10;
    global float* v13;
    global float* v15;
    global float* v2[1];
    int v11;
    int v4[1];
    struct Eigen__TensorEvaluator_3* v3[1];
    struct Eigen__TensorEvaluator_3* v7;

 v1:;
    v3[0] = this;
    v4[0] = index;
    v7 = v3[0];
    v10 = (&(v7[0].f0))[0];
    v11 = v4[0];
    v13 = (&(v10[v11]));
    v2[0] = v13;
    v15 = v2[0];
    v16 = v15[0];
    return v16;
 }
 float4 _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch) {
    float v18;
    float v22;
    float v26;
    float v30;
    float v33;
    float v36;
    float v39;
    float v42;
    float v46;
    float v49;
    float v52;
    float v55;
    float* v32;
    float* v35;
    float* v38;
    float* v41;
    float* v45;
    float* v48;
    float* v51;
    float* v54;
    float4 v2[1];
    float4 v31;
    float4 v44;
    float4 v4[1];
    float4 v57;
    global float* v12;
    global float* v15;
    global float* v17;
    global float* v19;
    global float* v23;
    global float* v27;
    global float* v3[1];
    int v13;
    int v6[1];
    struct Eigen__TensorEvaluator_3* v5[1];
    struct Eigen__TensorEvaluator_3* v9;

 v1:;
    v5[0] = this;
    v6[0] = index;
    v9 = v5[0];
    v12 = (&(v9[0].f0))[0];
    v13 = v6[0];
    v15 = (&(v12[v13]));
    v3[0] = v15;
    v17 = v3[0];
    v18 = v17[0];
    v19 = v3[0];
    v22 = (&(v19[1]))[0];
    v23 = v3[0];
    v26 = (&(v23[2]))[0];
    v27 = v3[0];
    v30 = (&(v27[3]))[0];
    v31 = (float4)(v18, v22, v26, v30);
    v32 = (&(((float*)&v2[0])[0]));
    v33 = ((float*)&v31)[0];
    v32[0] = v33;
    v35 = (&(((float*)&v2[0])[1]));
    v36 = ((float*)&v31)[1];
    v35[0] = v36;
    v38 = (&(((float*)&v2[0])[2]));
    v39 = ((float*)&v31)[2];
    v38[0] = v39;
    v41 = (&(((float*)&v2[0])[3]));
    v42 = ((float*)&v31)[3];
    v41[0] = v42;
    v44 = v2[0];
    v45 = (&(((float*)&v4[0])[0]));
    v46 = ((float*)&v44)[0];
    v45[0] = v46;
    v48 = (&(((float*)&v4[0])[1]));
    v49 = ((float*)&v44)[1];
    v48[0] = v49;
    v51 = (&(((float*)&v4[0])[2]));
    v52 = ((float*)&v44)[2];
    v51[0] = v52;
    v54 = (&(((float*)&v4[0])[3]));
    v55 = ((float*)&v44)[3];
    v54[0] = v55;
    v57 = v4[0];
    return v57;
 }
 float4 _ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_(struct Eigen__internal__scalar_sqrt_op* this, float4* a, local int *scratch) {
    float v12;
    float v15;
    float v18;
    float v21;
    float* v10;
    float* v14;
    float* v17;
    float* v20;
    float4 v23;
    float4 v2[1];
    float4 v9;
    float4* v4[1];
    float4* v8;
    struct Eigen__internal__scalar_sqrt_op* v3[1];
    struct Eigen__internal__scalar_sqrt_op* v7;

 v1:;
    v3[0] = this;
    v4[0] = a;
    v7 = v3[0];
    v8 = v4[0];
    v9 = _ZN5Eigen8internal5psqrtI6float4EET_RKS3_(v8, scratch);
    v10 = (&(((float*)&v2[0])[0]));
    v12 = ((float*)&v9)[0];
    v10[0] = v12;
    v14 = (&(((float*)&v2[0])[1]));
    v15 = ((float*)&v9)[1];
    v14[0] = v15;
    v17 = (&(((float*)&v2[0])[2]));
    v18 = ((float*)&v9)[2];
    v17[0] = v18;
    v20 = (&(((float*)&v2[0])[3]));
    v21 = ((float*)&v9)[3];
    v20[0] = v21;
    v23 = v2[0];
    return v23;
 }
 float _ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf(struct Eigen__internal__scalar_sqrt_op* this, float* a, local int *scratch) {
    float v11;
    float v13;
    float* v10;
    float* v2[1];
    float* v4[1];
    float* v8;
    struct Eigen__internal__scalar_sqrt_op* v3[1];
    struct Eigen__internal__scalar_sqrt_op* v7;

 v1:;
    v3[0] = this;
    v4[0] = a;
    v7 = v3[0];
    v8 = v4[0];
    v2[0] = v8;
    v10 = v2[0];
    v11 = v10[0];
    v13 = sqrt(v11);
    return (float)v13;
 }
 float4 _ZN5Eigen8internal5psqrtI6float4EET_RKS3_(float4* a, local int *scratch) {
    float v12;
    float v13;
    float v16;
    float v17;
    float v20;
    float v21;
    float v24;
    float v27;
    float v30;
    float v33;
    float v8;
    float v9;
    float* v23;
    float* v26;
    float* v29;
    float* v32;
    float4 v22;
    float4 v2[1];
    float4 v35;
    float4* v10;
    float4* v14;
    float4* v18;
    float4* v3[1];
    float4* v5;

 v1:;
    v3[0] = a;
    v5 = v3[0];
    v8 = (&(((float*)&v5[0])[0]))[0];
    v9 = sqrt(v8);
    v10 = v3[0];
    v12 = (&(((float*)&v10[0])[1]))[0];
    v13 = sqrt(v12);
    v14 = v3[0];
    v16 = (&(((float*)&v14[0])[2]))[0];
    v17 = sqrt(v16);
    v18 = v3[0];
    v20 = (&(((float*)&v18[0])[3]))[0];
    v21 = sqrt(v20);
    v22 = (float4)(v9, v13, v17, v21);
    v23 = (&(((float*)&v2[0])[0]));
    v24 = ((float*)&v22)[0];
    v23[0] = v24;
    v26 = (&(((float*)&v2[0])[1]));
    v27 = ((float*)&v22)[1];
    v26[0] = v27;
    v29 = (&(((float*)&v2[0])[2]));
    v30 = ((float*)&v22)[2];
    v29[0] = v30;
    v32 = (&(((float*)&v2[0])[3]));
    v33 = ((float*)&v22)[3];
    v32[0] = v33;
    v35 = v2[0];
    return v35;
 }
diff --git a/test-opt.ll b/test-opt.ll
 ; ModuleID = 'test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll'
 target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64-nvidia-cuda"

 %struct.__cuda_builtin_blockIdx_t = type { i8 }
 %struct.__cuda_builtin_blockDim_t = type { i8 }
 %struct.__cuda_builtin_threadIdx_t = type { i8 }
 %struct.__cuda_builtin_gridDim_t = type { i8 }
 %"struct.Eigen::TensorEvaluator" = type { %"struct.Eigen::TensorEvaluator.0", %"struct.Eigen::TensorEvaluator.2" }
 %"struct.Eigen::TensorEvaluator.0" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"*, %"class.Eigen::TensorMap"* }
 %"struct.Eigen::DSizes" = type { %"class.Eigen::array" }
 %"class.Eigen::array" = type { [1 x i32] }
 %"struct.Eigen::GpuDevice" = type <{ %"class.Eigen::StreamInterface"*, i32, [4 x i8] }>
 %"class.Eigen::StreamInterface" = type { i32 (...)** }
 %"class.Eigen::TensorMap" = type <{ float*, %"struct.Eigen::DSizes", [4 x i8] }>
 %"struct.Eigen::TensorEvaluator.2" = type { %"struct.Eigen::internal::scalar_sqrt_op", %"struct.Eigen::TensorEvaluator.3" }
 %"struct.Eigen::internal::scalar_sqrt_op" = type { i8 }
 %"struct.Eigen::TensorEvaluator.3" = type { float*, %"struct.Eigen::DSizes", %"struct.Eigen::GpuDevice"*, %"class.Eigen::TensorMap.4"* }
 %"class.Eigen::TensorMap.4" = type <{ float*, %"struct.Eigen::DSizes", [4 x i8] }>
 %struct.float4 = type { float, float, float, float }
 %"struct.Eigen::TensorEvaluator.7" = type { %"struct.Eigen::TensorEvaluator.0", %"struct.Eigen::TensorEvaluator.8" }
 %"struct.Eigen::TensorEvaluator.8" = type { %"struct.Eigen::internal::scalar_sqrt_gradient_op", %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.9" }
 %"struct.Eigen::internal::scalar_sqrt_gradient_op" = type { i8 }
 %"struct.Eigen::TensorEvaluator.9" = type { float*, %"struct.Eigen::DSizes.10", %"struct.Eigen::GpuDevice"*, %"class.Eigen::TensorMap.12"* }
 %"struct.Eigen::DSizes.10" = type { %"class.Eigen::array.11" }
 %"class.Eigen::array.11" = type { [1 x i64] }
 %"class.Eigen::TensorMap.12" = type { float*, %"struct.Eigen::DSizes.10" }

 $_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_ = comdat any

 $_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi = comdat any

 $_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi = comdat any

 $_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4 = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i = comdat any

 $_ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0_ = comdat any

 $_ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_ = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i = comdat any

 $_ZN5Eigen8internal5psqrtI6float4EET_RKS3_ = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi = comdat any

 $_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi = comdat any

 $_ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi = comdat any

 $_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS0_23scalar_sqrt_gradient_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEKNS4_INS5_ISC_Li1ELi1ElEELi16ES7_EEEEEENS_9GpuDeviceEEElEEvT_T0_ = comdat any

 $_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEKNS2_INS3_ISB_Li1ELi1ElEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEl = comdat any

 $_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEKNS2_INS3_ISB_Li1ELi1ElEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEl = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS5_INS6_IS7_Li1ELi1ElEELi16ES9_EEEENS_9GpuDeviceEE6packetILi16EEE6float4l = comdat any

 $_ZNK5Eigen8internal23scalar_sqrt_gradient_opIfE8packetOpI6float4EEKT_RS6_S7_ = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1ElEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4l = comdat any

 $_ZN5Eigen8internal5pset1I6float4EET_RKNS0_15unpacket_traitsIS3_E4typeE = comdat any

 $_ZN5Eigen8internal5pconjI6float4EET_RKS3_ = comdat any

 $_ZN5Eigen8internal4pdivI6float4EET_RKS3_S5_ = comdat any

 $_ZN5Eigen8internal4pmulI6float4EET_RKS3_S5_ = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS5_INS6_IS7_Li1ELi1ElEELi16ES9_EEEENS_9GpuDeviceEE5coeffEl = comdat any

 $_ZNK5Eigen8internal23scalar_sqrt_gradient_opIfEclERKfS4_ = comdat any

 $_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1ElEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEl = comdat any

 $_ZN5Eigen6numext4conjIfEENS_8internal11conj_retvalINS2_36global_math_functions_filtering_baseIT_vE4typeEE4typeERKS5_ = comdat any

 $_ZN5Eigen8internal9conj_implIfLb0EE3runERKf = comdat any

 @.str = private unnamed_addr constant [5 x i8] c"NONE\00", align 1
 @blockIdx = extern_weak addrspace(1) global %struct.__cuda_builtin_blockIdx_t, align 1
 @blockDim = extern_weak addrspace(1) global %struct.__cuda_builtin_blockDim_t, align 1
 @threadIdx = extern_weak addrspace(1) global %struct.__cuda_builtin_threadIdx_t, align 1
 @gridDim = extern_weak addrspace(1) global %struct.__cuda_builtin_gridDim_t, align 1
 @llvm.used = appending global [1 x i8*] [i8* bitcast (i32 ()* @_ZL21__nvvm_reflect_anchorv to i8*)], section "llvm.metadata"

 ; Function Attrs: nounwind
 define internal i32 @_ZL21__nvvm_reflect_anchorv() #0 {
  %1 = call i32 @__nvvm_reflect(i8* getelementptr inbounds ([5 x i8], [5 x i8]* @.str, i32 0, i32 0)) #3
  ret i32 %1
 }

 ; Function Attrs: nounwind readnone
 declare i32 @__nvvm_reflect(i8*) #1

 define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_(%"struct.Eigen::TensorEvaluator"* byval align 8 %eval, i32 %size) #2 comdat {
  %1 = alloca %"struct.Eigen::TensorEvaluator"*, align 8
  %2 = alloca i32, align 4
  %3 = alloca i32, align 4
  %4 = alloca i32, align 4
  %PacketSize.i = alloca i32, align 4
  %vectorized_size.i = alloca i32, align 4
  %vectorized_step_size.i = alloca i32, align 4
  %i.i = alloca i32, align 4
  %i1.i = alloca i32, align 4
  %5 = alloca i32, align 4
  %first_index = alloca i32, align 4
  %step_size = alloca i32, align 4
  %vectorizable = alloca i8, align 1
  store i32 %size, i32* %5, align 4
  %6 = call i32 @llvm.ptx.read.ctaid.x() #7
  %7 = call i32 @llvm.ptx.read.ntid.x() #7
  %8 = mul i32 %6, %7
  %9 = call i32 @llvm.ptx.read.tid.x() #7
  %10 = add i32 %8, %9
  store i32 %10, i32* %first_index, align 4
  %11 = call i32 @llvm.ptx.read.ntid.x() #7
  %12 = call i32 @llvm.ptx.read.nctaid.x() #7
  %13 = mul i32 %11, %12
  store i32 %13, i32* %step_size, align 4
  store i8 1, i8* %vectorizable, align 1
  %14 = load i32, i32* %first_index, align 4
  %15 = load i32, i32* %5, align 4
  %16 = load i32, i32* %step_size, align 4
  store %"struct.Eigen::TensorEvaluator"* %eval, %"struct.Eigen::TensorEvaluator"** %1, align 8
  store i32 %14, i32* %2, align 4
  store i32 %15, i32* %3, align 4
  store i32 %16, i32* %4, align 4
  store i32 4, i32* %PacketSize.i, align 4
  %17 = load i32, i32* %3, align 4
  %18 = sdiv i32 %17, 4
  %19 = mul nsw i32 %18, 4
  store i32 %19, i32* %vectorized_size.i, align 4
  %20 = load i32, i32* %4, align 4
  %21 = mul nsw i32 %20, 4
  store i32 %21, i32* %vectorized_step_size.i, align 4
  %22 = load i32, i32* %2, align 4
  %23 = mul nsw i32 %22, 4
  store i32 %23, i32* %i.i, align 4
  br label %24

 ; <label>:24                                      ; preds = %28, %0
  %25 = load i32, i32* %i.i, align 4
  %26 = load i32, i32* %vectorized_size.i, align 4
  %27 = icmp slt i32 %25, %26
  br i1 %27, label %28, label %34

 ; <label>:28                                      ; preds = %24
  %29 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %1, align 8
  %30 = load i32, i32* %i.i, align 4
  call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(%"struct.Eigen::TensorEvaluator"* %29, i32 %30)
  %31 = load i32, i32* %vectorized_step_size.i, align 4
  %32 = load i32, i32* %i.i, align 4
  %33 = add nsw i32 %32, %31
  store i32 %33, i32* %i.i, align 4
  br label %24

 ; <label>:34                                      ; preds = %24
  %35 = load i32, i32* %vectorized_size.i, align 4
  %36 = load i32, i32* %2, align 4
  %37 = add nsw i32 %35, %36
  store i32 %37, i32* %i1.i, align 4
  br label %38

 ; <label>:38                                      ; preds = %42, %34
  %39 = load i32, i32* %i1.i, align 4
  %40 = load i32, i32* %3, align 4
  %41 = icmp slt i32 %39, %40
  br i1 %41, label %42, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiLb1EE3runERSL_iii.exit

 ; <label>:42                                      ; preds = %38
  %43 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %1, align 8
  %44 = load i32, i32* %i1.i, align 4
  call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator"* %43, i32 %44)
  %45 = load i32, i32* %4, align 4
  %46 = load i32, i32* %i1.i, align 4
  %47 = add nsw i32 %46, %45
  store i32 %47, i32* %i1.i, align 4
  br label %38

 _ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiLb1EE3runERSL_iii.exit: ; preds = %38
  ret void
 }

 ; Function Attrs: nounwind readnone
 declare i32 @llvm.ptx.read.ctaid.x() #3

 ; Function Attrs: nounwind readnone
 declare i32 @llvm.ptx.read.ntid.x() #3

 ; Function Attrs: nounwind readnone
 declare i32 @llvm.ptx.read.tid.x() #3

 ; Function Attrs: nounwind readnone
 declare i32 @llvm.ptx.read.nctaid.x() #3

 ; Function Attrs: inlinehint
 define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(%"struct.Eigen::TensorEvaluator"* %this, i32 %i) #4 comdat align 2 {
  %1 = alloca %"struct.Eigen::TensorEvaluator"*, align 8
  %2 = alloca i32, align 4
  %LhsStoreMode = alloca i32, align 4
  %RhsLoadMode = alloca i32, align 4
  %3 = alloca %struct.float4, align 4
  store %"struct.Eigen::TensorEvaluator"* %this, %"struct.Eigen::TensorEvaluator"** %1, align 8
  store i32 %i, i32* %2, align 4
  %4 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %1, align 8
  store i32 16, i32* %LhsStoreMode, align 4
  store i32 16, i32* %RhsLoadMode, align 4
  %5 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %4, i32 0, i32 0
  %6 = load i32, i32* %2, align 4
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %4, i32 0, i32 1
  %8 = load i32, i32* %2, align 4
  %9 = call %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i(%"struct.Eigen::TensorEvaluator.2"* %7, i32 %8)
  %10 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 0
  %11 = extractvalue %struct.float4 %9, 0
  store float %11, float* %10, align 4
  %12 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 1
  %13 = extractvalue %struct.float4 %9, 1
  store float %13, float* %12, align 4
  %14 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 2
  %15 = extractvalue %struct.float4 %9, 2
  store float %15, float* %14, align 4
  %16 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 3
  %17 = extractvalue %struct.float4 %9, 3
  store float %17, float* %16, align 4
  call void @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4(%"struct.Eigen::TensorEvaluator.0"* %5, i32 %6, %struct.float4* dereferenceable(16) %3)
  ret void
 }

 ; Function Attrs: inlinehint
 define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(%"struct.Eigen::TensorEvaluator"* %this, i32 %i) #4 comdat align 2 {
  %1 = alloca %"struct.Eigen::TensorEvaluator"*, align 8
  %2 = alloca i32, align 4
  store %"struct.Eigen::TensorEvaluator"* %this, %"struct.Eigen::TensorEvaluator"** %1, align 8
  store i32 %i, i32* %2, align 4
  %3 = load %"struct.Eigen::TensorEvaluator"*, %"struct.Eigen::TensorEvaluator"** %1, align 8
  %4 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %3, i32 0, i32 1
  %5 = load i32, i32* %2, align 4
  %6 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.2"* %4, i32 %5)
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator", %"struct.Eigen::TensorEvaluator"* %3, i32 0, i32 0
  %8 = load i32, i32* %2, align 4
  %9 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.0"* %7, i32 %8)
  store float %6, float* %9, align 4
  ret void
 }

 ; Function Attrs: inlinehint
 define linkonce_odr void @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4(%"struct.Eigen::TensorEvaluator.0"* %this, i32 %index, %struct.float4* dereferenceable(16) %x) #4 comdat align 2 {
  %1 = alloca float*, align 8
  %2 = alloca %struct.float4*, align 8
  %3 = alloca %"struct.Eigen::TensorEvaluator.0"*, align 8
  %4 = alloca i32, align 4
  %5 = alloca %struct.float4*, align 8
  store %"struct.Eigen::TensorEvaluator.0"* %this, %"struct.Eigen::TensorEvaluator.0"** %3, align 8
  store i32 %index, i32* %4, align 4
  store %struct.float4* %x, %struct.float4** %5, align 8
  %6 = load %"struct.Eigen::TensorEvaluator.0"*, %"struct.Eigen::TensorEvaluator.0"** %3, align 8
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.0", %"struct.Eigen::TensorEvaluator.0"* %6, i32 0, i32 0
  %8 = load float*, float** %7, align 8
  %9 = load i32, i32* %4, align 4
  %10 = sext i32 %9 to i64
  %11 = getelementptr inbounds float, float* %8, i64 %10
  %12 = load %struct.float4*, %struct.float4** %5, align 8
  store float* %11, float** %1, align 8
  store %struct.float4* %12, %struct.float4** %2, align 8
  %13 = load float*, float** %1, align 8
  %14 = load %struct.float4*, %struct.float4** %2, align 8
  call void @_ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0_(float* %13, %struct.float4* dereferenceable(16) %14)
  ret void
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i(%"struct.Eigen::TensorEvaluator.2"* %this, i32 %index) #4 comdat align 2 {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %"struct.Eigen::TensorEvaluator.2"*, align 8
  %3 = alloca i32, align 4
  %4 = alloca %struct.float4, align 4
  store %"struct.Eigen::TensorEvaluator.2"* %this, %"struct.Eigen::TensorEvaluator.2"** %2, align 8
  store i32 %index, i32* %3, align 4
  %5 = load %"struct.Eigen::TensorEvaluator.2"*, %"struct.Eigen::TensorEvaluator.2"** %2, align 8
  %6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.2", %"struct.Eigen::TensorEvaluator.2"* %5, i32 0, i32 0
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.2", %"struct.Eigen::TensorEvaluator.2"* %5, i32 0, i32 1
  %8 = load i32, i32* %3, align 4
  %9 = call %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i(%"struct.Eigen::TensorEvaluator.3"* %7, i32 %8)
  %10 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 0
  %11 = extractvalue %struct.float4 %9, 0
  store float %11, float* %10, align 4
  %12 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 1
  %13 = extractvalue %struct.float4 %9, 1
  store float %13, float* %12, align 4
  %14 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 2
  %15 = extractvalue %struct.float4 %9, 2
  store float %15, float* %14, align 4
  %16 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 3
  %17 = extractvalue %struct.float4 %9, 3
  store float %17, float* %16, align 4
  %18 = call %struct.float4 @_ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_(%"struct.Eigen::internal::scalar_sqrt_op"* %6, %struct.float4* dereferenceable(16) %4)
  %19 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %20 = extractvalue %struct.float4 %18, 0
  store float %20, float* %19, align 4
  %21 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %22 = extractvalue %struct.float4 %18, 1
  store float %22, float* %21, align 4
  %23 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %24 = extractvalue %struct.float4 %18, 2
  store float %24, float* %23, align 4
  %25 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %26 = extractvalue %struct.float4 %18, 3
  store float %26, float* %25, align 4
  %27 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %27
 }

 ; Function Attrs: inlinehint nounwind
 define linkonce_odr void @_ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0_(float* %to, %struct.float4* dereferenceable(16) %from) #5 comdat {
  %1 = alloca float*, align 8
  %2 = alloca %struct.float4*, align 8
  store float* %to, float** %1, align 8
  store %struct.float4* %from, %struct.float4** %2, align 8
  %3 = load float*, float** %1, align 8
  %4 = bitcast float* %3 to %struct.float4*
  %5 = load %struct.float4*, %struct.float4** %2, align 8
  %6 = bitcast %struct.float4* %4 to i8*
  %7 = bitcast %struct.float4* %5 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* %7, i64 16, i32 4, i1 false)
  ret void
 }

 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #6

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_(%"struct.Eigen::internal::scalar_sqrt_op"* %this, %struct.float4* dereferenceable(16) %a) #4 comdat align 2 {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %"struct.Eigen::internal::scalar_sqrt_op"*, align 8
  %3 = alloca %struct.float4*, align 8
  store %"struct.Eigen::internal::scalar_sqrt_op"* %this, %"struct.Eigen::internal::scalar_sqrt_op"** %2, align 8
  store %struct.float4* %a, %struct.float4** %3, align 8
  %4 = load %"struct.Eigen::internal::scalar_sqrt_op"*, %"struct.Eigen::internal::scalar_sqrt_op"** %2, align 8
  %5 = load %struct.float4*, %struct.float4** %3, align 8
  %6 = call %struct.float4 @_ZN5Eigen8internal5psqrtI6float4EET_RKS3_(%struct.float4* dereferenceable(16) %5)
  %7 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %8 = extractvalue %struct.float4 %6, 0
  store float %8, float* %7, align 4
  %9 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %10 = extractvalue %struct.float4 %6, 1
  store float %10, float* %9, align 4
  %11 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %12 = extractvalue %struct.float4 %6, 2
  store float %12, float* %11, align 4
  %13 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %14 = extractvalue %struct.float4 %6, 3
  store float %14, float* %13, align 4
  %15 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %15
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i(%"struct.Eigen::TensorEvaluator.3"* %this, i32 %index) #4 comdat align 2 {
  %1 = alloca %struct.float4, align 4
  %2 = alloca float*, align 8
  %3 = alloca %struct.float4, align 4
  %4 = alloca %"struct.Eigen::TensorEvaluator.3"*, align 8
  %5 = alloca i32, align 4
  store %"struct.Eigen::TensorEvaluator.3"* %this, %"struct.Eigen::TensorEvaluator.3"** %4, align 8
  store i32 %index, i32* %5, align 4
  %6 = load %"struct.Eigen::TensorEvaluator.3"*, %"struct.Eigen::TensorEvaluator.3"** %4, align 8
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %6, i32 0, i32 0
  %8 = load float*, float** %7, align 8
  %9 = load i32, i32* %5, align 4
  %10 = sext i32 %9 to i64
  %11 = getelementptr inbounds float, float* %8, i64 %10
  store float* %11, float** %2, align 8
  %12 = load float*, float** %2, align 8
  %13 = load float, float* %12, align 4
  %14 = load float*, float** %2, align 8
  %15 = getelementptr inbounds float, float* %14, i64 1
  %16 = load float, float* %15, align 4
  %17 = load float*, float** %2, align 8
  %18 = getelementptr inbounds float, float* %17, i64 2
  %19 = load float, float* %18, align 4
  %20 = load float*, float** %2, align 8
  %21 = getelementptr inbounds float, float* %20, i64 3
  %22 = load float, float* %21, align 4
  %23 = call %struct.float4 @_Z11make_float4ffff(float %13, float %16, float %19, float %22)
  %24 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %25 = extractvalue %struct.float4 %23, 0
  store float %25, float* %24, align 4
  %26 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %27 = extractvalue %struct.float4 %23, 1
  store float %27, float* %26, align 4
  %28 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %29 = extractvalue %struct.float4 %23, 2
  store float %29, float* %28, align 4
  %30 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %31 = extractvalue %struct.float4 %23, 3
  store float %31, float* %30, align 4
  %32 = load %struct.float4, %struct.float4* %1, align 4
  %33 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 0
  %34 = extractvalue %struct.float4 %32, 0
  store float %34, float* %33, align 4
  %35 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 1
  %36 = extractvalue %struct.float4 %32, 1
  store float %36, float* %35, align 4
  %37 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 2
  %38 = extractvalue %struct.float4 %32, 2
  store float %38, float* %37, align 4
  %39 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 3
  %40 = extractvalue %struct.float4 %32, 3
  store float %40, float* %39, align 4
  %41 = load %struct.float4, %struct.float4* %3, align 4
  ret %struct.float4 %41
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZN5Eigen8internal5psqrtI6float4EET_RKS3_(%struct.float4* dereferenceable(16) %a) #4 comdat {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %struct.float4*, align 8
  store %struct.float4* %a, %struct.float4** %2, align 8
  %3 = load %struct.float4*, %struct.float4** %2, align 8
  %4 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 0
  %5 = load float, float* %4, align 4
  %6 = call float @sqrtf(float %5) #7
  %7 = load %struct.float4*, %struct.float4** %2, align 8
  %8 = getelementptr inbounds %struct.float4, %struct.float4* %7, i32 0, i32 1
  %9 = load float, float* %8, align 4
  %10 = call float @sqrtf(float %9) #7
  %11 = load %struct.float4*, %struct.float4** %2, align 8
  %12 = getelementptr inbounds %struct.float4, %struct.float4* %11, i32 0, i32 2
  %13 = load float, float* %12, align 4
  %14 = call float @sqrtf(float %13) #7
  %15 = load %struct.float4*, %struct.float4** %2, align 8
  %16 = getelementptr inbounds %struct.float4, %struct.float4* %15, i32 0, i32 3
  %17 = load float, float* %16, align 4
  %18 = call float @sqrtf(float %17) #7
  %19 = call %struct.float4 @_Z11make_float4ffff(float %6, float %10, float %14, float %18)
  %20 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %21 = extractvalue %struct.float4 %19, 0
  store float %21, float* %20, align 4
  %22 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %23 = extractvalue %struct.float4 %19, 1
  store float %23, float* %22, align 4
  %24 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %25 = extractvalue %struct.float4 %19, 2
  store float %25, float* %24, align 4
  %26 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %27 = extractvalue %struct.float4 %19, 3
  store float %27, float* %26, align 4
  %28 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %28
 }

 declare %struct.float4 @_Z11make_float4ffff(float, float, float, float) #2

 ; Function Attrs: nounwind
 declare float @sqrtf(float) #0

 define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.2"* %this, i32 %index) #2 comdat align 2 {
  %1 = alloca %"struct.Eigen::TensorEvaluator.2"*, align 8
  %2 = alloca i32, align 4
  %3 = alloca float, align 4
  store %"struct.Eigen::TensorEvaluator.2"* %this, %"struct.Eigen::TensorEvaluator.2"** %1, align 8
  store i32 %index, i32* %2, align 4
  %4 = load %"struct.Eigen::TensorEvaluator.2"*, %"struct.Eigen::TensorEvaluator.2"** %1, align 8
  %5 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.2", %"struct.Eigen::TensorEvaluator.2"* %4, i32 0, i32 0
  %6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.2", %"struct.Eigen::TensorEvaluator.2"* %4, i32 0, i32 1
  %7 = load i32, i32* %2, align 4
  %8 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %6, i32 %7)
  store float %8, float* %3, align 4
  %9 = call float @_ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf(%"struct.Eigen::internal::scalar_sqrt_op"* %5, float* dereferenceable(4) %3)
  ret float %9
 }

 ; Function Attrs: inlinehint nounwind
 define linkonce_odr dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.0"* %this, i32 %index) #5 comdat align 2 {
  %1 = alloca %"struct.Eigen::TensorEvaluator.0"*, align 8
  %2 = alloca i32, align 4
  store %"struct.Eigen::TensorEvaluator.0"* %this, %"struct.Eigen::TensorEvaluator.0"** %1, align 8
  store i32 %index, i32* %2, align 4
  %3 = load %"struct.Eigen::TensorEvaluator.0"*, %"struct.Eigen::TensorEvaluator.0"** %1, align 8
  %4 = load i32, i32* %2, align 4
  %5 = sext i32 %4 to i64
  %6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.0", %"struct.Eigen::TensorEvaluator.0"* %3, i32 0, i32 0
  %7 = load float*, float** %6, align 8
  %8 = getelementptr inbounds float, float* %7, i64 %5
  ret float* %8
 }

 ; Function Attrs: inlinehint
 define linkonce_odr float @_ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf(%"struct.Eigen::internal::scalar_sqrt_op"* %this, float* dereferenceable(4) %a) #4 comdat align 2 {
  %1 = alloca float*, align 8
  %2 = alloca %"struct.Eigen::internal::scalar_sqrt_op"*, align 8
  %3 = alloca float*, align 8
  store %"struct.Eigen::internal::scalar_sqrt_op"* %this, %"struct.Eigen::internal::scalar_sqrt_op"** %2, align 8
  store float* %a, float** %3, align 8
  %4 = load %"struct.Eigen::internal::scalar_sqrt_op"*, %"struct.Eigen::internal::scalar_sqrt_op"** %2, align 8
  %5 = load float*, float** %3, align 8
  store float* %5, float** %1, align 8
  %6 = load float*, float** %1, align 8
  %7 = load float, float* %6, align 4
  %8 = fpext float %7 to double
  %9 = call double @sqrt(double %8) #7
  %10 = fptrunc double %9 to float
  ret float %10
 }

 ; Function Attrs: inlinehint
 define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %this, i32 %index) #4 comdat align 2 {
  %1 = alloca float*, align 8
  %2 = alloca %"struct.Eigen::TensorEvaluator.3"*, align 8
  %3 = alloca i32, align 4
  store %"struct.Eigen::TensorEvaluator.3"* %this, %"struct.Eigen::TensorEvaluator.3"** %2, align 8
  store i32 %index, i32* %3, align 4
  %4 = load %"struct.Eigen::TensorEvaluator.3"*, %"struct.Eigen::TensorEvaluator.3"** %2, align 8
  %5 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.3", %"struct.Eigen::TensorEvaluator.3"* %4, i32 0, i32 0
  %6 = load float*, float** %5, align 8
  %7 = load i32, i32* %3, align 4
  %8 = sext i32 %7 to i64
  %9 = getelementptr inbounds float, float* %6, i64 %8
  store float* %9, float** %1, align 8
  %10 = load float*, float** %1, align 8
  %11 = load float, float* %10, align 4
  ret float %11
 }

 ; Function Attrs: nounwind
 declare double @sqrt(double) #0

 define weak_odr void @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS0_23scalar_sqrt_gradient_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEKNS4_INS5_ISC_Li1ELi1ElEELi16ES7_EEEEEENS_9GpuDeviceEEElEEvT_T0_(%"struct.Eigen::TensorEvaluator.7"* byval align 8 %eval, i64 %size) #2 comdat {
  %1 = alloca %"struct.Eigen::TensorEvaluator.7"*, align 8
  %2 = alloca i64, align 8
  %3 = alloca i64, align 8
  %4 = alloca i64, align 8
  %PacketSize.i = alloca i64, align 8
  %vectorized_size.i = alloca i64, align 8
  %vectorized_step_size.i = alloca i64, align 8
  %i.i = alloca i64, align 8
  %i1.i = alloca i64, align 8
  %5 = alloca i64, align 8
  %first_index = alloca i64, align 8
  %step_size = alloca i64, align 8
  %vectorizable = alloca i8, align 1
  store i64 %size, i64* %5, align 8
  %6 = call i32 @llvm.ptx.read.ctaid.x() #7
  %7 = call i32 @llvm.ptx.read.ntid.x() #7
  %8 = mul i32 %6, %7
  %9 = call i32 @llvm.ptx.read.tid.x() #7
  %10 = add i32 %8, %9
  %11 = zext i32 %10 to i64
  store i64 %11, i64* %first_index, align 8
  %12 = call i32 @llvm.ptx.read.ntid.x() #7
  %13 = call i32 @llvm.ptx.read.nctaid.x() #7
  %14 = mul i32 %12, %13
  %15 = zext i32 %14 to i64
  store i64 %15, i64* %step_size, align 8
  store i8 1, i8* %vectorizable, align 1
  %16 = load i64, i64* %first_index, align 8
  %17 = load i64, i64* %5, align 8
  %18 = load i64, i64* %step_size, align 8
  store %"struct.Eigen::TensorEvaluator.7"* %eval, %"struct.Eigen::TensorEvaluator.7"** %1, align 8
  store i64 %16, i64* %2, align 8
  store i64 %17, i64* %3, align 8
  store i64 %18, i64* %4, align 8
  store i64 4, i64* %PacketSize.i, align 8
  %19 = load i64, i64* %3, align 8
  %20 = sdiv i64 %19, 4
  %21 = mul nsw i64 %20, 4
  store i64 %21, i64* %vectorized_size.i, align 8
  %22 = load i64, i64* %4, align 8
  %23 = mul nsw i64 %22, 4
  store i64 %23, i64* %vectorized_step_size.i, align 8
  %24 = load i64, i64* %2, align 8
  %25 = mul nsw i64 %24, 4
  store i64 %25, i64* %i.i, align 8
  br label %26

 ; <label>:26                                      ; preds = %30, %0
  %27 = load i64, i64* %i.i, align 8
  %28 = load i64, i64* %vectorized_size.i, align 8
  %29 = icmp slt i64 %27, %28
  br i1 %29, label %30, label %36

 ; <label>:30                                      ; preds = %26
  %31 = load %"struct.Eigen::TensorEvaluator.7"*, %"struct.Eigen::TensorEvaluator.7"** %1, align 8
  %32 = load i64, i64* %i.i, align 8
  call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEKNS2_INS3_ISB_Li1ELi1ElEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEl(%"struct.Eigen::TensorEvaluator.7"* %31, i64 %32)
  %33 = load i64, i64* %vectorized_step_size.i, align 8
  %34 = load i64, i64* %i.i, align 8
  %35 = add nsw i64 %34, %33
  store i64 %35, i64* %i.i, align 8
  br label %26

 ; <label>:36                                      ; preds = %26
  %37 = load i64, i64* %vectorized_size.i, align 8
  %38 = load i64, i64* %2, align 8
  %39 = add nsw i64 %37, %38
  store i64 %39, i64* %i1.i, align 8
  br label %40

 ; <label>:40                                      ; preds = %44, %36
  %41 = load i64, i64* %i1.i, align 8
  %42 = load i64, i64* %3, align 8
  %43 = icmp slt i64 %41, %42
  br i1 %43, label %44, label %_ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS0_23scalar_sqrt_gradient_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEKNS4_INS5_ISC_Li1ELi1ElEELi16ES7_EEEEEENS_9GpuDeviceEEElLb1EE3runERSO_lll.exit

 ; <label>:44                                      ; preds = %40
  %45 = load %"struct.Eigen::TensorEvaluator.7"*, %"struct.Eigen::TensorEvaluator.7"** %1, align 8
  %46 = load i64, i64* %i1.i, align 8
  call void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEKNS2_INS3_ISB_Li1ELi1ElEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEl(%"struct.Eigen::TensorEvaluator.7"* %45, i64 %46)
  %47 = load i64, i64* %4, align 8
  %48 = load i64, i64* %i1.i, align 8
  %49 = add nsw i64 %48, %47
  store i64 %49, i64* %i1.i, align 8
  br label %40

 _ZN5Eigen8internal19EigenMetaKernelEvalINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS0_23scalar_sqrt_gradient_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEKNS4_INS5_ISC_Li1ELi1ElEELi16ES7_EEEEEENS_9GpuDeviceEEElLb1EE3runERSO_lll.exit: ; preds = %40
  ret void
 }

 ; Function Attrs: inlinehint
 define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEKNS2_INS3_ISB_Li1ELi1ElEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEl(%"struct.Eigen::TensorEvaluator.7"* %this, i64 %i) #4 comdat align 2 {
  %1 = alloca %"struct.Eigen::TensorEvaluator.7"*, align 8
  %2 = alloca i64, align 8
  %LhsStoreMode = alloca i32, align 4
  %RhsLoadMode = alloca i32, align 4
  %3 = alloca %struct.float4, align 4
  store %"struct.Eigen::TensorEvaluator.7"* %this, %"struct.Eigen::TensorEvaluator.7"** %1, align 8
  store i64 %i, i64* %2, align 8
  %4 = load %"struct.Eigen::TensorEvaluator.7"*, %"struct.Eigen::TensorEvaluator.7"** %1, align 8
  store i32 16, i32* %LhsStoreMode, align 4
  store i32 16, i32* %RhsLoadMode, align 4
  %5 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i32 0, i32 0
  %6 = load i64, i64* %2, align 8
  %7 = trunc i64 %6 to i32
  %8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %4, i32 0, i32 1
  %9 = load i64, i64* %2, align 8
  %10 = call %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS5_INS6_IS7_Li1ELi1ElEELi16ES9_EEEENS_9GpuDeviceEE6packetILi16EEE6float4l(%"struct.Eigen::TensorEvaluator.8"* %8, i64 %9)
  %11 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 0
  %12 = extractvalue %struct.float4 %10, 0
  store float %12, float* %11, align 4
  %13 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 1
  %14 = extractvalue %struct.float4 %10, 1
  store float %14, float* %13, align 4
  %15 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 2
  %16 = extractvalue %struct.float4 %10, 2
  store float %16, float* %15, align 4
  %17 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 3
  %18 = extractvalue %struct.float4 %10, 3
  store float %18, float* %17, align 4
  call void @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4(%"struct.Eigen::TensorEvaluator.0"* %5, i32 %7, %struct.float4* dereferenceable(16) %3)
  ret void
 }

 ; Function Attrs: inlinehint
 define linkonce_odr void @_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEKNS2_INS3_ISB_Li1ELi1ElEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEl(%"struct.Eigen::TensorEvaluator.7"* %this, i64 %i) #4 comdat align 2 {
  %1 = alloca %"struct.Eigen::TensorEvaluator.7"*, align 8
  %2 = alloca i64, align 8
  store %"struct.Eigen::TensorEvaluator.7"* %this, %"struct.Eigen::TensorEvaluator.7"** %1, align 8
  store i64 %i, i64* %2, align 8
  %3 = load %"struct.Eigen::TensorEvaluator.7"*, %"struct.Eigen::TensorEvaluator.7"** %1, align 8
  %4 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %3, i32 0, i32 1
  %5 = load i64, i64* %2, align 8
  %6 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS5_INS6_IS7_Li1ELi1ElEELi16ES9_EEEENS_9GpuDeviceEE5coeffEl(%"struct.Eigen::TensorEvaluator.8"* %4, i64 %5)
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.7", %"struct.Eigen::TensorEvaluator.7"* %3, i32 0, i32 0
  %8 = load i64, i64* %2, align 8
  %9 = trunc i64 %8 to i32
  %10 = call dereferenceable(4) float* @_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi(%"struct.Eigen::TensorEvaluator.0"* %7, i32 %9)
  store float %6, float* %10, align 4
  ret void
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS5_INS6_IS7_Li1ELi1ElEELi16ES9_EEEENS_9GpuDeviceEE6packetILi16EEE6float4l(%"struct.Eigen::TensorEvaluator.8"* %this, i64 %index) #4 comdat align 2 {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %"struct.Eigen::TensorEvaluator.8"*, align 8
  %3 = alloca i64, align 8
  %4 = alloca %struct.float4, align 4
  %5 = alloca %struct.float4, align 4
  store %"struct.Eigen::TensorEvaluator.8"* %this, %"struct.Eigen::TensorEvaluator.8"** %2, align 8
  store i64 %index, i64* %3, align 8
  %6 = load %"struct.Eigen::TensorEvaluator.8"*, %"struct.Eigen::TensorEvaluator.8"** %2, align 8
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %6, i32 0, i32 0
  %8 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %6, i32 0, i32 1
  %9 = load i64, i64* %3, align 8
  %10 = trunc i64 %9 to i32
  %11 = call %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i(%"struct.Eigen::TensorEvaluator.3"* %8, i32 %10)
  %12 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 0
  %13 = extractvalue %struct.float4 %11, 0
  store float %13, float* %12, align 4
  %14 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 1
  %15 = extractvalue %struct.float4 %11, 1
  store float %15, float* %14, align 4
  %16 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 2
  %17 = extractvalue %struct.float4 %11, 2
  store float %17, float* %16, align 4
  %18 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 3
  %19 = extractvalue %struct.float4 %11, 3
  store float %19, float* %18, align 4
  %20 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %6, i32 0, i32 2
  %21 = load i64, i64* %3, align 8
  %22 = call %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1ElEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4l(%"struct.Eigen::TensorEvaluator.9"* %20, i64 %21)
  %23 = getelementptr inbounds %struct.float4, %struct.float4* %5, i32 0, i32 0
  %24 = extractvalue %struct.float4 %22, 0
  store float %24, float* %23, align 4
  %25 = getelementptr inbounds %struct.float4, %struct.float4* %5, i32 0, i32 1
  %26 = extractvalue %struct.float4 %22, 1
  store float %26, float* %25, align 4
  %27 = getelementptr inbounds %struct.float4, %struct.float4* %5, i32 0, i32 2
  %28 = extractvalue %struct.float4 %22, 2
  store float %28, float* %27, align 4
  %29 = getelementptr inbounds %struct.float4, %struct.float4* %5, i32 0, i32 3
  %30 = extractvalue %struct.float4 %22, 3
  store float %30, float* %29, align 4
  %31 = call %struct.float4 @_ZNK5Eigen8internal23scalar_sqrt_gradient_opIfE8packetOpI6float4EEKT_RS6_S7_(%"struct.Eigen::internal::scalar_sqrt_gradient_op"* %7, %struct.float4* dereferenceable(16) %4, %struct.float4* dereferenceable(16) %5)
  %32 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %33 = extractvalue %struct.float4 %31, 0
  store float %33, float* %32, align 4
  %34 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %35 = extractvalue %struct.float4 %31, 1
  store float %35, float* %34, align 4
  %36 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %37 = extractvalue %struct.float4 %31, 2
  store float %37, float* %36, align 4
  %38 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %39 = extractvalue %struct.float4 %31, 3
  store float %39, float* %38, align 4
  %40 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %40
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZNK5Eigen8internal23scalar_sqrt_gradient_opIfE8packetOpI6float4EEKT_RS6_S7_(%"struct.Eigen::internal::scalar_sqrt_gradient_op"* %this, %struct.float4* dereferenceable(16) %output, %struct.float4* dereferenceable(16) %output_gradient) #4 comdat align 2 {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %"struct.Eigen::internal::scalar_sqrt_gradient_op"*, align 8
  %3 = alloca %struct.float4*, align 8
  %4 = alloca %struct.float4*, align 8
  %const_half = alloca %struct.float4, align 4
  %5 = alloca float, align 4
  %out_conj = alloca %struct.float4, align 4
  %6 = alloca %struct.float4, align 4
  store %"struct.Eigen::internal::scalar_sqrt_gradient_op"* %this, %"struct.Eigen::internal::scalar_sqrt_gradient_op"** %2, align 8
  store %struct.float4* %output, %struct.float4** %3, align 8
  store %struct.float4* %output_gradient, %struct.float4** %4, align 8
  %7 = load %"struct.Eigen::internal::scalar_sqrt_gradient_op"*, %"struct.Eigen::internal::scalar_sqrt_gradient_op"** %2, align 8
  store float 5.000000e-01, float* %5, align 4
  %8 = call %struct.float4 @_ZN5Eigen8internal5pset1I6float4EET_RKNS0_15unpacket_traitsIS3_E4typeE(float* dereferenceable(4) %5)
  %9 = getelementptr inbounds %struct.float4, %struct.float4* %const_half, i32 0, i32 0
  %10 = extractvalue %struct.float4 %8, 0
  store float %10, float* %9, align 4
  %11 = getelementptr inbounds %struct.float4, %struct.float4* %const_half, i32 0, i32 1
  %12 = extractvalue %struct.float4 %8, 1
  store float %12, float* %11, align 4
  %13 = getelementptr inbounds %struct.float4, %struct.float4* %const_half, i32 0, i32 2
  %14 = extractvalue %struct.float4 %8, 2
  store float %14, float* %13, align 4
  %15 = getelementptr inbounds %struct.float4, %struct.float4* %const_half, i32 0, i32 3
  %16 = extractvalue %struct.float4 %8, 3
  store float %16, float* %15, align 4
  %17 = load %struct.float4*, %struct.float4** %3, align 8
  %18 = call %struct.float4 @_ZN5Eigen8internal5pconjI6float4EET_RKS3_(%struct.float4* dereferenceable(16) %17)
  %19 = getelementptr inbounds %struct.float4, %struct.float4* %out_conj, i32 0, i32 0
  %20 = extractvalue %struct.float4 %18, 0
  store float %20, float* %19, align 4
  %21 = getelementptr inbounds %struct.float4, %struct.float4* %out_conj, i32 0, i32 1
  %22 = extractvalue %struct.float4 %18, 1
  store float %22, float* %21, align 4
  %23 = getelementptr inbounds %struct.float4, %struct.float4* %out_conj, i32 0, i32 2
  %24 = extractvalue %struct.float4 %18, 2
  store float %24, float* %23, align 4
  %25 = getelementptr inbounds %struct.float4, %struct.float4* %out_conj, i32 0, i32 3
  %26 = extractvalue %struct.float4 %18, 3
  store float %26, float* %25, align 4
  %27 = load %struct.float4*, %struct.float4** %4, align 8
  %28 = call %struct.float4 @_ZN5Eigen8internal4pmulI6float4EET_RKS3_S5_(%struct.float4* dereferenceable(16) %const_half, %struct.float4* dereferenceable(16) %27)
  %29 = getelementptr inbounds %struct.float4, %struct.float4* %6, i32 0, i32 0
  %30 = extractvalue %struct.float4 %28, 0
  store float %30, float* %29, align 4
  %31 = getelementptr inbounds %struct.float4, %struct.float4* %6, i32 0, i32 1
  %32 = extractvalue %struct.float4 %28, 1
  store float %32, float* %31, align 4
  %33 = getelementptr inbounds %struct.float4, %struct.float4* %6, i32 0, i32 2
  %34 = extractvalue %struct.float4 %28, 2
  store float %34, float* %33, align 4
  %35 = getelementptr inbounds %struct.float4, %struct.float4* %6, i32 0, i32 3
  %36 = extractvalue %struct.float4 %28, 3
  store float %36, float* %35, align 4
  %37 = call %struct.float4 @_ZN5Eigen8internal4pdivI6float4EET_RKS3_S5_(%struct.float4* dereferenceable(16) %6, %struct.float4* dereferenceable(16) %out_conj)
  %38 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %39 = extractvalue %struct.float4 %37, 0
  store float %39, float* %38, align 4
  %40 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %41 = extractvalue %struct.float4 %37, 1
  store float %41, float* %40, align 4
  %42 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %43 = extractvalue %struct.float4 %37, 2
  store float %43, float* %42, align 4
  %44 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %45 = extractvalue %struct.float4 %37, 3
  store float %45, float* %44, align 4
  %46 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %46
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1ElEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4l(%"struct.Eigen::TensorEvaluator.9"* %this, i64 %index) #4 comdat align 2 {
  %1 = alloca %struct.float4, align 4
  %2 = alloca float*, align 8
  %3 = alloca %struct.float4, align 4
  %4 = alloca %"struct.Eigen::TensorEvaluator.9"*, align 8
  %5 = alloca i64, align 8
  store %"struct.Eigen::TensorEvaluator.9"* %this, %"struct.Eigen::TensorEvaluator.9"** %4, align 8
  store i64 %index, i64* %5, align 8
  %6 = load %"struct.Eigen::TensorEvaluator.9"*, %"struct.Eigen::TensorEvaluator.9"** %4, align 8
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.9", %"struct.Eigen::TensorEvaluator.9"* %6, i32 0, i32 0
  %8 = load float*, float** %7, align 8
  %9 = load i64, i64* %5, align 8
  %10 = getelementptr inbounds float, float* %8, i64 %9
  store float* %10, float** %2, align 8
  %11 = load float*, float** %2, align 8
  %12 = load float, float* %11, align 4
  %13 = load float*, float** %2, align 8
  %14 = getelementptr inbounds float, float* %13, i64 1
  %15 = load float, float* %14, align 4
  %16 = load float*, float** %2, align 8
  %17 = getelementptr inbounds float, float* %16, i64 2
  %18 = load float, float* %17, align 4
  %19 = load float*, float** %2, align 8
  %20 = getelementptr inbounds float, float* %19, i64 3
  %21 = load float, float* %20, align 4
  %22 = call %struct.float4 @_Z11make_float4ffff(float %12, float %15, float %18, float %21)
  %23 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %24 = extractvalue %struct.float4 %22, 0
  store float %24, float* %23, align 4
  %25 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %26 = extractvalue %struct.float4 %22, 1
  store float %26, float* %25, align 4
  %27 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %28 = extractvalue %struct.float4 %22, 2
  store float %28, float* %27, align 4
  %29 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %30 = extractvalue %struct.float4 %22, 3
  store float %30, float* %29, align 4
  %31 = load %struct.float4, %struct.float4* %1, align 4
  %32 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 0
  %33 = extractvalue %struct.float4 %31, 0
  store float %33, float* %32, align 4
  %34 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 1
  %35 = extractvalue %struct.float4 %31, 1
  store float %35, float* %34, align 4
  %36 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 2
  %37 = extractvalue %struct.float4 %31, 2
  store float %37, float* %36, align 4
  %38 = getelementptr inbounds %struct.float4, %struct.float4* %3, i32 0, i32 3
  %39 = extractvalue %struct.float4 %31, 3
  store float %39, float* %38, align 4
  %40 = load %struct.float4, %struct.float4* %3, align 4
  ret %struct.float4 %40
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZN5Eigen8internal5pset1I6float4EET_RKNS0_15unpacket_traitsIS3_E4typeE(float* dereferenceable(4) %from) #4 comdat {
  %1 = alloca %struct.float4, align 4
  %2 = alloca float*, align 8
  store float* %from, float** %2, align 8
  %3 = load float*, float** %2, align 8
  %4 = load float, float* %3, align 4
  %5 = load float*, float** %2, align 8
  %6 = load float, float* %5, align 4
  %7 = load float*, float** %2, align 8
  %8 = load float, float* %7, align 4
  %9 = load float*, float** %2, align 8
  %10 = load float, float* %9, align 4
  %11 = call %struct.float4 @_Z11make_float4ffff(float %4, float %6, float %8, float %10)
  %12 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %13 = extractvalue %struct.float4 %11, 0
  store float %13, float* %12, align 4
  %14 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %15 = extractvalue %struct.float4 %11, 1
  store float %15, float* %14, align 4
  %16 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %17 = extractvalue %struct.float4 %11, 2
  store float %17, float* %16, align 4
  %18 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %19 = extractvalue %struct.float4 %11, 3
  store float %19, float* %18, align 4
  %20 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %20
 }

 ; Function Attrs: inlinehint nounwind
 define linkonce_odr %struct.float4 @_ZN5Eigen8internal5pconjI6float4EET_RKS3_(%struct.float4* dereferenceable(16) %a) #5 comdat {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %struct.float4*, align 8
  store %struct.float4* %a, %struct.float4** %2, align 8
  %3 = load %struct.float4*, %struct.float4** %2, align 8
  %4 = bitcast %struct.float4* %1 to i8*
  %5 = bitcast %struct.float4* %3 to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 16, i32 4, i1 false)
  %6 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %6
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZN5Eigen8internal4pdivI6float4EET_RKS3_S5_(%struct.float4* dereferenceable(16) %a, %struct.float4* dereferenceable(16) %b) #4 comdat {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %struct.float4*, align 8
  %3 = alloca %struct.float4*, align 8
  store %struct.float4* %a, %struct.float4** %2, align 8
  store %struct.float4* %b, %struct.float4** %3, align 8
  %4 = load %struct.float4*, %struct.float4** %2, align 8
  %5 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 0
  %6 = load float, float* %5, align 4
  %7 = load %struct.float4*, %struct.float4** %3, align 8
  %8 = getelementptr inbounds %struct.float4, %struct.float4* %7, i32 0, i32 0
  %9 = load float, float* %8, align 4
  %10 = fdiv float %6, %9
  %11 = load %struct.float4*, %struct.float4** %2, align 8
  %12 = getelementptr inbounds %struct.float4, %struct.float4* %11, i32 0, i32 1
  %13 = load float, float* %12, align 4
  %14 = load %struct.float4*, %struct.float4** %3, align 8
  %15 = getelementptr inbounds %struct.float4, %struct.float4* %14, i32 0, i32 1
  %16 = load float, float* %15, align 4
  %17 = fdiv float %13, %16
  %18 = load %struct.float4*, %struct.float4** %2, align 8
  %19 = getelementptr inbounds %struct.float4, %struct.float4* %18, i32 0, i32 2
  %20 = load float, float* %19, align 4
  %21 = load %struct.float4*, %struct.float4** %3, align 8
  %22 = getelementptr inbounds %struct.float4, %struct.float4* %21, i32 0, i32 2
  %23 = load float, float* %22, align 4
  %24 = fdiv float %20, %23
  %25 = load %struct.float4*, %struct.float4** %2, align 8
  %26 = getelementptr inbounds %struct.float4, %struct.float4* %25, i32 0, i32 3
  %27 = load float, float* %26, align 4
  %28 = load %struct.float4*, %struct.float4** %3, align 8
  %29 = getelementptr inbounds %struct.float4, %struct.float4* %28, i32 0, i32 3
  %30 = load float, float* %29, align 4
  %31 = fdiv float %27, %30
  %32 = call %struct.float4 @_Z11make_float4ffff(float %10, float %17, float %24, float %31)
  %33 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %34 = extractvalue %struct.float4 %32, 0
  store float %34, float* %33, align 4
  %35 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %36 = extractvalue %struct.float4 %32, 1
  store float %36, float* %35, align 4
  %37 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %38 = extractvalue %struct.float4 %32, 2
  store float %38, float* %37, align 4
  %39 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %40 = extractvalue %struct.float4 %32, 3
  store float %40, float* %39, align 4
  %41 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %41
 }

 ; Function Attrs: inlinehint
 define linkonce_odr %struct.float4 @_ZN5Eigen8internal4pmulI6float4EET_RKS3_S5_(%struct.float4* dereferenceable(16) %a, %struct.float4* dereferenceable(16) %b) #4 comdat {
  %1 = alloca %struct.float4, align 4
  %2 = alloca %struct.float4*, align 8
  %3 = alloca %struct.float4*, align 8
  store %struct.float4* %a, %struct.float4** %2, align 8
  store %struct.float4* %b, %struct.float4** %3, align 8
  %4 = load %struct.float4*, %struct.float4** %2, align 8
  %5 = getelementptr inbounds %struct.float4, %struct.float4* %4, i32 0, i32 0
  %6 = load float, float* %5, align 4
  %7 = load %struct.float4*, %struct.float4** %3, align 8
  %8 = getelementptr inbounds %struct.float4, %struct.float4* %7, i32 0, i32 0
  %9 = load float, float* %8, align 4
  %10 = fmul float %6, %9
  %11 = load %struct.float4*, %struct.float4** %2, align 8
  %12 = getelementptr inbounds %struct.float4, %struct.float4* %11, i32 0, i32 1
  %13 = load float, float* %12, align 4
  %14 = load %struct.float4*, %struct.float4** %3, align 8
  %15 = getelementptr inbounds %struct.float4, %struct.float4* %14, i32 0, i32 1
  %16 = load float, float* %15, align 4
  %17 = fmul float %13, %16
  %18 = load %struct.float4*, %struct.float4** %2, align 8
  %19 = getelementptr inbounds %struct.float4, %struct.float4* %18, i32 0, i32 2
  %20 = load float, float* %19, align 4
  %21 = load %struct.float4*, %struct.float4** %3, align 8
  %22 = getelementptr inbounds %struct.float4, %struct.float4* %21, i32 0, i32 2
  %23 = load float, float* %22, align 4
  %24 = fmul float %20, %23
  %25 = load %struct.float4*, %struct.float4** %2, align 8
  %26 = getelementptr inbounds %struct.float4, %struct.float4* %25, i32 0, i32 3
  %27 = load float, float* %26, align 4
  %28 = load %struct.float4*, %struct.float4** %3, align 8
  %29 = getelementptr inbounds %struct.float4, %struct.float4* %28, i32 0, i32 3
  %30 = load float, float* %29, align 4
  %31 = fmul float %27, %30
  %32 = call %struct.float4 @_Z11make_float4ffff(float %10, float %17, float %24, float %31)
  %33 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 0
  %34 = extractvalue %struct.float4 %32, 0
  store float %34, float* %33, align 4
  %35 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 1
  %36 = extractvalue %struct.float4 %32, 1
  store float %36, float* %35, align 4
  %37 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 2
  %38 = extractvalue %struct.float4 %32, 2
  store float %38, float* %37, align 4
  %39 = getelementptr inbounds %struct.float4, %struct.float4* %1, i32 0, i32 3
  %40 = extractvalue %struct.float4 %32, 3
  store float %40, float* %39, align 4
  %41 = load %struct.float4, %struct.float4* %1, align 4
  ret %struct.float4 %41
 }

 define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_19TensorCwiseBinaryOpINS_8internal23scalar_sqrt_gradient_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS5_INS6_IS7_Li1ELi1ElEELi16ES9_EEEENS_9GpuDeviceEE5coeffEl(%"struct.Eigen::TensorEvaluator.8"* %this, i64 %index) #2 comdat align 2 {
  %1 = alloca %"struct.Eigen::TensorEvaluator.8"*, align 8
  %2 = alloca i64, align 8
  %3 = alloca float, align 4
  %4 = alloca float, align 4
  store %"struct.Eigen::TensorEvaluator.8"* %this, %"struct.Eigen::TensorEvaluator.8"** %1, align 8
  store i64 %index, i64* %2, align 8
  %5 = load %"struct.Eigen::TensorEvaluator.8"*, %"struct.Eigen::TensorEvaluator.8"** %1, align 8
  %6 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %5, i32 0, i32 0
  %7 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %5, i32 0, i32 1
  %8 = load i64, i64* %2, align 8
  %9 = trunc i64 %8 to i32
  %10 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi(%"struct.Eigen::TensorEvaluator.3"* %7, i32 %9)
  store float %10, float* %3, align 4
  %11 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.8", %"struct.Eigen::TensorEvaluator.8"* %5, i32 0, i32 2
  %12 = load i64, i64* %2, align 8
  %13 = call float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1ElEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEl(%"struct.Eigen::TensorEvaluator.9"* %11, i64 %12)
  store float %13, float* %4, align 4
  %14 = call float @_ZNK5Eigen8internal23scalar_sqrt_gradient_opIfEclERKfS4_(%"struct.Eigen::internal::scalar_sqrt_gradient_op"* %6, float* dereferenceable(4) %3, float* dereferenceable(4) %4)
  ret float %14
 }

 ; Function Attrs: inlinehint
 define linkonce_odr float @_ZNK5Eigen8internal23scalar_sqrt_gradient_opIfEclERKfS4_(%"struct.Eigen::internal::scalar_sqrt_gradient_op"* %this, float* dereferenceable(4) %output, float* dereferenceable(4) %output_gradient) #4 comdat align 2 {
  %1 = alloca %"struct.Eigen::internal::scalar_sqrt_gradient_op"*, align 8
  %2 = alloca float*, align 8
  %3 = alloca float*, align 8
  %out_conj = alloca float, align 4
  store %"struct.Eigen::internal::scalar_sqrt_gradient_op"* %this, %"struct.Eigen::internal::scalar_sqrt_gradient_op"** %1, align 8
  store float* %output, float** %2, align 8
  store float* %output_gradient, float** %3, align 8
  %4 = load %"struct.Eigen::internal::scalar_sqrt_gradient_op"*, %"struct.Eigen::internal::scalar_sqrt_gradient_op"** %1, align 8
  %5 = load float*, float** %2, align 8
  %6 = call float @_ZN5Eigen6numext4conjIfEENS_8internal11conj_retvalINS2_36global_math_functions_filtering_baseIT_vE4typeEE4typeERKS5_(float* dereferenceable(4) %5)
  store float %6, float* %out_conj, align 4
  %7 = load float*, float** %3, align 8
  %8 = load float, float* %7, align 4
  %9 = fmul float 5.000000e-01, %8
  %10 = load float, float* %out_conj, align 4
  %11 = fdiv float %9, %10
  ret float %11
 }

 ; Function Attrs: inlinehint nounwind
 define linkonce_odr float @_ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1ElEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEl(%"struct.Eigen::TensorEvaluator.9"* %this, i64 %index) #5 comdat align 2 {
  %1 = alloca float*, align 8
  %2 = alloca %"struct.Eigen::TensorEvaluator.9"*, align 8
  %3 = alloca i64, align 8
  store %"struct.Eigen::TensorEvaluator.9"* %this, %"struct.Eigen::TensorEvaluator.9"** %2, align 8
  store i64 %index, i64* %3, align 8
  %4 = load %"struct.Eigen::TensorEvaluator.9"*, %"struct.Eigen::TensorEvaluator.9"** %2, align 8
  %5 = getelementptr inbounds %"struct.Eigen::TensorEvaluator.9", %"struct.Eigen::TensorEvaluator.9"* %4, i32 0, i32 0
  %6 = load float*, float** %5, align 8
  %7 = load i64, i64* %3, align 8
  %8 = getelementptr inbounds float, float* %6, i64 %7
  store float* %8, float** %1, align 8
  %9 = load float*, float** %1, align 8
  %10 = load float, float* %9, align 4
  ret float %10
 }

 ; Function Attrs: inlinehint
 define linkonce_odr float @_ZN5Eigen6numext4conjIfEENS_8internal11conj_retvalINS2_36global_math_functions_filtering_baseIT_vE4typeEE4typeERKS5_(float* dereferenceable(4) %x) #4 comdat {
  %1 = alloca float*, align 8
  store float* %x, float** %1, align 8
  %2 = load float*, float** %1, align 8
  %3 = call float @_ZN5Eigen8internal9conj_implIfLb0EE3runERKf(float* dereferenceable(4) %2)
  ret float %3
 }

 ; Function Attrs: inlinehint nounwind
 define linkonce_odr float @_ZN5Eigen8internal9conj_implIfLb0EE3runERKf(float* dereferenceable(4) %x) #5 comdat align 2 {
  %1 = alloca float*, align 8
  store float* %x, float** %1, align 8
  %2 = load float*, float** %1, align 8
  %3 = load float, float* %2, align 4
  ret float %3
 }

 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #3 = { nounwind readnone }
 attributes #4 = { inlinehint "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #5 = { inlinehint nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_30" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #6 = { argmemonly nounwind }
 attributes #7 = { nounwind }

 !nvvm.annotations = !{!0, !1, !2, !3}
 !llvm.module.flags = !{!4}
 !llvm.ident = !{!5}

 !0 = !{void (%"struct.Eigen::TensorEvaluator"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"kernel", i32 1}
 !1 = !{void (%"struct.Eigen::TensorEvaluator"*, i32)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_, !"maxntidx", i32 1024}
 !2 = !{void (%"struct.Eigen::TensorEvaluator.7"*, i64)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS0_23scalar_sqrt_gradient_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEKNS4_INS5_ISC_Li1ELi1ElEELi16ES7_EEEEEENS_9GpuDeviceEEElEEvT_T0_, !"kernel", i32 1}
 !3 = !{void (%"struct.Eigen::TensorEvaluator.7"*, i64)* @_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_19TensorCwiseBinaryOpINS0_23scalar_sqrt_gradient_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEKNS4_INS5_ISC_Li1ELi1ElEELi16ES7_EEEEEENS_9GpuDeviceEEElEEvT_T0_, !"maxntidx", i32 1024}
 !4 = !{i32 1, !"PIC Level", i32 2}
 !5 = !{!"clang version 3.8.0-2ubuntu4 (tags/RELEASE_380/final)"}
	struct class_Eigen__array {
	int f0[1];
	};
	struct Eigen__internal__scalar_sqrt_op {
	char f0;
	};
	struct Eigen__DSizes {
	struct class_Eigen__array f0;
	};
	struct Eigen__TensorEvaluator_3 {
	global float* f0;
	struct Eigen__DSizes f1;
	global struct Eigen__GpuDevice* f2;
	global struct class_Eigen__TensorMap_4* f3;
	};
	struct class_Eigen__TensorMap_4 {
	global float* f0;
	struct Eigen__DSizes f1;
	char f2[4];
	};
	struct Eigen__TensorEvaluator_0 {
	global float* f0;
	struct Eigen__DSizes f1;
	global struct Eigen__GpuDevice* f2;
	global struct class_Eigen__TensorMap* f3;
	};
	struct Eigen__TensorEvaluator_2 {
	struct Eigen__internal__scalar_sqrt_op f0;
	struct Eigen__TensorEvaluator_3 f1;
	};
	struct Eigen__GpuDevice {
	global struct class_Eigen__StreamInterface* f0;
	int f1;
	char f2[4];
	};
	struct class_Eigen__TensorMap {
	global float* f0;
	struct Eigen__DSizes f1;
	char f2[4];
	};
	struct class_Eigen__StreamInterface {
	};
	struct Eigen__TensorEvaluator {
	struct Eigen__TensorEvaluator_0 f0;
	struct Eigen__TensorEvaluator_2 f1;
	};
	struct class_Eigen__array_nopointers {
	int f0[1];
	};
	struct Eigen__DSizes_nopointers {
	struct class_Eigen__array_nopointers f0;
	};
	struct Eigen__TensorEvaluator_0_nopointers {
	struct Eigen__DSizes_nopointers f0;
	};
	struct Eigen__internal__scalar_sqrt_op_nopointers {
	char f0;
	};
	struct Eigen__TensorEvaluator_3_nopointers {
	struct Eigen__DSizes_nopointers f0;
	};
	struct Eigen__TensorEvaluator_2_nopointers {
	struct Eigen__internal__scalar_sqrt_op_nopointers f0;
	struct Eigen__TensorEvaluator_3_nopointers f1;
	};
	struct Eigen__TensorEvaluator_nopointers {
	struct Eigen__TensorEvaluator_0_nopointers f0;
	struct Eigen__TensorEvaluator_2_nopointers f1;
	};

	float _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch);
	float _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch);
	float _ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf(struct Eigen__internal__scalar_sqrt_op* this, float* a, local int *scratch);
	float4 _ZN5Eigen8internal5psqrtI6float4EET_RKS3_(float4* a, local int *scratch);
	float4 _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch);
	float4 _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch);
	float4 _ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_(struct Eigen__internal__scalar_sqrt_op* this, float4* a, local int *scratch);
	global float* _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi(struct Eigen__TensorEvaluator_0* this, int index, local int *scratch);
	kernel void _ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_(global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, uint eval_ptr0_offset, global float* eval_ptr1, uint eval_ptr1_offset, int size, local int *scratch);
	void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch);
	void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch);
	void _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4(struct Eigen__TensorEvaluator_0* this, int index, float4* x, local int *scratch);
	void _ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0__gp(global float* to, float4* from, local int *scratch);

	kernel void _ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_(global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, uint eval_ptr0_offset, global float* eval_ptr1, uint eval_ptr1_offset, int size, local int *scratch) {
	eval_ptr1 += eval_ptr1_offset;
	eval_ptr0 += eval_ptr0_offset;
	struct Eigen__TensorEvaluator eval[1];
	eval[0].f0.f0 = 0;
	eval[0].f0.f1.f0.f0[0] = eval_nopointers[0].f0.f0.f0.f0[0];
	eval[0].f0.f2 = 0;
	eval[0].f0.f3 = 0;
	eval[0].f1.f0.f0 = eval_nopointers[0].f1.f0.f0;
	eval[0].f1.f1.f0 = 0;
	eval[0].f1.f1.f1.f0.f0[0] = eval_nopointers[0].f1.f1.f0.f0.f0[0];
	eval[0].f1.f1.f2 = 0;
	eval[0].f1.f1.f3 = 0;
	eval[0].f0.f0 = eval_ptr0;
	eval[0].f1.f1.f0 = eval_ptr1;

	char v20[1];
	int v10[1];
	int v11[1];
	int v12[1];
	int v13[1];
	int v14[1];
	int v15[1];
	int v16[1];
	int v17[1];
	int v18[1];
	int v19[1];
	int v22;
	int v23;
	int v25;
	int v26;
	int v28;
	int v29;
	int v30;
	int v34;
	int v35;
	int v36;
	int v43;
	int v45;
	int v47;
	int v48;
	int v50;
	int v51;
	int v53;
	int v54;
	int v57;
	int v59;
	int v60;
	int v61;
	int v63;
	int v64;
	int v65;
	int v67;
	int v68;
	int v71;
	int v73;
	int v74;
	int v75;
	int v9[1];
	struct Eigen__TensorEvaluator* v56;
	struct Eigen__TensorEvaluator* v70;
	struct Eigen__TensorEvaluator* v8[1];

	v1:;
	v17[0] = size;
	v22 = get_group_id(0);
	v23 = get_local_size(0);
	v25 = get_local_id(0);
	v26 = (v22 * v23) + v25;
	v18[0] = v26;
	v28 = get_local_size(0);
	v29 = get_num_groups(0);
	v30 = v28 * v29;
	v19[0] = v30;
	v20[0] = 1;
	v34 = v18[0];
	v35 = v17[0];
	v36 = v19[0];
	v8[0] = eval;
	v9[0] = v34;
	v10[0] = v35;
	v11[0] = v36;
	v12[0] = 4;
	v43 = v10[0];
	v45 = (v43 / 4) * 4;
	v13[0] = v45;
	v47 = v11[0];
	v48 = v47 * 4;
	v14[0] = v48;
	v50 = v9[0];
	v51 = v50 * 4;
	v15[0] = v51;
	goto v2;
	v2:;
	v53 = v15[0];
	v54 = v13[0];
	if (v53 < v54) {
	goto v3;
	} else {
	goto v4;
	}
	v3:;
	v56 = v8[0];
	v57 = v15[0];
	_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(v56, v57, scratch);
	v59 = v14[0];
	v60 = v15[0];
	v61 = v60 + v59;
	v15[0] = v61;
	goto v2;
	v4:;
	v63 = v13[0];
	v64 = v9[0];
	v65 = v63 + v64;
	v16[0] = v65;
	goto v5;
	v5:;
	v67 = v16[0];
	v68 = v10[0];
	if (v67 < v68) {
	goto v6;
	} else {
	goto v7;
	}
	v6:;
	v70 = v8[0];
	v71 = v16[0];
	_ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(v70, v71, scratch);
	v73 = v11[0];
	v74 = v16[0];
	v75 = v74 + v73;
	v16[0] = v75;
	goto v5;
	v7:;
	return;
	}
	void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalPacketEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch) {
	float v20;
	float v23;
	float v26;
	float v29;
	float* v19;
	float* v22;
	float* v25;
	float* v28;
	float4 v18;
	float4 v6[1];
	int v15;
	int v17;
	int v3[1];
	int v4[1];
	int v5[1];
	struct Eigen__TensorEvaluator* v2[1];
	struct Eigen__TensorEvaluator* v9;

	v1:;
	v2[0] = this;
	v3[0] = i;
	v9 = v2[0];
	v4[0] = 16;
	v5[0] = 16;
	v15 = v3[0];
	v17 = v3[0];
	v18 = _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i((&(v9[0].f1)), v17, scratch);
	v19 = (&(((float*)&v6[0])[0]));
	v20 = ((float*)&v18)[0];
	v19[0] = v20;
	v22 = (&(((float*)&v6[0])[1]));
	v23 = ((float*)&v18)[1];
	v22[0] = v23;
	v25 = (&(((float*)&v6[0])[2]));
	v26 = ((float*)&v18)[2];
	v25[0] = v26;
	v28 = (&(((float*)&v6[0])[3]));
	v29 = ((float*)&v18)[3];
	v28[0] = v29;
	_ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4((&(v9[0].f0)), v15, v6, scratch);
	return;
	}
	void _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE11writePacketILi16EEEviRK6float4(struct Eigen__TensorEvaluator_0* this, int index, float4* x, local int *scratch) {
	float4* v17;
	float4* v21;
	float4* v3[1];
	float4* v6[1];
	global float* v13;
	global float* v16;
	global float* v20;
	global float* v2[1];
	int v14;
	int v5[1];
	struct Eigen__TensorEvaluator_0* v10;
	struct Eigen__TensorEvaluator_0* v4[1];

	v1:;
	v4[0] = this;
	v5[0] = index;
	v6[0] = x;
	v10 = v4[0];
	v13 = (&(v10[0].f0))[0];
	v14 = v5[0];
	v16 = (&(v13[v14]));
	v17 = v6[0];
	v2[0] = v16;
	v3[0] = v17;
	v20 = v2[0];
	v21 = v3[0];
	_ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0__gp(v20, v21, scratch);
	return;
	}
	global float* _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi(struct Eigen__TensorEvaluator_0* this, int index, local int *scratch) {
	global float* v11;
	int v3[1];
	int v7;
	struct Eigen__TensorEvaluator_0* v2[1];
	struct Eigen__TensorEvaluator_0* v6;

	v1:;
	v2[0] = this;
	v3[0] = index;
	v6 = v2[0];
	v7 = v3[0];
	v11 = (&(v6[0].f0))[0];
	return (&(v11[v7]));
	}
	float _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch) {
	float v12;
	float v14;
	float v4[1];
	int v11;
	int v3[1];
	struct Eigen__TensorEvaluator_2* v2[1];
	struct Eigen__TensorEvaluator_2* v7;

	v1:;
	v2[0] = this;
	v3[0] = index;
	v7 = v2[0];
	v11 = v3[0];
	v12 = _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi((&(v7[0].f1)), v11, scratch);
	v4[0] = v12;
	v14 = _ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf((&(v7[0].f0)), v4, scratch);
	return v14;
	}
	float4 _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_2* this, int index, local int *scratch) {
	float v15;
	float v18;
	float v21;
	float v24;
	float v28;
	float v31;
	float v34;
	float v37;
	float* v14;
	float* v17;
	float* v20;
	float* v23;
	float* v27;
	float* v30;
	float* v33;
	float* v36;
	float4 v13;
	float4 v26;
	float4 v2[1];
	float4 v39;
	float4 v5[1];
	int v12;
	int v4[1];
	struct Eigen__TensorEvaluator_2* v3[1];
	struct Eigen__TensorEvaluator_2* v8;

	v1:;
	v3[0] = this;
	v4[0] = index;
	v8 = v3[0];
	v12 = v4[0];
	v13 = _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i((&(v8[0].f1)), v12, scratch);
	v14 = (&(((float*)&v5[0])[0]));
	v15 = ((float*)&v13)[0];
	v14[0] = v15;
	v17 = (&(((float*)&v5[0])[1]));
	v18 = ((float*)&v13)[1];
	v17[0] = v18;
	v20 = (&(((float*)&v5[0])[2]));
	v21 = ((float*)&v13)[2];
	v20[0] = v21;
	v23 = (&(((float*)&v5[0])[3]));
	v24 = ((float*)&v13)[3];
	v23[0] = v24;
	v26 = _ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_((&(v8[0].f0)), v5, scratch);
	v27 = (&(((float*)&v2[0])[0]));
	v28 = ((float*)&v26)[0];
	v27[0] = v28;
	v30 = (&(((float*)&v2[0])[1]));
	v31 = ((float*)&v26)[1];
	v30[0] = v31;
	v33 = (&(((float*)&v2[0])[2]));
	v34 = ((float*)&v26)[2];
	v33[0] = v34;
	v36 = (&(((float*)&v2[0])[3]));
	v37 = ((float*)&v26)[3];
	v36[0] = v37;
	v39 = v2[0];
	return v39;
	}
	void _ZN5Eigen15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS2_INS3_IKfLi1ELi1EiEELi16ES5_EEEEEENS_9GpuDeviceEE10evalScalarEi(struct Eigen__TensorEvaluator* this, int i, local int *scratch) {
	float v10;
	global float* v13;
	int v12;
	int v3[1];
	int v9;
	struct Eigen__TensorEvaluator* v2[1];
	struct Eigen__TensorEvaluator* v6;

	v1:;
	v2[0] = this;
	v3[0] = i;
	v6 = v2[0];
	v9 = v3[0];
	v10 = _ZNK5Eigen15TensorEvaluatorIKNS_18TensorCwiseUnaryOpINS_8internal14scalar_sqrt_opIfEEKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEEEENS_9GpuDeviceEE5coeffEi((&(v6[0].f1)), v9, scratch);
	v12 = v3[0];
	v13 = _ZN5Eigen15TensorEvaluatorINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE8coeffRefEi((&(v6[0].f0)), v12, scratch);
	v13[0] = v10;
	return;
	}
	void _ZN5Eigen8internal6pstoreIf6float4EEvPT_RKT0__gp(global float* to, float4* from, local int *scratch) {
	float4* v3[1];
	float4* v8;
	global float* v2[1];
	global float* v6;
	global float4* v7;

	v1:;
	v2[0] = to;
	v3[0] = from;
	v6 = v2[0];
	v7 = (global float4*)v6;
	v8 = v3[0];
	for(int __i=0; __i < 4; __i++) {;
	((global int )((global char)v7))[__i] = (( int )((char)v8))[__i];
	}
	;
	return;
	}
	float _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE5coeffEi(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch) {
	float v16;
	global float* v10;
	global float* v13;
	global float* v15;
	global float* v2[1];
	int v11;
	int v4[1];
	struct Eigen__TensorEvaluator_3* v3[1];
	struct Eigen__TensorEvaluator_3* v7;

	v1:;
	v3[0] = this;
	v4[0] = index;
	v7 = v3[0];
	v10 = (&(v7[0].f0))[0];
	v11 = v4[0];
	v13 = (&(v10[v11]));
	v2[0] = v13;
	v15 = v2[0];
	v16 = v15[0];
	return v16;
	}
	float4 _ZNK5Eigen15TensorEvaluatorIKNS_9TensorMapINS_6TensorIKfLi1ELi1EiEELi16ENS_11MakePointerEEENS_9GpuDeviceEE6packetILi16EEE6float4i(struct Eigen__TensorEvaluator_3* this, int index, local int *scratch) {
	float v18;
	float v22;
	float v26;
	float v30;
	float v33;
	float v36;
	float v39;
	float v42;
	float v46;
	float v49;
	float v52;
	float v55;
	float* v32;
	float* v35;
	float* v38;
	float* v41;
	float* v45;
	float* v48;
	float* v51;
	float* v54;
	float4 v2[1];
	float4 v31;
	float4 v44;
	float4 v4[1];
	float4 v57;
	global float* v12;
	global float* v15;
	global float* v17;
	global float* v19;
	global float* v23;
	global float* v27;
	global float* v3[1];
	int v13;
	int v6[1];
	struct Eigen__TensorEvaluator_3* v5[1];
	struct Eigen__TensorEvaluator_3* v9;

	v1:;
	v5[0] = this;
	v6[0] = index;
	v9 = v5[0];
	v12 = (&(v9[0].f0))[0];
	v13 = v6[0];
	v15 = (&(v12[v13]));
	v3[0] = v15;
	v17 = v3[0];
	v18 = v17[0];
	v19 = v3[0];
	v22 = (&(v19[1]))[0];
	v23 = v3[0];
	v26 = (&(v23[2]))[0];
	v27 = v3[0];
	v30 = (&(v27[3]))[0];
	v31 = (float4)(v18, v22, v26, v30);
	v32 = (&(((float*)&v2[0])[0]));
	v33 = ((float*)&v31)[0];
	v32[0] = v33;
	v35 = (&(((float*)&v2[0])[1]));
	v36 = ((float*)&v31)[1];
	v35[0] = v36;
	v38 = (&(((float*)&v2[0])[2]));
	v39 = ((float*)&v31)[2];
	v38[0] = v39;
	v41 = (&(((float*)&v2[0])[3]));
	v42 = ((float*)&v31)[3];
	v41[0] = v42;
	v44 = v2[0];
	v45 = (&(((float*)&v4[0])[0]));
	v46 = ((float*)&v44)[0];
	v45[0] = v46;
	v48 = (&(((float*)&v4[0])[1]));
	v49 = ((float*)&v44)[1];
	v48[0] = v49;
	v51 = (&(((float*)&v4[0])[2]));
	v52 = ((float*)&v44)[2];
	v51[0] = v52;
	v54 = (&(((float*)&v4[0])[3]));
	v55 = ((float*)&v44)[3];
	v54[0] = v55;
	v57 = v4[0];
	return v57;
	}
	float4 _ZNK5Eigen8internal14scalar_sqrt_opIfE8packetOpI6float4EET_RKS5_(struct Eigen__internal__scalar_sqrt_op* this, float4* a, local int *scratch) {
	float v12;
	float v15;
	float v18;
	float v21;
	float* v10;
	float* v14;
	float* v17;
	float* v20;
	float4 v23;
	float4 v2[1];
	float4 v9;
	float4* v4[1];
	float4* v8;
	struct Eigen__internal__scalar_sqrt_op* v3[1];
	struct Eigen__internal__scalar_sqrt_op* v7;

	v1:;
	v3[0] = this;
	v4[0] = a;
	v7 = v3[0];
	v8 = v4[0];
	v9 = _ZN5Eigen8internal5psqrtI6float4EET_RKS3_(v8, scratch);
	v10 = (&(((float*)&v2[0])[0]));
	v12 = ((float*)&v9)[0];
	v10[0] = v12;
	v14 = (&(((float*)&v2[0])[1]));
	v15 = ((float*)&v9)[1];
	v14[0] = v15;
	v17 = (&(((float*)&v2[0])[2]));
	v18 = ((float*)&v9)[2];
	v17[0] = v18;
	v20 = (&(((float*)&v2[0])[3]));
	v21 = ((float*)&v9)[3];
	v20[0] = v21;
	v23 = v2[0];
	return v23;
	}
	float _ZNK5Eigen8internal14scalar_sqrt_opIfEclERKf(struct Eigen__internal__scalar_sqrt_op* this, float* a, local int *scratch) {
	float v11;
	float v13;
	float* v10;
	float* v2[1];
	float* v4[1];
	float* v8;
	struct Eigen__internal__scalar_sqrt_op* v3[1];
	struct Eigen__internal__scalar_sqrt_op* v7;

	v1:;
	v3[0] = this;
	v4[0] = a;
	v7 = v3[0];
	v8 = v4[0];
	v2[0] = v8;
	v10 = v2[0];
	v11 = v10[0];
	v13 = sqrt(v11);
	return (float)v13;
	}
	float4 _ZN5Eigen8internal5psqrtI6float4EET_RKS3_(float4* a, local int *scratch) {
	float v12;
	float v13;
	float v16;
	float v17;
	float v20;
	float v21;
	float v24;
	float v27;
	float v30;
	float v33;
	float v8;
	float v9;
	float* v23;
	float* v26;
	float* v29;
	float* v32;
	float4 v22;
	float4 v2[1];
	float4 v35;
	float4* v10;
	float4* v14;
	float4* v18;
	float4* v3[1];
	float4* v5;

	v1:;
	v3[0] = a;
	v5 = v3[0];
	v8 = (&(((float*)&v5[0])[0]))[0];
	v9 = sqrt(v8);
	v10 = v3[0];
	v12 = (&(((float*)&v10[0])[1]))[0];
	v13 = sqrt(v12);
	v14 = v3[0];
	v16 = (&(((float*)&v14[0])[2]))[0];
	v17 = sqrt(v16);
	v18 = v3[0];
	v20 = (&(((float*)&v18[0])[3]))[0];
	v21 = sqrt(v20);
	v22 = (float4)(v9, v13, v17, v21);
	v23 = (&(((float*)&v2[0])[0]));
	v24 = ((float*)&v22)[0];
	v23[0] = v24;
	v26 = (&(((float*)&v2[0])[1]));
	v27 = ((float*)&v22)[1];
	v26[0] = v27;
	v29 = (&(((float*)&v2[0])[2]));
	v30 = ((float*)&v22)[2];
	v29[0] = v30;
	v32 = (&(((float*)&v2[0])[3]));
	v33 = ((float*)&v22)[3];
	v32[0] = v33;
	v35 = v2[0];
	return v35;
	}