AllanChen · August 16, 2019 09:37
diff --git a/gistfile1.txt b/gistfile1.txt
 #import "AppDelegate.h"
 #include <stdio.h>
 @interface AppDelegate ()
 @end

 @implementation AppDelegate
 - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
    // Launch hook used as a scratch runner: one experiment is invoked per launch.
    // Experiments currently switched off in this method:
    //   namolData, test, test2, neonTest, test_cpp, neonPractice, test_2
    [self neonPractice2];

    return YES;
 }


 // Builds a small float matrix from a ramp buffer and logs every element of
 // outMat, channel by channel, through LOGV.
 // NOTE(review): both layout converters (mat_chwtoc8hw8_int32 /
 // mat_chwtoc4hw4_int32) are disabled here, so outMat is only
 // default-constructed when the dump loop runs -- confirm that is intended.
 - (void)test_2{

    int channel = 8;
    const int width = 8;
    const int height = 8;

    // Source buffer: width * height floats holding 0, 1, 2, ...
    const int value = width * height;
    float *input_value_f = new float[value];
    for (int idx = 0; idx < value; ++idx) {
        input_value_f[idx] = (float)idx;
    }

    // NOTE(review): the last argument is 4 while the buffer holds only
    // width * height elements and `channel` above is never used -- verify
    // against from_float_array's contract. TODO confirm who owns input_value_f.
    VenusCPU::Mat inMat = VenusCPU::Mat::from_float_array(input_value_f, width, height, 4);
    VenusCPU::Mat outMat;

    for (int c = 0; c < outMat.shape.c; ++c) {
        float *plane = outMat.channel(c);
        for (int h = 0; h < outMat.shape.h; ++h) {
            for (int w = 0; w < outMat.shape.w; ++w) {
                // Element (h, w) of the current channel, row-major.
                float v0 = plane[h * outMat.shape.w + w];
                LOGV("c %d h %d w %d  -- value -- %f", c, h, w, v0);
            }
        }
    }
 }

 // Repacks int32 data from planar channels into groups of four: for each group
 // of four consecutive input channels, the four values found at the same
 // spatial index are written next to each other in one output channel.
 //
 // inMat  - source; shape.c, shape.h, shape.w and shape.step_size are read.
 // outMat - destination; create() is called on it with
 //          TensorShape(c / 4, h, w * 4, C4HW4, sizeof(int), step_size).
 //
 // Only complete groups of four channels are converted; the c % 4 trailing
 // channels are not copied. Returns without writing when create() leaves
 // outMat empty.
 // NOTE(review): the indexing assumes every channel holds h * w contiguous
 // ints -- confirm against VenusCPU::Mat's channel stride.
 void mat_chwtoc4hw4_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
    const int C = inMat.shape.c;
    const int H = inMat.shape.h;
    const int W = inMat.shape.w;

    outMat.create(VenusCPU::TensorShape(C / 4, H, W * 4, VenusCPU::TensorShape::Layout::C4HW4, sizeof(int), inMat.shape.step_size));
    if (outMat.empty()) {
        assert(false && "creat mat failed!");
        // assert() expands to nothing under NDEBUG, so leave explicitly
        // instead of writing through an empty matrix.
        return;
    }

    const int plane = H * W;
    for (int z = 0; z < C / 4; z++) {
        int * input0 = inMat.channel(z * 4 + 0);
        int * input1 = inMat.channel(z * 4 + 1);
        int * input2 = inMat.channel(z * 4 + 2);
        int * input3 = inMat.channel(z * 4 + 3);

        int * output0 = outMat.channel(z);

        for (int i = 0; i < plane; i++) {
            int * packed = output0 + i * 4;
            packed[0] = input0[i];
            packed[1] = input1[i];
            packed[2] = input2[i];
            packed[3] = input3[i];
        }
    }
 }

 // Repacks int32 data from planar channels into groups of eight: for each
 // group of eight consecutive input channels, the eight values found at the
 // same spatial index are written next to each other in one output channel.
 //
 // inMat  - source; shape.c, shape.h, shape.w and shape.step_size are read.
 // outMat - destination; create() is called on it with
 //          TensorShape(c / 8, h, w * 8, C8HW8, sizeof(int), step_size).
 //
 // Only complete groups of eight channels are converted; the c % 8 trailing
 // channels are not copied. Returns without writing when create() leaves
 // outMat empty.
 // NOTE(review): the indexing assumes every channel holds h * w contiguous
 // ints -- confirm against VenusCPU::Mat's channel stride.
 void mat_chwtoc8hw8_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
    const int C = inMat.shape.c;
    const int H = inMat.shape.h;
    const int W = inMat.shape.w;

    outMat.create(VenusCPU::TensorShape(C / 8, H, W * 8, VenusCPU::TensorShape::Layout::C8HW8, sizeof(int), inMat.shape.step_size));
    if (outMat.empty()) {
        assert(false && "creat mat failed!");
        // assert() expands to nothing under NDEBUG, so leave explicitly
        // instead of writing through an empty matrix.
        return;
    }

    const int plane = H * W;
    for (int z = 0; z < C / 8; z++) {
        int * input0 = inMat.channel(z * 8 + 0);
        int * input1 = inMat.channel(z * 8 + 1);
        int * input2 = inMat.channel(z * 8 + 2);
        int * input3 = inMat.channel(z * 8 + 3);
        int * input4 = inMat.channel(z * 8 + 4);
        int * input5 = inMat.channel(z * 8 + 5);
        int * input6 = inMat.channel(z * 8 + 6);
        int * input7 = inMat.channel(z * 8 + 7);

        int * output0 = outMat.channel(z);

        for (int i = 0; i < plane; i++) {
            int * packed = output0 + i * 8;
            packed[0] = input0[i];
            packed[1] = input1[i];
            packed[2] = input2[i];
            packed[3] = input3[i];
            packed[4] = input4[i];
            packed[5] = input5[i];
            packed[6] = input6[i];
            packed[7] = input7[i];
        }
    }
 }


 // Scalar baseline: fills a 100 Mi-element float buffer with 100, computes
 // output[i] = input[i] * 2 + 3 in a plain loop, and prints the elapsed time
 // (allocation included) in milliseconds.
 - (void)test_cpp{
    double tic = CACurrentMediaTime();
    const int m_total = 1024 * 1024 * 100;

    float *input = new float[m_total];
    float *output = new float[m_total];

    for (int i = 0; i < m_total; i++) {
        input[i] = (float)100;
    }

    for (int i = 0; i < m_total; i++) {
        output[i] = input[i] * 2 + 3;
    }

    printf("cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));

    // Released after the timestamp so the reported figure covers the same
    // work as before; previously both 400 MB buffers were leaked.
    delete[] output;
    delete[] input;
 }

 // NEON practice: scales 100 floats (0..99) by alpha, converts them to int16
 // and prints every result on its own line.
 //
 // With NEON, each iteration loads four float32x4 vectors (16 floats), narrows
 // them to four int16x4 vectors and stores them with vst4_s16, which
 // interleaves the four vectors element by element. Elements that do not fill
 // a whole 16-element block (and all elements when NEON is unavailable) are
 // converted by a scalar loop and stored in order.
 //
 // An earlier variant with a second int16x4x4_t (eight vectors per iteration)
 // and vtrn1_s16 was left disabled by the author and is not reproduced here.
 - (void)neonPractice2{
    const int value = 10 * 10;
    const float alpha = 1.0f;

    float *input_value_f = new float[value];
    // Value-initialised so every slot that gets printed holds a defined value.
    int16_t *output_value_f = new int16_t[value]();

    for (int i = 0; i < value; i++) {
        input_value_f[i] = (float)i;
    }

    int converted = 0;
 #if __ARM_NEON
    const float *inputptr = input_value_f;
    int16_t *outputptr = output_value_f;
    const float32x4_t _alpha = vdupq_n_f32(alpha);

    // One iteration consumes 16 floats and emits 16 int16 values, so the trip
    // count is value / 16. The previous value / 4 ran past both buffers.
    const int nn = value >> 4;
    for (int j = 0; j < nn; ++j) {
        int16x4x4_t output_i16quant;
        for (int k = 0; k < 4; ++k) {
            float32x4_t scaled = vmulq_f32(vld1q_f32(inputptr), _alpha);
            output_i16quant.val[k] = vmovn_s32(vcvtq_s32_f32(scaled));
            inputptr += 4;
        }

        vst4_s16(outputptr, output_i16quant);
        outputptr += 16;
    }
    converted = nn << 4;
 #endif

    // Scalar tail for the elements the vector loop did not cover.
    for (int i = converted; i < value; i++) {
        output_value_f[i] = (int16_t)(int)(input_value_f[i] * alpha);
    }

    for (int i = 0; i < value; i++) {
        // %hd: the values are signed 16-bit integers.
        printf("%hd\n", output_value_f[i]);
    }

    delete[] output_value_f;
    delete[] input_value_f;
 }


 // AArch64 inline-assembly practice on a 100-int buffer holding 100..199.
 //
 // Each loop pass loads 32 bytes (eight ints) into v0/v1, runs an smull whose
 // result is immediately overwritten, and stores v0 (the first four ints of
 // the eight) to the output. The first 50 output ints are then printed; slots
 // the loop never wrote read as 0. Without AArch64 NEON only the zeroed output
 // is printed.
 - (void)neonPractice{
    const int value = 10 * 10;

    int *input_value_f = new int[value];
    // Value-initialised so the dump below never reads indeterminate memory.
    int *output_value_f = new int[value]();

    for (int i = 0; i < value; i++) {
        input_value_f[i] = i + 100;
    }

 #if __ARM_NEON && __aarch64__
    // Eight ints are consumed per pass, so the trip count is value / 8; the
    // previous value / 4 read 400 bytes past the end of the input buffer.
    int nn = value >> 3;
    int alpha = 1;
    const int *inputptr = input_value_f;
    int *outputptr = output_value_f;
    int16x4_t _alpha = vdup_n_s16(alpha);

    if (nn > 0) {
        asm volatile(
 #ifdef USE_PRFM
            "prfm    pldl1keep, [%[in], #128]       \n"
 #endif
            "0:                                     \n"
            "ld1     {v0.8h, v1.8h}, [%[in]], #32   \n"
            // Kept from the original experiment; v2 is overwritten by the mov.
            "smull   v2.4s, %[alpha].4h, v0.h[4]    \n"
            "mov     v2.16b, v0.16b                 \n"
            "subs    %w[n], %w[n], #1               \n"
            "st1     {v2.4s}, [%[out]], #16         \n"
            "bne     0b                             \n"
            // All three registers are updated by the loop, so they must be
            // read-write operands (the input pointer used to be input-only).
            : [n] "+r"(nn), [out] "+r"(outputptr), [in] "+r"(inputptr)
            : [alpha] "w"(_alpha)
            : "cc", "memory", "v0", "v1", "v2"
        );
    }
 #endif

    // Index from the buffer base: outputptr has been advanced by the loop.
    for (int i = 0; i < value / 2; i++) {
        printf("neon --output %d", output_value_f[i]);
    }

    delete[] output_value_f;
    delete[] input_value_f;
 }


 // Timed NEON counterpart of the scalar benchmark: fills a 100 Mi-element
 // float buffer with 100, computes output[i] = input[i] * alpha + beta, and
 // prints the elapsed time in milliseconds.
 //
 // On AArch64 the bulk is processed four floats at a time by an inline-asm
 // loop; whatever the asm loop did not cover (everything on other targets) is
 // handled by a scalar loop.
 - (void)neonTest{
    double tic = CACurrentMediaTime();
    const int total = 1024 * 1024 * 100;
    const float alpha = 1.0f;
    const float beta = 2.0f;

    float *input = new float[total];
    float *output = new float[total];
    for (int i = 0; i < total; i++) {
        input[i] = (float)100;
    }

    int done = 0;
 #if __ARM_NEON && __aarch64__
    const float *inputptr = input;
    // Results go to `output`; previously they were written back into `input`.
    float *outputptr = output;
    float32x4_t _alpha = vdupq_n_f32(alpha);
    float32x4_t _beta = vdupq_n_f32(beta);
    int nn = total >> 2;
    done = nn << 2;   // recorded now because the loop counts nn down to 0

    if (nn > 0) {
        asm volatile(
            "0:                                     \n"
            "prfm  pldl1keep, [%[in], #64]          \n"
            "ld1   {v2.4s}, [%[in]], #16            \n"
            "fmul  v3.4s, v2.4s, %[alpha].4s        \n"
            "fadd  v4.4s, v3.4s, %[beta].4s         \n"
            "subs  %w[n], %w[n], #1                 \n"
            "st1   {v4.4s}, [%[out]], #16           \n"
            "bne   0b                               \n"
            // Counter and both pointers are modified, hence "+r" for all.
            : [n] "+r"(nn), [out] "+r"(outputptr), [in] "+r"(inputptr)
            : [alpha] "w"(_alpha), [beta] "w"(_beta)
            : "cc", "memory", "v2", "v3", "v4"
        );
    }
 #endif

    // Scalar remainder, indexed from the buffer bases so it stays in bounds.
    for (int i = done; i < total; i++) {
        output[i] = input[i] * alpha + beta;
    }

    printf("neon -- cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));

    delete[] output;
    delete[] input;
 }


 // NEON intrinsics smoke test over 3 * 42 zero-valued floats: computes
 // output = input * alpha + beta four lanes at a time, tracing each stage with
 // v_print and dumping the first eight outputs after every vector step. The
 // elements left over after the last full vector are computed by a scalar loop.
 - (void)test{
    const int size = 3 * 42;
    const float alpha = 1.0f;
    const float beta = 1.0f;

    // Allocation sizes are in bytes, so scale by sizeof(float); the previous
    // malloc(3 * 42) reserved a quarter of what the loops write.
    float *input = (float *)malloc(size * sizeof(float));
    // calloc so the result dump never prints indeterminate values.
    float *output = (float *)calloc(size, sizeof(float));
    if (input == NULL || output == NULL) {
        free(input);
        free(output);
        return;
    }

    for (int i = 0; i < size; i++) {
        input[i] = (float)0;
    }

    int done = 0;
 #if __ARM_NEON
    const float32x4_t _alpha = vdupq_n_f32(alpha);
    const float32x4_t _beta = vdupq_n_f32(beta);

    // size / 4 full vectors; the old 3 * 42 passes covered 504 floats.
    const int nn = size >> 2;
    for (int x = 0; x < nn; x++) {
        float32x4_t v0 = vld1q_f32(input + 4 * x);
        v_print(v0, "input");
        float32x4_t mul = vmulq_f32(v0, _alpha);
        v_print(mul, "mul");
        float32x4_t sum = vaddq_f32(mul, _beta);
        v_print(sum, "sum");
        vst1q_f32(output + 4 * x, sum);

        for (int r = 0; r < 8; r++) {
            printf("result -- %f \n", output[r]);
        }
    }
    done = nn << 2;
 #endif

    for (int i = done; i < size; i++) {
        output[i] = input[i] * alpha + beta;
    }

    // Base pointers are never advanced, so they can be released directly.
    free(output);
    free(input);
 }


 // NEON intrinsics walk-through. Fills 16 planes of 3 * 42 floats (plane c
 // holds the value c), then processes the three 42-wide rows of the first
 // plane as output = input * alpha + beta: ten 4-lane vectors per row plus a
 // two-element scalar tail. Each stage is traced with v_print, the first eight
 // outputs of the row are printed after every vector step, and the full row
 // is printed once it is complete.
 - (void)namolData{
    const int C = 16;
    const int rows = 3;
    const int width = 42;
    const int size = rows * width;
    const float alpha = 1.0f;
    const float beta = 1.0f;

    // Allocation sizes are in bytes; the previous malloc(16 * 3 * 42) was
    // overrun by the fill loop, which writes 16 * 126 floats.
    float *input = (float *)malloc(C * size * sizeof(float));
    float *output = (float *)calloc(C * size, sizeof(float));
    if (input == NULL || output == NULL) {
        free(input);
        free(output);
        return;
    }

    for (int c = 0; c < C; c++) {
        for (int i = 0; i < size; i++) {
            input[size * c + i] = (float)c;
        }
    }

 #if __ARM_NEON
    const float32x4_t _alpha = vdupq_n_f32(alpha);
    const float32x4_t _beta = vdupq_n_f32(beta);
    // Vector count per row (42 / 4 = 10), not per plane.
    const int nn = width >> 2;
 #else
    const int nn = 0;
 #endif

    for (int y = 0; y < rows; ++y) {
        // Row bases; indices below are relative to them, so the vector part
        // and the scalar tail address the same row.
        const float *ptr = input + y * width;
        float *outprt = output + y * width;

 #if __ARM_NEON
        for (int x = 0; x < nn; ++x) {
            float32x4_t v0 = vld1q_f32(ptr + 4 * x);
            v_print(v0, "input");
            float32x4_t mul = vmulq_f32(v0, _alpha);
            v_print(mul, "mul");
            float32x4_t sum = vaddq_f32(mul, _beta);
            v_print(sum, "sum");
            vst1q_f32(outprt + 4 * x, sum);

            for (int r = 0; r < 8; r++) {
                printf("result -- %f \n", outprt[r]);
            }
        }
 #endif

        for (int i = nn << 2; i < width; ++i) {
            outprt[i] = ptr[i] * alpha + beta;
        }

        for (int r = 0; r < width; r++) {
            printf("result -- %f \n", outprt[r]);
        }

        printf("width end one loop");
    }

    free(output);
    free(input);
 }

 // Inline-assembly version of the intrinsics walk-through. Fills 16 planes of
 // 3 * 42 floats (plane c holds the value c), computes
 // output = input * alpha + beta over the first plane, and prints its first
 // 42 results. On AArch64 the bulk runs four floats at a time in an asm loop;
 // the remainder (everything on other targets) runs in a scalar loop.
 - (void)neonData{
    const int C = 16;
    const int size = 3 * 42;
    const float alpha = 1.0f;
    const float beta = 1.0f;

    // Allocation sizes are in bytes; the previous malloc(16 * 3 * 42) was
    // overrun by the fill loop, which writes 16 * 126 floats.
    float *input = (float *)malloc(C * size * sizeof(float));
    float *output = (float *)calloc(C * size, sizeof(float));
    if (input == NULL || output == NULL) {
        free(input);
        free(output);
        return;
    }

    for (int c = 0; c < C; c++) {
        for (int i = 0; i < size; i++) {
            input[size * c + i] = (float)c;
        }
    }

    int done = 0;
 #if __ARM_NEON && __aarch64__
    int nn = size >> 2;
    const float *ptr = input;
    float *outprt = output;
    float32x4_t _alpha = vdupq_n_f32(alpha);
    float32x4_t _beta = vdupq_n_f32(beta);
    done = nn << 2;   // recorded now because the loop counts nn down to 0

    if (nn > 0) {
        asm volatile(
            "0:                                     \n"
            "prfm  pldl1keep, [%[in], #64]          \n"
            "ld1   {v2.4s}, [%[in]], #16            \n"
            "fmul  v2.4s, v2.4s, %[alpha].4s        \n"
            "fadd  v3.4s, v2.4s, %[beta].4s         \n"
            "subs  %w[n], %w[n], #1                 \n"
            "st1   {v3.4s}, [%[out]], #16           \n"
            "bne   0b                               \n"
            // Load and store now use separate read-write pointers; before,
            // both addressed one register seeded with `output`, so `input`
            // was never read.
            : [n] "+r"(nn), [in] "+r"(ptr), [out] "+r"(outprt)
            : [alpha] "w"(_alpha), [beta] "w"(_beta)
            : "cc", "memory", "v2", "v3"
        );
    }
 #endif

    for (int i = done; i < size; i++) {
        output[i] = input[i] * alpha + beta;
    }

    for (int i = 0; i < 42; i++) {
        printf("OUTPUT -- %f", output[i]);
    }

    free(output);
    free(input);
 }

 // Debug helper: prints the four float lanes of `vec` on one stdout line,
 // prefixed with the label `x`.
 void v_print(float32x4_t vec, std::string x)
 {
    // Spill the register to memory so the lanes can go through printf.
    float32_t lanes[4];
    vst1q_f32(lanes, vec);

    const char *label = x.c_str();
    printf("%s - v0 -- %f -- %f -- %f -- %f \n", label, lanes[0], lanes[1], lanes[2], lanes[3]);
 }

 @end
	#import "AppDelegate.h"
	#include <stdio.h>
	@interface AppDelegate ()
	@end

	@implementation AppDelegate
	- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
	    // Both parameters are object pointers; the `*` declarators had been
	    // lost from this copy of the method.
	    // Launch hook used as a scratch runner: one experiment is invoked per launch.
	    // Experiments currently switched off in this method:
	    //   namolData, test, test2, neonTest, test_cpp, neonPractice, test_2
	    [self neonPractice2];

	    return YES;
	}


	// Builds a small float matrix from a ramp buffer and logs every element of
	// outMat, channel by channel, through LOGV.
	// NOTE(review): both layout converters (mat_chwtoc8hw8_int32 /
	// mat_chwtoc4hw4_int32) are disabled here, so outMat is only
	// default-constructed when the dump loop runs -- confirm that is intended.
	- (void)test_2{

	    int channel = 8;
	    const int width = 8;
	    const int height = 8;

	    // Source buffer: width * height floats holding 0, 1, 2, ...
	    const int value = width * height;
	    float *input_value_f = new float[value];
	    for (int idx = 0; idx < value; ++idx) {
	        input_value_f[idx] = (float)idx;
	    }

	    // NOTE(review): the last argument is 4 while the buffer holds only
	    // width * height elements and `channel` above is never used -- verify
	    // against from_float_array's contract. TODO confirm who owns input_value_f.
	    VenusCPU::Mat inMat = VenusCPU::Mat::from_float_array(input_value_f, width, height, 4);
	    VenusCPU::Mat outMat;

	    for (int c = 0; c < outMat.shape.c; ++c) {
	        float *plane = outMat.channel(c);
	        for (int h = 0; h < outMat.shape.h; ++h) {
	            for (int w = 0; w < outMat.shape.w; ++w) {
	                // Element (h, w) of the current channel, row-major. The
	                // dereference and the multiplication by the row width had
	                // been lost from this expression.
	                float v0 = *(plane + h * outMat.shape.w + w);
	                LOGV("c %d h %d w %d -- value -- %f", c, h, w, v0);
	            }
	        }
	    }
	}

	// Repacks int32 data from planar channels into groups of four: for each group
	// of four consecutive input channels, the four values found at the same
	// spatial index are written next to each other in one output channel.
	//
	// inMat  - source; shape.c, shape.h, shape.w and shape.step_size are read.
	// outMat - destination; create() is called on it with
	//          TensorShape(c / 4, h, w * 4, C4HW4, sizeof(int), step_size).
	//
	// Only complete groups of four channels are converted; the c % 4 trailing
	// channels are not copied. Returns without writing when create() leaves
	// outMat empty.
	// NOTE(review): the indexing assumes every channel holds h * w contiguous
	// ints -- confirm against VenusCPU::Mat's channel stride.
	void mat_chwtoc4hw4_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
	    const int C = inMat.shape.c;
	    const int H = inMat.shape.h;
	    const int W = inMat.shape.w;

	    outMat.create(VenusCPU::TensorShape(C / 4, H, W * 4, VenusCPU::TensorShape::Layout::C4HW4, sizeof(int), inMat.shape.step_size));
	    if (outMat.empty()) {
	        assert(false && "creat mat failed!");
	        // assert() expands to nothing under NDEBUG, so leave explicitly
	        // instead of writing through an empty matrix.
	        return;
	    }

	    const int plane = H * W;
	    for (int z = 0; z < C / 4; z++) {
	        int * input0 = inMat.channel(z * 4 + 0);
	        int * input1 = inMat.channel(z * 4 + 1);
	        int * input2 = inMat.channel(z * 4 + 2);
	        int * input3 = inMat.channel(z * 4 + 3);

	        int * output0 = outMat.channel(z);

	        for (int i = 0; i < plane; i++) {
	            int * packed = output0 + i * 4;
	            packed[0] = input0[i];
	            packed[1] = input1[i];
	            packed[2] = input2[i];
	            packed[3] = input3[i];
	        }
	    }
	}

	// Repacks int32 data from planar channels into groups of eight: for each
	// group of eight consecutive input channels, the eight values found at the
	// same spatial index are written next to each other in one output channel.
	//
	// inMat  - source; shape.c, shape.h, shape.w and shape.step_size are read.
	// outMat - destination; create() is called on it with
	//          TensorShape(c / 8, h, w * 8, C8HW8, sizeof(int), step_size).
	//
	// Only complete groups of eight channels are converted; the c % 8 trailing
	// channels are not copied. Returns without writing when create() leaves
	// outMat empty.
	// NOTE(review): the indexing assumes every channel holds h * w contiguous
	// ints -- confirm against VenusCPU::Mat's channel stride.
	void mat_chwtoc8hw8_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
	    const int C = inMat.shape.c;
	    const int H = inMat.shape.h;
	    const int W = inMat.shape.w;

	    outMat.create(VenusCPU::TensorShape(C / 8, H, W * 8, VenusCPU::TensorShape::Layout::C8HW8, sizeof(int), inMat.shape.step_size));
	    if (outMat.empty()) {
	        assert(false && "creat mat failed!");
	        // assert() expands to nothing under NDEBUG, so leave explicitly
	        // instead of writing through an empty matrix.
	        return;
	    }

	    const int plane = H * W;
	    for (int z = 0; z < C / 8; z++) {
	        int * input0 = inMat.channel(z * 8 + 0);
	        int * input1 = inMat.channel(z * 8 + 1);
	        int * input2 = inMat.channel(z * 8 + 2);
	        int * input3 = inMat.channel(z * 8 + 3);
	        int * input4 = inMat.channel(z * 8 + 4);
	        int * input5 = inMat.channel(z * 8 + 5);
	        int * input6 = inMat.channel(z * 8 + 6);
	        int * input7 = inMat.channel(z * 8 + 7);

	        int * output0 = outMat.channel(z);

	        for (int i = 0; i < plane; i++) {
	            int * packed = output0 + i * 8;
	            packed[0] = input0[i];
	            packed[1] = input1[i];
	            packed[2] = input2[i];
	            packed[3] = input3[i];
	            packed[4] = input4[i];
	            packed[5] = input5[i];
	            packed[6] = input6[i];
	            packed[7] = input7[i];
	        }
	    }
	}


	// Scalar baseline: fills a 100 Mi-element float buffer with 100, computes
	// output[i] = input[i] * 2 + 3 in a plain loop, and prints the elapsed time
	// (allocation included) in milliseconds.
	- (void)test_cpp{
	    double tic = CACurrentMediaTime();
	    const int m_total = 1024 * 1024 * 100;

	    float *input = new float[m_total];
	    float *output = new float[m_total];

	    for (int i = 0; i < m_total; i++) {
	        input[i] = (float)100;
	    }

	    for (int i = 0; i < m_total; i++) {
	        output[i] = input[i] * 2 + 3;
	    }

	    printf("cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));

	    // Released after the timestamp so the reported figure covers the same
	    // work as before; previously both 400 MB buffers were leaked.
	    delete[] output;
	    delete[] input;
	}

	// NEON practice: scales 100 floats (0..99) by alpha, converts them to int16
	// and prints every result on its own line.
	//
	// With NEON, each iteration loads four float32x4 vectors (16 floats), narrows
	// them to four int16x4 vectors and stores them with vst4_s16, which
	// interleaves the four vectors element by element. Elements that do not fill
	// a whole 16-element block (and all elements when NEON is unavailable) are
	// converted by a scalar loop and stored in order.
	//
	// An earlier variant with a second int16x4x4_t (eight vectors per iteration)
	// and vtrn1_s16 was left disabled by the author and is not reproduced here.
	- (void)neonPractice2{
	    const int value = 10 * 10;
	    const float alpha = 1.0f;

	    float *input_value_f = new float[value];
	    // Declared as a pointer (the `*` had been lost from this copy) and
	    // value-initialised so every printed slot holds a defined value.
	    int16_t *output_value_f = new int16_t[value]();

	    for (int i = 0; i < value; i++) {
	        input_value_f[i] = (float)i;
	    }

	    int converted = 0;
	#if __ARM_NEON
	    const float *inputptr = input_value_f;
	    int16_t *outputptr = output_value_f;
	    const float32x4_t _alpha = vdupq_n_f32(alpha);

	    // One iteration consumes 16 floats and emits 16 int16 values, so the trip
	    // count is value / 16. The previous value / 4 ran past both buffers.
	    const int nn = value >> 4;
	    for (int j = 0; j < nn; ++j) {
	        int16x4x4_t output_i16quant;
	        for (int k = 0; k < 4; ++k) {
	            float32x4_t scaled = vmulq_f32(vld1q_f32(inputptr), _alpha);
	            output_i16quant.val[k] = vmovn_s32(vcvtq_s32_f32(scaled));
	            inputptr += 4;
	        }

	        vst4_s16(outputptr, output_i16quant);
	        outputptr += 16;
	    }
	    converted = nn << 4;
	#endif

	    // Scalar tail for the elements the vector loop did not cover.
	    for (int i = converted; i < value; i++) {
	        output_value_f[i] = (int16_t)(int)(input_value_f[i] * alpha);
	    }

	    for (int i = 0; i < value; i++) {
	        // %hd: the values are signed 16-bit integers.
	        printf("%hd\n", output_value_f[i]);
	    }

	    delete[] output_value_f;
	    delete[] input_value_f;
	}


	// AArch64 inline-assembly practice on a 100-int buffer holding 100..199.
	//
	// Each loop pass loads 32 bytes (eight ints) into v0/v1, runs an smull whose
	// result is immediately overwritten, and stores v0 (the first four ints of
	// the eight) to the output. The first 50 output ints are then printed; slots
	// the loop never wrote read as 0. Without AArch64 NEON only the zeroed output
	// is printed.
	- (void)neonPractice{
	    const int value = 10 * 10;

	    int *input_value_f = new int[value];
	    // Value-initialised so the dump below never reads indeterminate memory.
	    int *output_value_f = new int[value]();

	    for (int i = 0; i < value; i++) {
	        input_value_f[i] = i + 100;
	    }

	#if __ARM_NEON && __aarch64__
	    // Eight ints are consumed per pass, so the trip count is value / 8; the
	    // previous value / 4 read 400 bytes past the end of the input buffer.
	    int nn = value >> 3;
	    int alpha = 1;
	    const int *inputptr = input_value_f;
	    int *outputptr = output_value_f;
	    int16x4_t _alpha = vdup_n_s16(alpha);

	    if (nn > 0) {
	        asm volatile(
	#ifdef USE_PRFM
	            "prfm    pldl1keep, [%[in], #128]       \n"
	#endif
	            "0:                                     \n"
	            "ld1     {v0.8h, v1.8h}, [%[in]], #32   \n"
	            // Kept from the original experiment; v2 is overwritten by the mov.
	            "smull   v2.4s, %[alpha].4h, v0.h[4]    \n"
	            "mov     v2.16b, v0.16b                 \n"
	            "subs    %w[n], %w[n], #1               \n"
	            "st1     {v2.4s}, [%[out]], #16         \n"
	            "bne     0b                             \n"
	            // All three registers are updated by the loop, so they must be
	            // read-write operands (the input pointer used to be input-only).
	            : [n] "+r"(nn), [out] "+r"(outputptr), [in] "+r"(inputptr)
	            : [alpha] "w"(_alpha)
	            : "cc", "memory", "v0", "v1", "v2"
	        );
	    }
	#endif

	    // Index from the buffer base: outputptr has been advanced by the loop.
	    for (int i = 0; i < value / 2; i++) {
	        printf("neon --output %d", output_value_f[i]);
	    }

	    delete[] output_value_f;
	    delete[] input_value_f;
	}


	// Timed NEON counterpart of the scalar benchmark: fills a 100 Mi-element
	// float buffer with 100, computes output[i] = input[i] * alpha + beta, and
	// prints the elapsed time in milliseconds.
	//
	// On AArch64 the bulk is processed four floats at a time by an inline-asm
	// loop; whatever the asm loop did not cover (everything on other targets) is
	// handled by a scalar loop.
	- (void)neonTest{
	    double tic = CACurrentMediaTime();
	    const int total = 1024 * 1024 * 100;
	    const float alpha = 1.0f;
	    const float beta = 2.0f;

	    float *input = new float[total];
	    float *output = new float[total];
	    for (int i = 0; i < total; i++) {
	        input[i] = (float)100;
	    }

	    int done = 0;
	#if __ARM_NEON && __aarch64__
	    const float *inputptr = input;
	    // Results go to `output`; previously they were written back into `input`.
	    float *outputptr = output;
	    float32x4_t _alpha = vdupq_n_f32(alpha);
	    float32x4_t _beta = vdupq_n_f32(beta);
	    int nn = total >> 2;
	    done = nn << 2;   // recorded now because the loop counts nn down to 0

	    if (nn > 0) {
	        asm volatile(
	            "0:                                     \n"
	            "prfm  pldl1keep, [%[in], #64]          \n"
	            "ld1   {v2.4s}, [%[in]], #16            \n"
	            "fmul  v3.4s, v2.4s, %[alpha].4s        \n"
	            "fadd  v4.4s, v3.4s, %[beta].4s         \n"
	            "subs  %w[n], %w[n], #1                 \n"
	            "st1   {v4.4s}, [%[out]], #16           \n"
	            "bne   0b                               \n"
	            // Counter and both pointers are modified, hence "+r" for all.
	            : [n] "+r"(nn), [out] "+r"(outputptr), [in] "+r"(inputptr)
	            : [alpha] "w"(_alpha), [beta] "w"(_beta)
	            : "cc", "memory", "v2", "v3", "v4"
	        );
	    }
	#endif

	    // Scalar remainder, indexed from the buffer bases so it stays in bounds.
	    for (int i = done; i < total; i++) {
	        output[i] = input[i] * alpha + beta;
	    }

	    printf("neon -- cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));

	    delete[] output;
	    delete[] input;
	}


	// NEON intrinsics smoke test over 3 * 42 zero-valued floats: computes
	// output = input * alpha + beta four lanes at a time, tracing each stage with
	// v_print and dumping the first eight outputs after every vector step. The
	// elements left over after the last full vector are computed by a scalar loop.
	- (void)test{
	    const int size = 3 * 42;
	    const float alpha = 1.0f;
	    const float beta = 1.0f;

	    // Both buffers are float pointers (the `*` had been lost from this
	    // copy). Allocation sizes are in bytes, so scale by sizeof(float); the
	    // previous malloc(3 * 42) reserved a quarter of what the loops write.
	    float *input = (float *)malloc(size * sizeof(float));
	    // calloc so the result dump never prints indeterminate values.
	    float *output = (float *)calloc(size, sizeof(float));
	    if (input == NULL || output == NULL) {
	        free(input);
	        free(output);
	        return;
	    }

	    for (int i = 0; i < size; i++) {
	        input[i] = (float)0;
	    }

	    int done = 0;
	#if __ARM_NEON
	    const float32x4_t _alpha = vdupq_n_f32(alpha);
	    const float32x4_t _beta = vdupq_n_f32(beta);

	    // size / 4 full vectors; the old 3 * 42 passes covered 504 floats.
	    const int nn = size >> 2;
	    for (int x = 0; x < nn; x++) {
	        float32x4_t v0 = vld1q_f32(input + 4 * x);
	        v_print(v0, "input");
	        float32x4_t mul = vmulq_f32(v0, _alpha);
	        v_print(mul, "mul");
	        float32x4_t sum = vaddq_f32(mul, _beta);
	        v_print(sum, "sum");
	        vst1q_f32(output + 4 * x, sum);

	        for (int r = 0; r < 8; r++) {
	            printf("result -- %f \n", output[r]);
	        }
	    }
	    done = nn << 2;
	#endif

	    for (int i = done; i < size; i++) {
	        output[i] = input[i] * alpha + beta;
	    }

	    // Base pointers are never advanced, so they can be released directly.
	    free(output);
	    free(input);
	}


	// NEON intrinsics walk-through. Fills 16 planes of 3 * 42 floats (plane c
	// holds the value c), then processes the three 42-wide rows of the first
	// plane as output = input * alpha + beta: ten 4-lane vectors per row plus a
	// two-element scalar tail. Each stage is traced with v_print, the first eight
	// outputs of the row are printed after every vector step, and the full row
	// is printed once it is complete.
	- (void)namolData{
	    const int C = 16;
	    const int rows = 3;
	    const int width = 42;
	    const int size = rows * width;
	    const float alpha = 1.0f;
	    const float beta = 1.0f;

	    // Both buffers are float pointers (the `*` had been lost from this
	    // copy). Allocation sizes are in bytes; the previous
	    // malloc(16 * 3 * 42) was overrun by the fill loop, which writes
	    // 16 * 126 floats.
	    float *input = (float *)malloc(C * size * sizeof(float));
	    float *output = (float *)calloc(C * size, sizeof(float));
	    if (input == NULL || output == NULL) {
	        free(input);
	        free(output);
	        return;
	    }

	    for (int c = 0; c < C; c++) {
	        for (int i = 0; i < size; i++) {
	            input[size * c + i] = (float)c;
	        }
	    }

	#if __ARM_NEON
	    const float32x4_t _alpha = vdupq_n_f32(alpha);
	    const float32x4_t _beta = vdupq_n_f32(beta);
	    // Vector count per row (42 / 4 = 10), not per plane.
	    const int nn = width >> 2;
	#else
	    const int nn = 0;
	#endif

	    for (int y = 0; y < rows; ++y) {
	        // Row bases; indices below are relative to them, so the vector part
	        // and the scalar tail address the same row.
	        const float *ptr = input + y * width;
	        float *outprt = output + y * width;

	#if __ARM_NEON
	        for (int x = 0; x < nn; ++x) {
	            float32x4_t v0 = vld1q_f32(ptr + 4 * x);
	            v_print(v0, "input");
	            float32x4_t mul = vmulq_f32(v0, _alpha);
	            v_print(mul, "mul");
	            float32x4_t sum = vaddq_f32(mul, _beta);
	            v_print(sum, "sum");
	            vst1q_f32(outprt + 4 * x, sum);

	            for (int r = 0; r < 8; r++) {
	                printf("result -- %f \n", outprt[r]);
	            }
	        }
	#endif

	        for (int i = nn << 2; i < width; ++i) {
	            outprt[i] = ptr[i] * alpha + beta;
	        }

	        for (int r = 0; r < width; r++) {
	            printf("result -- %f \n", outprt[r]);
	        }

	        printf("width end one loop");
	    }

	    free(output);
	    free(input);
	}

	// Inline-assembly version of the intrinsics walk-through. Fills 16 planes of
	// 3 * 42 floats (plane c holds the value c), computes
	// output = input * alpha + beta over the first plane, and prints its first
	// 42 results. On AArch64 the bulk runs four floats at a time in an asm loop;
	// the remainder (everything on other targets) runs in a scalar loop.
	- (void)neonData{
	    const int C = 16;
	    const int size = 3 * 42;
	    const float alpha = 1.0f;
	    const float beta = 1.0f;

	    // Both buffers are float pointers (the `*` had been lost from this
	    // copy). Allocation sizes are in bytes; the previous
	    // malloc(16 * 3 * 42) was overrun by the fill loop, which writes
	    // 16 * 126 floats.
	    float *input = (float *)malloc(C * size * sizeof(float));
	    float *output = (float *)calloc(C * size, sizeof(float));
	    if (input == NULL || output == NULL) {
	        free(input);
	        free(output);
	        return;
	    }

	    for (int c = 0; c < C; c++) {
	        for (int i = 0; i < size; i++) {
	            input[size * c + i] = (float)c;
	        }
	    }

	    int done = 0;
	#if __ARM_NEON && __aarch64__
	    int nn = size >> 2;
	    const float *ptr = input;
	    float *outprt = output;
	    float32x4_t _alpha = vdupq_n_f32(alpha);
	    float32x4_t _beta = vdupq_n_f32(beta);
	    done = nn << 2;   // recorded now because the loop counts nn down to 0

	    if (nn > 0) {
	        asm volatile(
	            "0:                                     \n"
	            "prfm  pldl1keep, [%[in], #64]          \n"
	            "ld1   {v2.4s}, [%[in]], #16            \n"
	            "fmul  v2.4s, v2.4s, %[alpha].4s        \n"
	            "fadd  v3.4s, v2.4s, %[beta].4s         \n"
	            "subs  %w[n], %w[n], #1                 \n"
	            "st1   {v3.4s}, [%[out]], #16           \n"
	            "bne   0b                               \n"
	            // Load and store now use separate read-write pointers; before,
	            // both addressed one register seeded with `output`, so `input`
	            // was never read.
	            : [n] "+r"(nn), [in] "+r"(ptr), [out] "+r"(outprt)
	            : [alpha] "w"(_alpha), [beta] "w"(_beta)
	            : "cc", "memory", "v2", "v3"
	        );
	    }
	#endif

	    for (int i = done; i < size; i++) {
	        output[i] = input[i] * alpha + beta;
	    }

	    for (int i = 0; i < 42; i++) {
	        printf("OUTPUT -- %f", output[i]);
	    }

	    free(output);
	    free(input);
	}

	// Debug helper: prints the four float lanes of `vec` on one stdout line,
	// prefixed with the label `x`.
	void v_print(float32x4_t vec, std::string x)
	{
	    // Spill the register to memory so the lanes can go through printf.
	    float32_t lanes[4];
	    vst1q_f32(lanes, vec);

	    const char *label = x.c_str();
	    printf("%s - v0 -- %f -- %f -- %f -- %f \n", label, lanes[0], lanes[1], lanes[2], lanes[3]);
	}

	@end