Created
August 16, 2019 09:37
-
-
Save AllanChen/38be9e28dc8e305853a6c6ca4755a9ef to your computer and use it in GitHub Desktop.
Neon Test demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#import "AppDelegate.h" | |
#include <stdio.h> | |
/// Private class extension of AppDelegate. It currently declares no additional
/// properties, methods or protocol conformances.
@interface AppDelegate ()

@end
@implementation AppDelegate | |
/// Application launch hook. This scratch app uses it to run exactly one of the
/// NEON experiments implemented in this class; change the message send below
/// to run a different one.
///
/// @param application   The launching application (not used here).
/// @param launchOptions The launch options (not used here).
/// @return Always YES.
- (BOOL)application:(UIApplication *)application
    didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
    // Experiments that were previously wired in here and are currently disabled:
    //   namolData, test, test2, neonTest, test_cpp, neonPractice, test_2
    [self neonPractice2];

    return YES;
}
/// Builds an 8x8 float ramp (0, 1, ..., 63), wraps it in a VenusCPU::Mat and
/// logs every element of `outMat`.
///
/// NOTE(review): both layout conversions (mat_chwtoc8hw8_int32 /
/// mat_chwtoc4hw4_int32) are commented out, so nothing in this method fills
/// `outMat`; presumably a default-constructed Mat has an empty shape and the
/// logging loop prints nothing -- confirm against VenusCPU::Mat.
- (void)test_2 {
    const int width = 8;
    const int height = 8;
    const int count = width * height;

    float *ramp = new float[count];
    for (int idx = 0; idx < count; ++idx) {
        ramp[idx] = (float)idx;
    }

    // NOTE(review): only width * height floats are allocated while the last
    // argument is 4. If that argument is a channel count, from_float_array
    // would need 4x as much data -- verify the signature. Ownership of `ramp`
    // after this call is also not visible from here, so it is not freed.
    VenusCPU::Mat inMat = VenusCPU::Mat::from_float_array(ramp, width, height, 4);
    VenusCPU::Mat outMat;

    // Disabled experiments:
    //   mat_chwtoc8hw8_int32(inMat, outMat);
    //   mat_chwtoc4hw4_int32(inMat, outMat);

    for (int c = 0; c < outMat.shape.c; ++c) {
        float *plane = outMat.channel(c);
        for (int h = 0; h < outMat.shape.h; ++h) {
            const float *row = plane + h * outMat.shape.w;
            for (int w = 0; w < outMat.shape.w; ++w) {
                float v0 = row[w];
                LOGV("c %d h %d w %d -- value -- %f", c, h, w, v0);
            }
        }
    }
}
/// Repacks an int32 tensor from planar CHW into C4HW4: every group of four
/// consecutive input channels becomes one output channel whose elements are
/// interleaved as [c0, c1, c2, c3] per spatial position.
///
/// Channels beyond the last full group of four (c % 4) are not copied.
///
/// @param inMat  Source tensor; its planes are read as int.
/// @param outMat Destination; (re)created here with shape (c / 4, h, w * 4).
///               If creation fails the function asserts in debug builds and
///               returns without writing in release builds.
void mat_chwtoc4hw4_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
    const int kPack = 4;
    const int channels = inMat.shape.c;
    const int height = inMat.shape.h;
    const int width = inMat.shape.w;
    const int groups = channels / kPack;
    const int planeSize = height * width;

    outMat.create(VenusCPU::TensorShape(groups, height, width * kPack,
                                        VenusCPU::TensorShape::Layout::C4HW4,
                                        sizeof(int), inMat.shape.step_size));
    if (outMat.empty()) {
        assert(false && "creat mat failed!");
        // assert() disappears under NDEBUG; never fall through and write into
        // a Mat that has no storage.
        return;
    }

    for (int g = 0; g < groups; ++g) {
        int *src[kPack];
        for (int lane = 0; lane < kPack; ++lane) {
            int *plane = inMat.channel(g * kPack + lane);
            src[lane] = plane;
        }

        int *dst = outMat.channel(g);
        for (int i = 0; i < planeSize; ++i) {
            for (int lane = 0; lane < kPack; ++lane) {
                dst[i * kPack + lane] = src[lane][i];
            }
        }
    }
}
/// Repacks an int32 tensor from planar CHW into C8HW8: every group of eight
/// consecutive input channels becomes one output channel whose elements are
/// interleaved as [c0, c1, ..., c7] per spatial position.
///
/// Channels beyond the last full group of eight (c % 8) are not copied.
///
/// @param inMat  Source tensor; its planes are read as int.
/// @param outMat Destination; (re)created here with shape (c / 8, h, w * 8).
///               If creation fails the function asserts in debug builds and
///               returns without writing in release builds.
void mat_chwtoc8hw8_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
    const int kPack = 8;
    const int channels = inMat.shape.c;
    const int height = inMat.shape.h;
    const int width = inMat.shape.w;
    const int groups = channels / kPack;
    const int planeSize = height * width;

    outMat.create(VenusCPU::TensorShape(groups, height, width * kPack,
                                        VenusCPU::TensorShape::Layout::C8HW8,
                                        sizeof(int), inMat.shape.step_size));
    if (outMat.empty()) {
        assert(false && "creat mat failed!");
        // assert() disappears under NDEBUG; never fall through and write into
        // a Mat that has no storage.
        return;
    }

    for (int g = 0; g < groups; ++g) {
        int *src[kPack];
        for (int lane = 0; lane < kPack; ++lane) {
            int *plane = inMat.channel(g * kPack + lane);
            src[lane] = plane;
        }

        int *dst = outMat.channel(g);
        for (int i = 0; i < planeSize; ++i) {
            for (int lane = 0; lane < kPack; ++lane) {
                dst[i * kPack + lane] = src[lane][i];
            }
        }
    }
}
/// Scalar baseline for the NEON benchmark: fills a 100M-element float buffer
/// with 100, computes `output[i] = input[i] * 2 + 3`, and prints the elapsed
/// time in milliseconds (allocation and fill are included in the measurement).
- (void)test_cpp {
    double tic = CACurrentMediaTime();

    const int m_total = 1024 * 1024 * 100;
    float *input = new float[m_total];
    float *output = new float[m_total];

    for (int i = 0; i < m_total; i++) {
        input[i] = (float)100;
    }
    for (int i = 0; i < m_total; i++) {
        output[i] = input[i] * 2 + 3;
    }

    printf("cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));

    // Release the two ~400 MB buffers; done after the timing print so the
    // measured interval is unchanged.
    delete[] output;
    delete[] input;
}
/// NEON practice: scales 100 floats (0..99) by `alpha`, converts them to int16
/// (truncating toward zero, then narrowing) and stores them with vst4_s16,
/// then prints the 100 results one per line.
///
/// Data is processed in blocks of 16 elements (four 4-lane vectors). Because
/// vst4_s16 interleaves its four source vectors, each block is written as the
/// transpose of a 4x4 tile: out[4 * lane + v] = in[4 * v + lane].
/// Elements that do not fill a whole block (100 % 16 = 4) are converted by a
/// scalar loop and stored in their original order.
- (void)neonPractice2 {
    const int value = 10 * 10;
    const int kLanes = 4;                     // int16 lanes per vector
    const int kVectors = 4;                   // vectors consumed by one vst4_s16
    const int kBlock = kLanes * kVectors;     // elements per loop iteration
    const int blocks = value / kBlock;
    const float alpha = 1.0f;

    float *input_value_f = new float[value];
    // Value-initialised so every printed element is defined on every path.
    int16_t *output_value_f = new int16_t[value]();

    for (int i = 0; i < value; i++) {
        input_value_f[i] = (float)i;
    }

#if __ARM_NEON
    const float *inputptr = input_value_f;
    int16_t *outputptr = output_value_f;
    const float32x4_t _alpha = vdupq_n_f32(alpha);

    // One iteration consumes kBlock floats and produces kBlock int16 values,
    // so the trip count is value / 16 (not value / 4, which overran both
    // buffers).
    for (int j = 0; j < blocks; ++j) {
        int16x4x4_t output_i16quant;
        for (int v = 0; v < kVectors; ++v) {
            float32x4_t scaled = vmulq_f32(vld1q_f32(inputptr), _alpha);
            output_i16quant.val[v] = vmovn_s32(vcvtq_s32_f32(scaled));
            inputptr += kLanes;
        }
        vst4_s16(outputptr, output_i16quant);
        outputptr += kBlock;
    }
#else
    // Portable fallback producing the same interleaved layout as vst4_s16.
    for (int j = 0; j < blocks; ++j) {
        const float *src = input_value_f + j * kBlock;
        int16_t *dst = output_value_f + j * kBlock;
        for (int v = 0; v < kVectors; ++v) {
            for (int lane = 0; lane < kLanes; ++lane) {
                dst[kVectors * lane + v] = (int16_t)(int32_t)(src[kLanes * v + lane] * alpha);
            }
        }
    }
#endif

    // Scalar tail for the elements that do not fill a complete block.
    for (int i = blocks * kBlock; i < value; ++i) {
        output_value_f[i] = (int16_t)(int32_t)(input_value_f[i] * alpha);
    }

    for (int i = 0; i < value; i++) {
        // int16_t is signed, so use %hd rather than %hu.
        printf("%hd\n", output_value_f[i]);
    }

    delete[] output_value_f;
    delete[] input_value_f;
}
/// AArch64 inline-assembly practice: moves 100 ints (100..199) from an input
/// buffer to an output buffer four elements per iteration through NEON
/// registers, then prints the first 50 output values.
///
/// On builds without AArch64 NEON no data is moved and the (zero-initialised)
/// output buffer is printed as-is.
- (void)neonPractice {
    const int value = 10 * 10;
    int *input_value_f = new int[value];
    // Value-initialised so the print loop never reads indeterminate memory.
    int *output_value_f = new int[value]();

    for (int i = 0; i < value; i++) {
        input_value_f[i] = i + 100;
    }

#if __ARM_NEON
#if __aarch64__
    // Four ints (16 bytes) are loaded and stored per iteration. `value` is a
    // multiple of 4 here, so there is no remainder to handle.
    int nn = value >> 2;
    const int alpha = 1;
    const int *inputptr = input_value_f;
    int *outputptr = output_value_f;
    int16x4_t _alpha = vdup_n_s16(alpha);

    if (nn > 0) {
        asm volatile(
#ifdef USE_PRFM
            "prfm   pldl1keep, [%2, #128]       \n"
#endif
            "0:                                 \n"
            // Load exactly as many bytes as are stored below; loading 32
            // bytes per iteration walked off the end of the input buffer.
            "ld1    {v0.4s}, [%2], #16          \n"
            // NOTE(review): the smull result is overwritten by the mov that
            // follows, so it has no effect on the output; kept as part of the
            // experiment.
            "smull  v2.4s, %3.4h, v0.h[4]       \n"
            "mov    v2.16b, v0.16b              \n"
            "subs   %w0, %w0, #1                \n"
            "st1    {v2.4s}, [%1], #16          \n"
            "bne    0b                          \n"
            // All three are read AND modified by the post-increment / subs,
            // so they must be read-write operands.
            : "+r"(nn),         // %0
              "+r"(outputptr),  // %1
              "+r"(inputptr)    // %2
            : "w"(_alpha)       // %3
            : "cc", "memory", "v0", "v2");
    }
#endif // __aarch64__
#endif // __ARM_NEON

    // Print from the buffer base: `outputptr` has been advanced past the data
    // by the assembly loop and does not exist on non-NEON builds.
    for (int i = 0; i < value / 2; i++) {
        printf("neon --output %d", output_value_f[i]);
    }

    delete[] output_value_f;
    delete[] input_value_f;
}
/// NEON benchmark counterpart of test_cpp: fills a 100M-element float buffer
/// with 100, computes `output[i] = input[i] * alpha + beta`, and prints the
/// elapsed time in milliseconds (allocation and fill are included).
///
/// On AArch64 the bulk is processed four floats at a time in inline assembly;
/// whatever is left over (everything, on other targets) is handled by a
/// scalar loop.
- (void)neonTest {
    double tic = CACurrentMediaTime();

    const int total = 1024 * 1024 * 100;
    const float alpha = 1.0f;
    const float beta = 2.0f;

    float *input = new float[total];
    float *output = new float[total];
    for (int i = 0; i < total; i++) {
        input[i] = (float)100;
    }

    // Number of leading elements already produced by the vector path.
    int done = 0;

#if __ARM_NEON
#if __aarch64__
    int nn = total >> 2;
    if (nn > 0) {
        const float *inputptr = input;
        // Write into `output`; the results previously overwrote `input`.
        float *outputptr = output;
        float32x4_t _alpha = vdupq_n_f32(alpha);
        float32x4_t _beta = vdupq_n_f32(beta);
        done = nn << 2;

        asm volatile(
            "0:                                 \n"
            "prfm   pldl1keep, [%2, #64]        \n"
            "ld1    {v2.4s}, [%2], #16          \n"
            "fmul   v3.4s, v2.4s, %3.4s         \n"
            "fadd   v4.4s, v3.4s, %4.4s         \n"
            "subs   %w0, %w0, #1                \n"
            "st1    {v4.4s}, [%1], #16          \n"
            "bne    0b                          \n"
            // The counter and both pointers are modified by the loop, so they
            // are declared read-write.
            : "+r"(nn),         // %0
              "+r"(outputptr),  // %1
              "+r"(inputptr)    // %2
            : "w"(_alpha),      // %3
              "w"(_beta)        // %4
            : "cc", "memory", "v2", "v3", "v4");
    }
#endif // __aarch64__
#endif // __ARM_NEON

    // Scalar remainder, indexed from the buffer bases. The previous fixed
    // two-element tail indexed past the end of both buffers.
    for (int i = done; i < total; i++) {
        output[i] = input[i] * alpha + beta;
    }

    printf("neon -- cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));

    delete[] output;
    delete[] input;
}
/// NEON intrinsics practice on a zero-filled 3 x 42 float image: computes
/// `output = input * alpha + beta` row by row, four floats at a time with a
/// scalar tail for the last `42 % 4` elements of each row.
///
/// On NEON builds every vector step prints its input, product and sum via
/// v_print, followed by the first eight output values.
- (void)test {
    const int height = 3;
    const int width = 42;
    const int size = height * width;
    const float alpha = 1.0f;
    const float beta = 1.0f;

    // Element counts must be scaled by sizeof(float); calloc also provides the
    // all-zero input and a defined initial output for the debug prints.
    float *input = (float *)calloc(size, sizeof(float));
    float *output = (float *)calloc(size, sizeof(float));
    if (input == NULL || output == NULL) {
        free(input);
        free(output);
        return;
    }

    for (int y = 0; y < height; y++) {
        const float *src = input + y * width;
        float *dst = output + y * width;
        int x = 0;

#if __ARM_NEON
        const float32x4_t _alpha = vdupq_n_f32(alpha);
        const float32x4_t _beta = vdupq_n_f32(beta);

        // One vector covers four columns, so step by 4 and stop while a full
        // vector still fits inside the row.
        for (; x + 4 <= width; x += 4) {
            float32x4_t v0 = vld1q_f32(src + x);
            v_print(v0, "input");
            float32x4_t mul = vmulq_f32(v0, _alpha);
            v_print(mul, "mul");
            float32x4_t sum = vaddq_f32(mul, _beta);
            v_print(sum, "sum");
            vst1q_f32(dst + x, sum);

            for (int r = 0; r < 8; r++) {
                printf("result -- %f \n", output[r]);
            }
        }
#endif

        // Scalar tail (and the whole row when NEON is unavailable).
        for (; x < width; x++) {
            dst[x] = src[x] * alpha + beta;
        }
    }

    free(output);
    free(input);
}
/// NEON intrinsics practice on a 16-channel, 3 x 42 float tensor in which
/// every element of channel c holds the value c. Computes
/// `output = input * alpha + beta` for the three rows of the first channel,
/// four floats at a time with a scalar tail for the last `42 % 4` columns of
/// each row, printing intermediate values along the way.
- (void)namolData {
    const int channels = 16;
    const int height = 3;
    const int width = 42;
    const int plane = height * width;
    const int total = channels * plane;
    const float alpha = 1.0f;
    const float beta = 1.0f;

    // Sizes are element counts and must be scaled by sizeof(float).
    float *input = (float *)malloc((size_t)total * sizeof(float));
    float *output = (float *)calloc(total, sizeof(float));
    if (input == NULL || output == NULL) {
        free(input);
        free(output);
        return;
    }

    for (int c = 0; c < channels; c++) {
        for (int i = 0; i < plane; i++) {
            input[plane * c + i] = (float)c;
        }
    }

    for (int y = 0; y < height; ++y) {
        // Row base pointers: all indexing below is relative to the row start,
        // so the vector part and the scalar tail address the same row.
        const float *ptr = input + y * width;
        float *outprt = output + y * width;
        int x = 0;

#if __ARM_NEON
        const float32x4_t _alpha = vdupq_n_f32(alpha);
        const float32x4_t _beta = vdupq_n_f32(beta);

        // The vector count is derived from the row width (not the plane
        // size), so the loop stays inside the current row.
        for (; x + 4 <= width; x += 4) {
            float32x4_t v0 = vld1q_f32(ptr + x);
            v_print(v0, "input");
            float32x4_t mul = vmulq_f32(v0, _alpha);
            v_print(mul, "mul");
            float32x4_t sum = vaddq_f32(mul, _beta);
            v_print(sum, "sum");
            vst1q_f32(outprt + x, sum);

            for (int r = 0; r < 8; r++) {
                printf("result -- %f \n", outprt[r]);
            }
        }
#endif

        // Scalar tail (and the whole row when NEON is unavailable).
        for (; x < width; ++x) {
            outprt[x] = ptr[x] * alpha + beta;
        }

        for (int r = 0; r < width; r++) {
            printf("result -- %f \n", outprt[r]);
        }
        printf("width end one loop");
    }

    free(output);
    free(input);
}
/// AArch64 inline-assembly practice on a 16-channel, 3 x 42 float tensor in
/// which every element of channel c holds the value c. Computes
/// `output = input * alpha + beta` over the first plane (3 * 42 elements),
/// four floats per iteration in assembly, with a scalar loop for the remainder
/// (everything, on targets without AArch64 NEON). Prints the first 42 results.
- (void)neonData {
    const int channels = 16;
    const int size = 3 * 42;
    const int total = channels * size;
    const float alpha = 1.0f;
    const float beta = 1.0f;

    // Sizes are element counts and must be scaled by sizeof(float).
    float *input = (float *)malloc((size_t)total * sizeof(float));
    float *output = (float *)calloc(total, sizeof(float));
    if (input == NULL || output == NULL) {
        free(input);
        free(output);
        return;
    }

    for (int c = 0; c < channels; c++) {
        for (int i = 0; i < size; i++) {
            input[size * c + i] = (float)c;
        }
    }

    // Number of leading elements already produced by the vector path.
    int done = 0;

#if __ARM_NEON
#if __aarch64__
    int nn = size >> 2;
    if (nn > 0) {
        const float *ptr = input;
        float *outprt = output;
        float32x4_t _alpha = vdupq_n_f32(alpha);
        float32x4_t _beta = vdupq_n_f32(beta);
        done = nn << 2;

        asm volatile(
            "0:                                 \n"
            "prfm   pldl1keep, [%1, #64]        \n"
            "ld1    {v2.4s}, [%1], #16          \n"
            "fmul   v2.4s, v2.4s, %3.4s         \n"
            "fadd   v3.4s, v2.4s, %4.4s         \n"
            "subs   %w0, %w0, #1                \n"
            "st1    {v3.4s}, [%2], #16          \n"
            "bne    0b                          \n"
            // Source and destination are separate read-write operands. The
            // previous operand list tied the load pointer to `output` and
            // never passed `input` at all.
            : "+r"(nn),      // %0
              "+r"(ptr),     // %1
              "+r"(outprt)   // %2
            : "w"(_alpha),   // %3
              "w"(_beta)     // %4
            : "cc", "memory", "v2", "v3");
    }
#endif // __aarch64__
#endif // __ARM_NEON

    // Scalar remainder: size % 4 elements after the vector loop, or all of
    // them when the assembly path is not compiled in.
    for (int i = done; i < size; i++) {
        output[i] = input[i] * alpha + beta;
    }

    for (int i = 0; i < 42; i++) {
        printf("OUTPUT -- %f", output[i]);
    }

    free(output);
    free(input);
}
/// Debug helper: prints the four float lanes of a NEON vector on one line.
///
/// @param vec The vector whose lanes are printed, lane 0 first.
/// @param x   Label printed at the start of the line.
void v_print(float32x4_t vec, std::string x)
{
    // Spill the register to memory so the lanes can be handed to printf.
    float32_t lanes[4];
    vst1q_f32(lanes, vec);

    const char *label = x.c_str();
    printf("%s - v0 -- %f -- %f -- %f -- %f \n", label, lanes[0], lanes[1], lanes[2], lanes[3]);
}
@end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment