@AllanChen
Created August 16, 2019 09:37
Neon Test demo
#import "AppDelegate.h"
#include <stdio.h>
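// This file is Objective-C++ (compile as a .mm). It collects small NEON experiments:
// scalar vs. intrinsics vs. AArch64 inline-assembly versions of output = input * alpha + beta,
// a float -> int16 quantization loop, and CHW -> C4HW4 / C8HW8 layout repacks.
// NOTE: VenusCPU::Mat, VenusCPU::TensorShape and the LOGV macro come from the host
// project and are not part of this gist; pull in their headers here.
#if __ARM_NEON
void v_print(float32x4_t vec, std::string x); // debug helper defined at the bottom of this file
#endif
// Forward declarations so test_2 can call the repack helpers defined further down.
void mat_chwtoc4hw4_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat);
void mat_chwtoc8hw8_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat);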
@interface AppDelegate ()
@end
@implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
// [self namolData];
// [self test];
// [self test2];
// [self neonTest];
// [self test_cpp];
// [self neonPractice];
[self neonPractice2];
// [self test_2];
return YES;
}
- (void)test_2{
const int width = 8;
const int height = 8;
const int value = width * height;
float *input_value_f = new float[value];
for(int i=0; i<value; i++){
input_value_f[i] = (float)i;
}
VenusCPU::Mat inMat = VenusCPU::Mat::from_float_array(input_value_f, width, height, 4);
VenusCPU::Mat outMat;
//#if __ARM_NEON
//#endif
// for (int c=0; c<inMat.shape.c; c++) {
// float * p0 = inMat.channel(c);
// for (int h=0; h<inMat.shape.h; h++){
// for(int w=0; w<inMat.shape.w; w++){
// float v0 = *(p0+ h * inMat.shape.w + w);
// LOGV("c %d h %d w %d -- value -- %f", c, h, w, v0);
// }
// }
// }
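// With both repack calls left commented out, outMat is never created and the
// verification loop below prints nothing; enable one of them to inspect the layout.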
// mat_chwtoc8hw8_int32(inMat,outMat);
// mat_chwtoc4hw4_int32(inMat, outMat);
for (int c=0; c< outMat.shape.c; c++) {
float * p0 = outMat.channel(c);
for (int h=0; h<outMat.shape.h; h++){
for(int w=0; w<outMat.shape.w; w++){
float v0 = *(p0+ h * outMat.shape.w + w);
LOGV("c %d h %d w %d -- value -- %f", c, h, w, v0);
}
}
}
}
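// Repack a CHW int32 Mat into C4HW4 layout: channels are grouped in fours and
// interleaved along the innermost dimension, so element (c, h, w) of inMat ends up at
// outMat.channel(c / 4)[(h * W + w) * 4 + c % 4]. The channel count is assumed to be
// a multiple of 4.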
void mat_chwtoc4hw4_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
const int C = inMat.shape.c;
const int H = inMat.shape.h;
const int W = inMat.shape.w;
outMat.create(VenusCPU::TensorShape(C / 4, H, W * 4, VenusCPU::TensorShape::Layout::C4HW4, sizeof(int), inMat.shape.step_size));
if (outMat.empty())
assert(false && "create mat failed!");
for (int z = 0; z < C / 4; z++) {
int * input0 = inMat.channel(z * 4 + 0);
int * input1 = inMat.channel(z * 4 + 1);
int * input2 = inMat.channel(z * 4 + 2);
int * input3 = inMat.channel(z * 4 + 3);
int * output0 = outMat.channel(z);
for (int i = 0; i < H * W; i++) {
output0[i*4 + 0] = input0[i];
output0[i*4 + 1] = input1[i];
output0[i*4 + 2] = input2[i];
output0[i*4 + 3] = input3[i];
}
}
}
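// Same repack with groups of 8 channels: element (c, h, w) of inMat ends up at
// outMat.channel(c / 8)[(h * W + w) * 8 + c % 8]. The channel count is assumed to be
// a multiple of 8.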
void mat_chwtoc8hw8_int32(VenusCPU::Mat &inMat, VenusCPU::Mat &outMat){
const int C = inMat.shape.c;
const int H = inMat.shape.h;
const int W = inMat.shape.w;
outMat.create(VenusCPU::TensorShape(C / 8, H, W * 8, VenusCPU::TensorShape::Layout::C8HW8, sizeof(int), inMat.shape.step_size));
if (outMat.empty())
assert(false && "create mat failed!");
for (int z = 0; z < C / 8; z++) {
int * input0 = inMat.channel(z * 8 + 0);
int * input1 = inMat.channel(z * 8 + 1);
int * input2 = inMat.channel(z * 8 + 2);
int * input3 = inMat.channel(z * 8 + 3);
int * input4 = inMat.channel(z * 8 + 4);
int * input5 = inMat.channel(z * 8 + 5);
int * input6 = inMat.channel(z * 8 + 6);
int * input7 = inMat.channel(z * 8 + 7);
int * output0 = outMat.channel(z);
for (int i = 0; i < H * W; i++) {
output0[i*8 + 0] = input0[i];
output0[i*8 + 1] = input1[i];
output0[i*8 + 2] = input2[i];
output0[i*8 + 3] = input3[i];
output0[i*8 + 4] = input4[i];
output0[i*8 + 5] = input5[i];
output0[i*8 + 6] = input6[i];
output0[i*8 + 7] = input7[i];
}
}
}
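// test_cpp: plain scalar loop over 100M floats (output = input * 2 + 3), timed with
// CACurrentMediaTime; presumably meant as a rough scalar reference point for the
// NEON benchmark in neonTest. Note the two buffers add up to roughly 800 MB.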
- (void)test_cpp{
double tic = CACurrentMediaTime();
const int m_total = 1024 * 1024 * 100; // 100M floats, ~400 MB per buffer
float *input = new float[m_total];
float *output = new float[m_total];
for(int i = 0; i< m_total; i++){
input[i] = 100.0f;
}
for(int i=0; i<m_total; i++){
output[i] = input[i] * 2 + 3;
}
printf("cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));
delete [] input;
delete [] output;
}
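// neonPractice2: quantize 100 floats to int16 with NEON intrinsics. Each iteration
// loads four float32x4 vectors, scales them by alpha (vmulq_f32), converts to int32
// (vcvtq_s32_f32), narrows to int16 (vmovn_s32), and stores all 16 results with
// vst4_s16. Note that vst4_s16 interleaves the four lanes (v0[0], v1[0], v2[0], v3[0],
// v0[1], ...), so the stored order is not the load order.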
- (void)neonPractice2{
const int value = 10 * 10;
float *input_value_f = new float[value];
int16_t *output_value_f = new int16_t[value](); // zero-initialized; the last few slots stay 0
for(int i=0; i<value; i++){
input_value_f[i] = (float)i;
}
// Each iteration consumes 16 floats (4 vectors of 4) and emits 16 int16s,
// so run value / 16 iterations to stay inside the input buffer.
int nn = value >> 4;
float alpha = 1.0f;
#if __ARM_NEON
const float *inputptr = input_value_f;
int16_t *outputptr = output_value_f;
float32x4_t _alpha = vdupq_n_f32(alpha); // broadcast alpha once, outside the loop
for (int j = 0; j < nn; ++j) {
int16x4x4_t output_i16quant;
int16x4x4_t output_i16quant2;
float32x4_t input0_f32 = vld1q_f32(inputptr);
inputptr += 4;
float32x4_t input0_f32mulstep = vmulq_f32(input0_f32, _alpha);
int32x4_t input0_i32mulstep = vcvtq_s32_f32(input0_f32mulstep);
output_i16quant.val[0] = vmovn_s32(input0_i32mulstep);
float32x4_t input1_f32 = vld1q_f32(inputptr);
inputptr += 4;
float32x4_t input1_f32mulstep = vmulq_f32(input1_f32, _alpha);
int32x4_t input1_i32mulstep = vcvtq_s32_f32(input1_f32mulstep);
output_i16quant.val[1] = vmovn_s32(input1_i32mulstep);
float32x4_t input2_f32 = vld1q_f32(inputptr);
inputptr += 4;
float32x4_t input2_f32mulstep = vmulq_f32(input2_f32, _alpha);
int32x4_t input2_i32mulstep = vcvtq_s32_f32(input2_f32mulstep);
output_i16quant.val[2] = vmovn_s32(input2_i32mulstep);
float32x4_t input3_f32 = vld1q_f32(inputptr);
inputptr += 4;
float32x4_t input3_f32mulstep = vmulq_f32(input3_f32, _alpha);
int32x4_t input3_i32mulstep = vcvtq_s32_f32(input3_f32mulstep);
output_i16quant.val[3] = vmovn_s32(input3_i32mulstep);
// float32x4_t input4_f32 = vld1q_f32(inputptr);
// inputptr += 4;
// float32x4_t input4_f32mulstep = vmulq_f32(input4_f32, _alpha);
// int32x4_t input4_i32mulstep = vcvtq_s32_f32(input4_f32mulstep);
// output_i16quant2.val[0] = vmovn_s32(input4_i32mulstep);
//
// float32x4_t input5_f32 = vld1q_f32(inputptr);
// inputptr += 4;
// float32x4_t input5_f32mulstep = vmulq_f32(input5_f32, _alpha);
// int32x4_t input5_i32mulstep = vcvtq_s32_f32(input5_f32mulstep);
// output_i16quant2.val[1] = vmovn_s32(input5_i32mulstep);
//
// float32x4_t input6_f32 = vld1q_f32(inputptr);
// inputptr += 4;
// float32x4_t input6_f32mulstep = vmulq_f32(input6_f32, _alpha);
// int32x4_t input6_i32mulstep = vcvtq_s32_f32(input6_f32mulstep);
// output_i16quant2.val[2] = vmovn_s32(input6_i32mulstep);
//
// float32x4_t input7_f32 = vld1q_f32(inputptr);
// inputptr += 4;
// float32x4_t input7_f32mulstep = vmulq_f32(input7_f32, _alpha);
// int32x4_t input7_i32mulstep = vcvtq_s32_f32(input7_f32mulstep);
// output_i16quant2.val[3] = vmovn_s32(input7_i32mulstep);
// vtrn1_s16(output_i16quant2.val[0], output_i16quant.val[0]);
// vtrn1_s16(output_i16quant2.val[2], output_i16quant.val[2]);
// vtrn1_s16(output_i16quant2.val[3], output_i16quant.val[3]);
vst4_s16(outputptr, output_i16quant);
// vst4_s16(outputptr, output_i16quant2);
outputptr +=16;
}
#endif
for(int i=0; i<value; i++){
printf("%hd\n", output_value_f[i]); // %hd: the outputs are signed 16-bit values
}
delete [] input_value_f;
delete [] output_value_f;
}
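// neonPractice: AArch64 inline-assembly scratchpad. Each iteration loads 8 ints
// (32 bytes) from the input and copies the first 4 of them to the output; the smull
// against _alpha is experimental and its result is discarded. Every pointer the asm
// advances is listed as an output operand ("=r") and tied to its initial value with a
// matching constraint ("0", "1", "2").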
- (void)neonPractice{
const int value = 10 * 10;
int *input_value_f = new int[value];
int *output_value_f = new int[value](); // zero-initialized
for(int i=0; i<value; i++){
input_value_f[i] = i + 100;
}
// Each asm iteration loads 8 ints (32 bytes) and stores 4 ints, so bound the
// loop by value / 8 to stay inside the input buffer.
int nn = value >> 3;
int alpha = 1;
#if __ARM_NEON
const int *inputptr = input_value_f;
int *outputptr = output_value_f;
int16x4_t _alpha = vdup_n_s16(alpha);
#if __aarch64__
if(nn > 0){
asm volatile(
#ifdef USE_PRFM
"prfm pldl1keep, [%2, #128] \n"
#endif
"0: \n"
"ld1 {v0.8h, v1.8h}, [%2], #32 \n"
"smull v2.4s, %6.4h, v0.h[4] \n"
"mov v2.4s, v0.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v2.4s}, [%1], #16 \n"
"bne 0b \n"
:
"=r"(nn), // %0
"=r"(outputptr), // %1
"=r"(inputptr) // %2, advanced by the post-indexed loads, so it must be an output operand
:
"0"(nn),
"1"(outputptr),
"2"(inputptr),
"w"(_alpha) // %6
:"cc","memory","v0","v1","v2"
);
}
#else
#endif
#endif
// outputptr was advanced past the written data inside the asm block,
// so read the results back through output_value_f.
for(int i=0; i< (value >> 3) * 4; i++){
printf("neon --output %d\n", output_value_f[i]);
}
delete [] input_value_f;
delete [] output_value_f;
}
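// neonTest: output = input * alpha + beta over 100M floats with an AArch64
// inline-assembly loop (4 floats per iteration), followed by a scalar tail for any
// leftover elements, with the total time printed at the end.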
- (void)neonTest{
double tic = CACurrentMediaTime();
const int total = 1024 * 1024 * 100;
#if __ARM_NEON
float *input = new float[total];
float *output = new float[total];
float alpha = 1.0f;
float beta = 2.0f;
int nn = total >> 2; // number of full 4-float vectors
int remain = total - (nn << 2); // leftover elements (0 here, total is a multiple of 4)
for(int i = 0; i<total; i++){
input[i] = 100.0f;
}
float32x4_t _alpha = vdupq_n_f32(alpha);
float32x4_t _beta = vdupq_n_f32(beta);
#else
#endif
#if __ARM_NEON
const float *inputptr = input;
float *outputptr = output; // results go into the separate output buffer
#if __aarch64__
if(nn > 0){
asm volatile(
"mov v0.4s, %6.4s \n" // keep alpha in v0 for the whole loop
"mov v1.4s, %7.4s \n" // keep beta in v1
"0: \n"
"prfm pldl1keep, [%2, #64] \n"
"ld1 {v2.4s}, [%2], #16 \n"
"fmul v3.4s, v2.4s, v0.4s \n"
"fadd v4.4s, v3.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v4.4s}, [%1], #16 \n"
"bne 0b \n"
:
"=r"(nn), // %0
"=r"(outputptr), // %1
"=r"(inputptr) // %2, advanced by the post-indexed load, so it must be an output operand
:
"0"(nn),
"1"(outputptr),
"2"(inputptr),
"w"(_alpha), // %6
"w"(_beta) // %7
:"cc","memory","v0","v1","v2","v3","v4"
);
}
#else
#endif // __aarch64__
// Scalar tail: inputptr and outputptr were advanced past the vectorized part by the
// asm block, so finish the remaining (total % 4) elements here.
for (int i=0; i<remain; i++){
outputptr[i] = inputptr[i] * alpha + beta;
}
delete [] input;
delete [] output;
#endif // __ARM_NEON
printf("neon -- cost %.4f ms\n", 1000 * (CACurrentMediaTime() - tic));
}
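// test: intrinsics version of the same multiply-add over a 3 x 42 float buffer,
// printing every intermediate vector through v_print; each row of 42 is processed as
// 10 full vectors plus a 2-element scalar tail.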
- (void)test{
const int size = 3 * 42;
float *input = (float *)malloc(size * sizeof(float));
float *output = (float *)malloc(size * sizeof(float));
float alpha = 1.0f;
float beta = 1.0f;
for (int i = 0 ; i < size; i++) {
input[i] = 0.0f;
}
#if __ARM_NEON
float32x4_t _alpha = vdupq_n_f32(alpha);
float32x4_t _beta = vdupq_n_f32(beta);
const float *inptr = input; // advance cursors, keep the base pointers for free()
float *outptr = output;
for(int y=0; y< 3; y++){
for(int x=0; x<42/4; x++){ // 10 full vectors per row of 42
float32x4_t v0 = vld1q_f32(inptr);
v_print(v0, "input");
float32x4_t mul = vmulq_f32(v0, _alpha);
v_print(mul, "mul");
float32x4_t sum = vaddq_f32(mul, _beta);
v_print(sum, "sum");
vst1q_f32(outptr, sum);
inptr +=4;
outptr +=4;
for (int r = 0; r<8 ;r++){
printf("result -- %f \n",output[r]);
}
}
for(int remain = 0; remain < 42 - (42/4)*4; remain ++){
*outptr = *inptr * alpha + beta; // scalar tail: the 2 leftover columns of the row
inptr++;
outptr++;
}
}
#else
#endif
free(input);
free(output);
}
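// namolData: the same multiply-add applied to the first channel (3 x 42) of a
// 16-channel test buffer, tracing each vector with v_print and finishing the 2
// leftover columns of each row with a scalar tail.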
- (void)namolData{
const int C = 16;
const int size = 3 * 42; // one channel: 3 rows of 42
float *input = (float *)malloc(C * size * sizeof(float));
float *output = (float *)malloc(C * size * sizeof(float));
for (int c = 0; c < C; c ++){
for (int i = 0 ; i < size; i++) {
input[size * c + i] = (float)c;
}
}
float alpha = 1.0f;
float beta = 1.0f;
#if __ARM_NEON
int nn = 42 >> 2; // full 4-float vectors per row of 42
int remain = 42 - (nn << 2); // 2 leftover elements per row
int outputIndex = 0;
float *ptr = input;
float *outptr = output;
float32x4_t _alpha = vdupq_n_f32(alpha);
float32x4_t _beta = vdupq_n_f32(beta);
for(int y = 0; y<3; ++y){
for (int x = 0; x < nn; ++x){
float32x4_t v0 = vld1q_f32(ptr);
v_print(v0, "input");
float32x4_t mul = vmulq_f32(v0, _alpha);
v_print(mul, "mul");
float32x4_t sum = vaddq_f32(mul, _beta);
v_print(sum, "sum");
vst1q_f32(outptr + outputIndex, sum);
ptr +=4;
for (int r = 0; r<8 ;r++){
printf("result -- %f \n",outptr[r]);
}
outputIndex += 4;
}
// Scalar tail: the columns of this row that do not fill a whole vector.
for(int i = 0; i < remain; ++i){
outptr[outputIndex] = ptr[0] * alpha + beta;
ptr++;
outputIndex++;
}
for (int r = 0; r<42 ;r++){
printf("result -- %f \n",outptr[r]);
}
printf("width end one loop\n");
}
#else
#endif
free(input);
free(output);
}
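// neonData: inline-assembly counterpart of namolData; processes the first channel
// (3 x 42) in 4-float vectors and finishes the remaining elements in scalar code.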
- (void)neonData{
const int C = 16;
const int size = 3 * 42;
float alpha = 1.0f;
float beta = 1.0f;
float *input = (float *)malloc(C * size * sizeof(float));
float *output = (float *)malloc(C * size * sizeof(float));
for (int c = 0; c < C; c ++){
for (int i = 0 ; i < size; i++) {
input[size * c + i] = (float)c;
}
}
#if __ARM_NEON
int nn = size >> 2;
int remain = size - (nn << 2);
float *ptr = input;
float *outptr = output;
float32x4_t _alpha = vdupq_n_f32(alpha);
float32x4_t _beta = vdupq_n_f32(beta);
#else
int remain = size;
#endif
#if __ARM_NEON
#if __aarch64__
if(nn > 0){
asm volatile(
"mov v0.4s, %6.4s \n" // keep alpha in v0 for the whole loop
"mov v1.4s, %7.4s \n" // keep beta in v1
"0: \n"
"prfm pldl1keep, [%1, #64] \n"
"ld1 {v2.4s}, [%1], #16 \n"
"fmul v2.4s, v2.4s, v0.4s \n"
"fadd v3.4s, v2.4s, v1.4s \n"
"subs %w0, %w0, #1 \n"
"st1 {v3.4s}, [%2], #16 \n"
"bne 0b \n"
:
"=r"(nn), // %0
"=r"(ptr), // %1, source cursor, advanced by the post-indexed load
"=r"(outptr) // %2, destination cursor, advanced by the post-indexed store
:
"0"(nn),
"1"(ptr),
"2"(outptr),
"w"(_alpha), // %6
"w"(_beta) // %7
: "cc","memory","v0","v1","v2","v3"
);
}
#else
#endif //__aarch64__
// Scalar tail for the last (size % 4) elements of the first channel.
for (int i=0; i<remain; i++){
outptr[i] = ptr[i] * alpha + beta;
}
for (int i=0; i<42; i++){
printf("OUTPUT -- %f\n",output[i]);
}
#endif //__ARM_NEON
free(input);
free(output);
}
#if __ARM_NEON
// Debug helper: print the four lanes of a float32x4_t with a label.
void v_print(float32x4_t vec, std::string x)
{
float a[4];
vst1q_f32(a, vec);
printf("%s - v0 -- %f -- %f -- %f -- %f \n",x.c_str(),a[0],a[1],a[2],a[3]);
}
#endif
@end