Skip to content

Instantly share code, notes, and snippets.

@wjchen
Created June 1, 2016 08:37
Show Gist options
  • Select an option

  • Save wjchen/4e99fd23ba1c940466bc7a2837a3ff09 to your computer and use it in GitHub Desktop.

Select an option

Save wjchen/4e99fd23ba1c940466bc7a2837a3ff09 to your computer and use it in GitHub Desktop.
yuyv to bgr
#include <arm_neon.h>
/* Saturate an intermediate colour value to the 0..255 range of an output byte. */
static inline unsigned char yuyv_clamp_u8(int value) {
    if (value < 0) {
        return 0;
    }
    if (value > 255) {
        return 255;
    }
    return (unsigned char)value;
}

/**
 * Convert packed YUYV data to packed BGR (scalar implementation).
 *
 * Every 4-byte input group is read as Y0, U, Y1, V and produces two 3-byte
 * output pixels in B, G, R order, i.e. 6 output bytes per 4 input bytes.
 * The chroma offsets are computed with Q14 fixed-point coefficients
 * (coefficient / 16384) and are shared by both pixels of a group.
 *
 * @param yuv  input buffer holding at least `len` bytes
 * @param len  number of input bytes; only complete 4-byte groups are
 *             converted, a trailing partial group (len % 4 bytes) is ignored
 * @param bgr  output buffer with room for (len / 4) * 6 bytes
 *
 * Results outside 0..255 are saturated rather than wrapped modulo 256.
 */
void yuyv2bgr(const unsigned char *yuv, int len, unsigned char *bgr) {
    int i;
    int j = 0;

    /* `len - i >= 4` (instead of `i < len`) guarantees the whole group lies
     * inside the input and cannot overflow for large `len`. */
    for (i = 0; len - i >= 4; i += 4) {
        const unsigned char *pyuv = yuv + i;
        unsigned char *pbgr = bgr + j;
        const int y0 = pyuv[0];
        const int y1 = pyuv[2];
        const int u = pyuv[1] - 128;
        const int v = pyuv[3] - 128;

        /* Right shift of a negative product is implementation-defined in C;
         * an arithmetic shift (as on GCC/Clang) is assumed here, matching the
         * vshrq_n_s32 used by the NEON variant. */
        const int r = (22987 * v) >> 14;
        const int g = (-5636 * u - 11698 * v) >> 14;
        const int b = (29049 * u) >> 14;

        pbgr[0] = yuyv_clamp_u8(y0 + b);
        pbgr[1] = yuyv_clamp_u8(y0 + g);
        pbgr[2] = yuyv_clamp_u8(y0 + r);
        pbgr[3] = yuyv_clamp_u8(y1 + b);
        pbgr[4] = yuyv_clamp_u8(y1 + g);
        pbgr[5] = yuyv_clamp_u8(y1 + r);
        j += 6;
    }
}
/* Saturate a scalar colour value to 0..255; used for the non-vector tail. */
static inline unsigned char yuyv_neon_tail_clamp(int value) {
    return (unsigned char)(value < 0 ? 0 : (value > 255 ? 255 : value));
}

/**
 * Convert packed YUYV data to packed BGR using NEON intrinsics.
 *
 * Every 4-byte input group is read as Y0, U, Y1, V and produces two 3-byte
 * output pixels in B, G, R order. The main loop converts 8 groups
 * (32 input bytes -> 48 output bytes) per iteration; any remaining complete
 * 4-byte groups are converted by a scalar tail loop using the same Q14
 * fixed-point coefficients.
 *
 * @param yuv  input buffer holding at least `len` bytes
 * @param len  number of input bytes; a trailing partial group (len % 4 bytes)
 *             is ignored
 * @param bgr  output buffer with room for (len / 4) * 6 bytes
 *
 * Results outside 0..255 are saturated to the byte range.
 */
void yuyv2bgr_neon(const unsigned char *yuv, int len, unsigned char *bgr) {
    enum { VEC_IN_BYTES = 4 * 8, VEC_OUT_BYTES = 6 * 8 };
    const int16x8_t half = vdupq_n_s16(128);
    int i = 0;
    int j = 0;

    /* Only run the vector body while a full 32-byte chunk is available, so
     * the load/store never touch memory beyond `len` input bytes. */
    for (; len - i >= VEC_IN_BYTES; i += VEC_IN_BYTES, j += VEC_OUT_BYTES) {
        /* De-interleaving load: val[0]=Y0, val[1]=U, val[2]=Y1, val[3]=V. */
        const uint8x8x4_t px = vld4_u8(yuv + i);
        const int16x8_t y0 = vreinterpretq_s16_u16(vmovl_u8(px.val[0]));
        const int16x8_t y1 = vreinterpretq_s16_u16(vmovl_u8(px.val[2]));
        const int16x8_t u = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(px.val[1])), half);
        const int16x8_t v = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(px.val[3])), half);
        const int16x4_t u_lo = vget_low_s16(u);
        const int16x4_t u_hi = vget_high_s16(u);
        const int16x4_t v_lo = vget_low_s16(v);
        const int16x4_t v_hi = vget_high_s16(v);

        /* The products need 32 bits, but after >> 14 every offset fits in
         * 16 bits (|offset| < 256), so narrow right after the shift. */
        //(29049 * ((pyuv)[1] - 128)) >> 14
        const int16x8_t db = vcombine_s16(
            vshrn_n_s32(vmull_n_s16(u_lo, 29049), 14),
            vshrn_n_s32(vmull_n_s16(u_hi, 29049), 14));
        //(-5636 * ((pyuv)[1] - 128) - 11698 * ((pyuv)[3] - 128)) >> 14
        const int16x8_t dg = vcombine_s16(
            vshrn_n_s32(vaddq_s32(vmull_n_s16(u_lo, -5636), vmull_n_s16(v_lo, -11698)), 14),
            vshrn_n_s32(vaddq_s32(vmull_n_s16(u_hi, -5636), vmull_n_s16(v_hi, -11698)), 14));
        //(22987 * ((pyuv)[3] - 128)) >> 14
        const int16x8_t dr = vcombine_s16(
            vshrn_n_s32(vmull_n_s16(v_lo, 22987), 14),
            vshrn_n_s32(vmull_n_s16(v_hi, 22987), 14));

        /* luma + offset stays within int16; vqmovun_s16 saturates to 0..255. */
        const uint8x8_t b0 = vqmovun_s16(vaddq_s16(y0, db));
        const uint8x8_t g0 = vqmovun_s16(vaddq_s16(y0, dg));
        const uint8x8_t r0 = vqmovun_s16(vaddq_s16(y0, dr));
        const uint8x8_t b1 = vqmovun_s16(vaddq_s16(y1, db));
        const uint8x8_t g1 = vqmovun_s16(vaddq_s16(y1, dg));
        const uint8x8_t r1 = vqmovun_s16(vaddq_s16(y1, dr));

        /* Zip first/second pixel of each group back into pixel order, then
         * let the interleaving store emit B,G,R triples byte by byte. */
        const uint8x8x2_t bz = vzip_u8(b0, b1);
        const uint8x8x2_t gz = vzip_u8(g0, g1);
        const uint8x8x2_t rz = vzip_u8(r0, r1);
        uint8x16x3_t out;
        out.val[0] = vcombine_u8(bz.val[0], bz.val[1]);
        out.val[1] = vcombine_u8(gz.val[0], gz.val[1]);
        out.val[2] = vcombine_u8(rz.val[0], rz.val[1]);
        vst3q_u8(bgr + j, out);
    }

    /* Scalar tail for the complete groups that do not fill a whole vector.
     * Assumes `>>` on a negative int is an arithmetic shift (true for
     * GCC/Clang), matching the vector path. */
    for (; len - i >= 4; i += 4, j += 6) {
        const unsigned char *src = yuv + i;
        unsigned char *dst = bgr + j;
        const int cu = src[1] - 128;
        const int cv = src[3] - 128;
        const int r = (22987 * cv) >> 14;
        const int g = (-5636 * cu - 11698 * cv) >> 14;
        const int b = (29049 * cu) >> 14;

        dst[0] = yuyv_neon_tail_clamp(src[0] + b);
        dst[1] = yuyv_neon_tail_clamp(src[0] + g);
        dst[2] = yuyv_neon_tail_clamp(src[0] + r);
        dst[3] = yuyv_neon_tail_clamp(src[2] + b);
        dst[4] = yuyv_neon_tail_clamp(src[2] + g);
        dst[5] = yuyv_neon_tail_clamp(src[2] + r);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment