Skip to content

Instantly share code, notes, and snippets.

@tamarous
Created February 8, 2017 06:52
Show Gist options
  • Save tamarous/550e2bec07fbef79e8808ab4b238b2ff to your computer and use it in GitHub Desktop.
Save tamarous/550e2bec07fbef79e8808ab4b238b2ff to your computer and use it in GitHub Desktop.
使用 SSE2 SIMD 指令集将平面 YUV420 图像转换为 32 位 RGB
/*
 * Fixed-point conversion factors, scaled by 64 (results are shifted right by
 * YUV420_SHIFT).  They approximate 1.164, 1.596, 0.391, 0.813 and 2.018,
 * i.e. the usual video-range YCbCr -> RGB matrix.
 */
enum {
    YUV420_Y_OFFSET  = 16,
    YUV420_UV_OFFSET = 128,
    YUV420_FAC_Y     = 74,
    YUV420_FAC_RV    = 102,
    YUV420_FAC_GU    = 25,
    YUV420_FAC_GV    = 52,
    YUV420_FAC_BU    = 129,
    YUV420_SHIFT     = 6
};

/* Pre-multiplied chroma contributions for 16 horizontally adjacent pixels. */
struct yuv420_chroma16 {
    __m128i rv_lo, rv_hi;   /* added to luma for red              */
    __m128i g_lo, g_hi;     /* subtracted from luma for green     */
    __m128i bu_lo, bu_hi;   /* added to luma for blue             */
};

/* Shift a fixed-point accumulator down and clamp it to 0..255. */
static uint8_t yuv420_scale_clamp(int acc)
{
    if (acc <= 0)
        return 0;           /* avoids right-shifting a negative value */
    acc >>= YUV420_SHIFT;
    return (uint8_t)(acc > 255 ? 255 : acc);
}

/* Scalar conversion of one pixel; produces the same value as the SIMD path. */
static uint32_t yuv420_pixel(uint8_t luma, uint8_t cb, uint8_t cr)
{
    const int yt = ((int)luma - YUV420_Y_OFFSET) * YUV420_FAC_Y;
    const int u  = (int)cb - YUV420_UV_OFFSET;
    const int v  = (int)cr - YUV420_UV_OFFSET;
    const uint32_t r = yuv420_scale_clamp(yt + YUV420_FAC_RV * v);
    const uint32_t g = yuv420_scale_clamp(yt - YUV420_FAC_GU * u - YUV420_FAC_GV * v);
    const uint32_t b = yuv420_scale_clamp(yt + YUV420_FAC_BU * u);

    return r | (g << 8) | (b << 16);
}

/* Load 8 U and 8 V samples and expand them to the chroma terms of 16 pixels. */
static inline void yuv420_load_chroma16(const uint8_t *u8, const uint8_t *v8,
                                        struct yuv420_chroma16 *c)
{
    const __m128i zero  = _mm_setzero_si128();
    const __m128i uvsub = _mm_set1_epi16(YUV420_UV_OFFSET);
    const __m128i facrv = _mm_set1_epi16(YUV420_FAC_RV);
    const __m128i facgu = _mm_set1_epi16(YUV420_FAC_GU);
    const __m128i facgv = _mm_set1_epi16(YUV420_FAC_GV);
    const __m128i facbu = _mm_set1_epi16(YUV420_FAC_BU);

    /* widen to 16 bit, then duplicate each sample so it lines up with 2 lumas */
    const __m128i u16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)u8), zero);
    const __m128i v16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)v8), zero);
    const __m128i u_lo = _mm_sub_epi16(_mm_unpacklo_epi16(u16, u16), uvsub);
    const __m128i u_hi = _mm_sub_epi16(_mm_unpackhi_epi16(u16, u16), uvsub);
    const __m128i v_lo = _mm_sub_epi16(_mm_unpacklo_epi16(v16, v16), uvsub);
    const __m128i v_hi = _mm_sub_epi16(_mm_unpackhi_epi16(v16, v16), uvsub);

    c->rv_lo = _mm_mullo_epi16(facrv, v_lo);
    c->rv_hi = _mm_mullo_epi16(facrv, v_hi);
    c->g_lo  = _mm_add_epi16(_mm_mullo_epi16(facgu, u_lo), _mm_mullo_epi16(facgv, v_lo));
    c->g_hi  = _mm_add_epi16(_mm_mullo_epi16(facgu, u_hi), _mm_mullo_epi16(facgv, v_hi));
    c->bu_lo = _mm_mullo_epi16(facbu, u_lo);
    c->bu_hi = _mm_mullo_epi16(facbu, u_hi);
}

/* Convert 16 luma samples with the given chroma terms and store 16 pixels. */
static inline void yuv420_emit16(__m128i luma16,
                                 const struct yuv420_chroma16 *c,
                                 uint32_t *dst)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i ysub = _mm_set1_epi16(YUV420_Y_OFFSET);
    const __m128i facy = _mm_set1_epi16(YUV420_FAC_Y);
    __m128i *out = (__m128i *)dst;

    const __m128i y_lo = _mm_mullo_epi16(_mm_sub_epi16(_mm_unpacklo_epi8(luma16, zero), ysub), facy);
    const __m128i y_hi = _mm_mullo_epi16(_mm_sub_epi16(_mm_unpackhi_epi8(luma16, zero), ysub), facy);

    /*
     * Saturating arithmetic: the blue sum can exceed INT16_MAX for luma >= 238
     * combined with a large U; a wrapping add would turn that into a negative
     * number and clamp blue to 0 instead of 255.
     */
    const __m128i r = _mm_packus_epi16(
        _mm_srai_epi16(_mm_adds_epi16(y_lo, c->rv_lo), YUV420_SHIFT),
        _mm_srai_epi16(_mm_adds_epi16(y_hi, c->rv_hi), YUV420_SHIFT));
    const __m128i g = _mm_packus_epi16(
        _mm_srai_epi16(_mm_subs_epi16(y_lo, c->g_lo), YUV420_SHIFT),
        _mm_srai_epi16(_mm_subs_epi16(y_hi, c->g_hi), YUV420_SHIFT));
    const __m128i b = _mm_packus_epi16(
        _mm_srai_epi16(_mm_adds_epi16(y_lo, c->bu_lo), YUV420_SHIFT),
        _mm_srai_epi16(_mm_adds_epi16(y_hi, c->bu_hi), YUV420_SHIFT));

    /* interleave to bytes R,G,B,0 per pixel */
    const __m128i rg_lo = _mm_unpacklo_epi8(r, g);      /* r g r g ... */
    const __m128i rg_hi = _mm_unpackhi_epi8(r, g);
    const __m128i b0_lo = _mm_unpacklo_epi8(b, zero);   /* b 0 b 0 ... */
    const __m128i b0_hi = _mm_unpackhi_epi8(b, zero);

    _mm_storeu_si128(out + 0, _mm_unpacklo_epi16(rg_lo, b0_lo));
    _mm_storeu_si128(out + 1, _mm_unpackhi_epi16(rg_lo, b0_lo));
    _mm_storeu_si128(out + 2, _mm_unpacklo_epi16(rg_hi, b0_hi));
    _mm_storeu_si128(out + 3, _mm_unpackhi_epi16(rg_hi, b0_hi));
}

/*
 * Convert a planar YUV 4:2:0 image to 32-bit pixels using SSE2.
 *
 * pYUV   : start of the source buffer.  The Y plane starts at pYUV, the U
 *          plane at pYUV + width*height and the V plane width*height/4 bytes
 *          after the U plane.
 * sy     : distance in bytes between consecutive Y rows.
 * suv    : distance in bytes between consecutive U (and V) rows.
 * width  : image width in pixels.
 * height : image height in pixels.
 * rgb    : destination buffer.
 * srgb   : distance between consecutive destination rows, in 32-bit pixels
 *          (it is added to a uint32_t pointer).
 *
 * Each output pixel is stored as the bytes R, G, B, 0 in memory, i.e. the
 * value 0x00BBGGRR when read as a little-endian uint32_t; the top byte is
 * always zero.
 *
 * Blocks of 16 pixels are converted with SSE2; any remaining pixels of a row
 * are converted with equivalent scalar code, and for an odd height the last
 * row is converted on its own, so nothing outside width x height is written.
 * The SIMD path loads 8 U and 8 V bytes per 16-pixel block.
 * NOTE(review): the plane offsets above are only exact for even width and
 * height -- confirm against the callers if odd sizes are expected.
 */
void yuv420_to_argb8888(uint8_t *pYUV,
                        uint32_t sy, uint32_t suv,
                        int width, int height,
                        uint32_t *rgb, uint32_t srgb)
{
    if (!pYUV || !rgb || width <= 0 || height <= 0)
        return;

    const uint8_t *y_plane = pYUV;
    const uint8_t *u_plane = pYUV + width * height;
    const uint8_t *v_plane = u_plane + width * height / 4;
    const int simd_width = width & ~15;     /* largest multiple of 16 <= width */

    for (int row = 0; row < height; row += 2) {
        const int has_row1 = (row + 1 < height);
        const uint8_t *y0 = y_plane + sy * row;
        const uint8_t *y1 = has_row1 ? y0 + sy : y0;
        const uint8_t *u  = u_plane + suv * (uint32_t)(row / 2);
        const uint8_t *v  = v_plane + suv * (uint32_t)(row / 2);
        uint32_t *d0 = rgb + srgb * row;
        uint32_t *d1 = has_row1 ? d0 + srgb : d0;
        int x = 0;

        for (; x < simd_width; x += 16) {
            struct yuv420_chroma16 chroma;

            /* one chroma sample is shared by a 2x2 block of pixels */
            yuv420_load_chroma16(u + x / 2, v + x / 2, &chroma);
            yuv420_emit16(_mm_loadu_si128((const __m128i *)(y0 + x)), &chroma, d0 + x);
            if (has_row1)
                yuv420_emit16(_mm_loadu_si128((const __m128i *)(y1 + x)), &chroma, d1 + x);
        }

        /* scalar tail for widths that are not a multiple of 16 */
        for (; x < width; ++x) {
            const uint8_t cb = u[x / 2];
            const uint8_t cr = v[x / 2];

            d0[x] = yuv420_pixel(y0[x], cb, cr);
            if (has_row1)
                d1[x] = yuv420_pixel(y1[x], cb, cr);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment