test_vinsertd.c:
#include <stdio.h>
#include <stdint.h>
#include <altivec.h>
/*
 * 128-bit value that can be viewed either as an AltiVec vector or as arrays
 * of plain lanes.  The array members index lanes in memory order.
 * `v` is the first member, so a `{{0}}` initializer goes through the vector
 * view.
 */
typedef union {
vector unsigned char v; /* vector view; this is what the inline asm operands use */
uint64_t u64[2];        /* two 64-bit lanes */
uint32_t u32[4];        /* four 32-bit lanes */
uint8_t u8[16];         /* sixteen bytes */
double d64[2];          /* two double-precision lanes */
} v128;
/*
 * Print both 64-bit lanes of a 128-bit value on a single line.
 *
 * label: text printed in front of the lanes.
 * val:   value to dump; u64[0] is printed as "elem[0](low)" and u64[1] as
 *        "elem[1](high)".
 *
 * The lanes are cast to unsigned long long so that the conversion specifier
 * matches the argument type regardless of what uint64_t is a typedef for.
 */
void print_v128(const char* label, v128 val) {
    printf("%s: elem[0](low)=0x%016llx elem[1](high)=0x%016llx\n",
           label,
           (unsigned long long)val.u64[0],
           (unsigned long long)val.u64[1]);
}
/*
 * Print the eight 16-bit halfwords of a vector in memory order, each as
 * "hw[i]=0x.... ", without a trailing newline.
 *
 * The halfwords are read through a union instead of a uint16_t pointer cast,
 * so the access stays within the strict-aliasing rules.
 */
static void print_halfwords(v128 val) {
    union {
        v128 whole;
        uint16_t hw[8];
    } view;

    view.whole = val;
    for (int i = 0; i < 8; i++) {
        printf("hw[%d]=0x%04x ", i, view.hw[i]);
    }
}

/*
 * Exercise vinsertd / vextractd / vinsertw / vextractuw / vinserth with
 * their immediate (UIM) operands and dump the resulting vectors, plus a
 * mtvsrdd / mfvsrd / mfvsrld reference, to show how the ISA's big-endian
 * element numbering maps onto the little-endian C arrays of v128.
 * Returns 0.
 */
int main(void) {
    printf("=== vinsertd UIM behavior in LE mode ===\n\n");

    v128 base;
    base.u64[0] = 0xAAAAAAAAAAAAAAAAULL;
    base.u64[1] = 0xBBBBBBBBBBBBBBBBULL;
    print_v128("base", base);

    // vinsertd source: ISA says it takes dw0 of VRB
    // On LE, dw0 = high addressed dw = elem[1] in C array
    v128 src;
    src.u64[0] = 0xDEADDEADDEADDEADULL;
    src.u64[1] = 0x1111111111111111ULL;
    print_v128("src (1111 in elem[1], DEAD in elem[0])", src);

    printf("\n--- Testing vinsertd with different UIM values ---\n");
    {
        v128 result = base;
        __asm__ volatile("vinsertd %0, %1, 0" : "+v"(result.v) : "v"(src.v));
        print_v128("vinsertd UIM=0", result);
    }
    {
        v128 result = base;
        __asm__ volatile("vinsertd %0, %1, 1" : "+v"(result.v) : "v"(src.v));
        print_v128("vinsertd UIM=1", result);
    }
    {
        v128 result = base;
        __asm__ volatile("vinsertd %0, %1, 8" : "+v"(result.v) : "v"(src.v));
        print_v128("vinsertd UIM=8", result);
    }

    printf("\n=== vextractd UIM behavior in LE mode ===\n");
    {
        v128 result = {{0}};
        __asm__ volatile("vextractd %0, %1, 0" : "=v"(result.v) : "v"(base.v));
        print_v128("vextractd UIM=0 from base", result);
    }
    {
        v128 result = {{0}};
        __asm__ volatile("vextractd %0, %1, 1" : "=v"(result.v) : "v"(base.v));
        print_v128("vextractd UIM=1 from base", result);
    }
    {
        v128 result = {{0}};
        __asm__ volatile("vextractd %0, %1, 8" : "=v"(result.v) : "v"(base.v));
        print_v128("vextractd UIM=8 from base", result);
    }

    printf("\n=== MTVSRDD / MFVSRD / MFVSRLD reference ===\n");
    {
        v128 result;
        uint64_t ra = 0xAAAAAAAAAAAAAAAAULL;
        uint64_t rb = 0xBBBBBBBBBBBBBBBBULL;
        /*
         * RA uses the "b" constraint rather than "r": mtvsrdd treats an RA
         * field of 0 as the constant zero, not as r0, so r0 must never be
         * allocated for that operand.
         */
        __asm__ volatile("mtvsrdd %x0, %1, %2" : "=v"(result.v) : "b"(ra), "r"(rb));
        printf("MTVSRDD(RA=0xAAAA, RB=0xBBBB):\n");
        print_v128(" result", result);
    }
    {
        uint64_t hi, lo;
        __asm__ volatile("mfvsrd %0, %x2\n\tmfvsrld %1, %x2"
                         : "=r"(hi), "=r"(lo) : "v"(base.v));
        printf("From base (elem[0]=0xAAAA, elem[1]=0xBBBB):\n");
        printf(" MFVSRD = 0x%016llx\n", (unsigned long long)hi);
        printf(" MFVSRLD = 0x%016llx\n", (unsigned long long)lo);
    }

    printf("\n=== vinsertw UIM behavior in LE mode ===\n");
    v128 wbase;
    wbase.u32[0] = 0x00000000;
    wbase.u32[1] = 0x11111111;
    wbase.u32[2] = 0x22222222;
    wbase.u32[3] = 0x33333333;
    printf("wbase: w[0]=0x%08x w[1]=0x%08x w[2]=0x%08x w[3]=0x%08x\n",
           wbase.u32[0], wbase.u32[1], wbase.u32[2], wbase.u32[3]);

    /*
     * vinsertw source: word element 1 of VRB in BE numbering, i.e. the LOW
     * 32 bits of dw0.  The constant below puts 0xFFFFFFFF in the upper 32
     * bits of dw0 (BE word 0), so the word that actually gets inserted is
     * 0x00000000.  The insert position therefore shows up as a zeroed word,
     * and is not visible where the destination word is already zero (w[0]).
     */
    v128 wsrc;
    wsrc.u64[0] = 0;
    wsrc.u64[1] = 0xFFFFFFFF00000000ULL; // 0xFFFFFFFF in BE word 0, 0 in BE word 1
    printf("wsrc: elem[0]=0x%016llx elem[1]=0x%016llx\n",
           (unsigned long long)wsrc.u64[0], (unsigned long long)wsrc.u64[1]);

    for (int uim = 0; uim <= 12; uim += 4) {
        v128 result = wbase;
        // UIM is an instruction immediate, so each value needs its own asm statement.
        switch (uim) {
        case 0: __asm__ volatile("vinsertw %0, %1, 0" : "+v"(result.v) : "v"(wsrc.v)); break;
        case 4: __asm__ volatile("vinsertw %0, %1, 4" : "+v"(result.v) : "v"(wsrc.v)); break;
        case 8: __asm__ volatile("vinsertw %0, %1, 8" : "+v"(result.v) : "v"(wsrc.v)); break;
        case 12: __asm__ volatile("vinsertw %0, %1, 12" : "+v"(result.v) : "v"(wsrc.v)); break;
        default: break;
        }
        printf("vinsertw UIM=%2d: w[0]=0x%08x w[1]=0x%08x w[2]=0x%08x w[3]=0x%08x\n",
               uim, result.u32[0], result.u32[1], result.u32[2], result.u32[3]);
    }

    printf("\n=== vextractuw UIM behavior in LE mode ===\n");
    // Extract from wbase
    for (int uim = 0; uim <= 12; uim += 4) {
        v128 result = {{0}};
        switch (uim) {
        case 0: __asm__ volatile("vextractuw %0, %1, 0" : "=v"(result.v) : "v"(wbase.v)); break;
        case 4: __asm__ volatile("vextractuw %0, %1, 4" : "=v"(result.v) : "v"(wbase.v)); break;
        case 8: __asm__ volatile("vextractuw %0, %1, 8" : "=v"(result.v) : "v"(wbase.v)); break;
        case 12: __asm__ volatile("vextractuw %0, %1, 12" : "=v"(result.v) : "v"(wbase.v)); break;
        default: break;
        }
        printf("vextractuw UIM=%2d: elem[0]=0x%016llx elem[1]=0x%016llx\n",
               uim, (unsigned long long)result.u64[0], (unsigned long long)result.u64[1]);
    }

    printf("\n=== vinserth UIM behavior in LE mode ===\n");
    v128 hbase;
    hbase.u64[0] = 0x3333333322222222ULL; // hw: 2222 2222 3333 3333
    hbase.u64[1] = 0x7777777766666666ULL; // hw: 6666 6666 7777 7777
    printf("hbase: ");
    print_halfwords(hbase);
    printf("\n");

    /*
     * vinserth source: halfword element 3 of VRB in BE numbering, i.e. the
     * LOW 16 bits of dw0.  The constant below puts 0xFFFF in BE halfword 1
     * of dw0, so the halfword that actually gets inserted is 0x0000 and the
     * insert position shows up as a zeroed halfword.
     */
    v128 hsrc;
    hsrc.u64[0] = 0;
    hsrc.u64[1] = 0x0000FFFF00000000ULL; // 0xFFFF in BE halfword 1, 0 in BE halfword 3

    for (int uim = 0; uim <= 14; uim += 2) {
        v128 result = hbase;
        switch (uim) {
        case 0: __asm__ volatile("vinserth %0, %1, 0" : "+v"(result.v) : "v"(hsrc.v)); break;
        case 2: __asm__ volatile("vinserth %0, %1, 2" : "+v"(result.v) : "v"(hsrc.v)); break;
        case 4: __asm__ volatile("vinserth %0, %1, 4" : "+v"(result.v) : "v"(hsrc.v)); break;
        case 6: __asm__ volatile("vinserth %0, %1, 6" : "+v"(result.v) : "v"(hsrc.v)); break;
        case 8: __asm__ volatile("vinserth %0, %1, 8" : "+v"(result.v) : "v"(hsrc.v)); break;
        case 10: __asm__ volatile("vinserth %0, %1, 10" : "+v"(result.v) : "v"(hsrc.v)); break;
        case 12: __asm__ volatile("vinserth %0, %1, 12" : "+v"(result.v) : "v"(hsrc.v)); break;
        case 14: __asm__ volatile("vinserth %0, %1, 14" : "+v"(result.v) : "v"(hsrc.v)); break;
        default: break;
        }
        printf("vinserth UIM=%2d: ", uim);
        print_halfwords(result);
        printf("\n");
    }
    return 0;
}
$ gcc -O0 -maltivec -mvsx -mcpu=power9 -o /tmp/test_vinsertd /tmp/test_vinsertd.c && /tmp/test_vinsertd
=== vinsertd UIM behavior in LE mode ===
base: elem[0](low)=0xaaaaaaaaaaaaaaaa elem[1](high)=0xbbbbbbbbbbbbbbbb
src (1111 in elem[1], DEAD in elem[0]): elem[0](low)=0xdeaddeaddeaddead elem[1](high)=0x1111111111111111
--- Testing vinsertd with different UIM values ---
vinsertd UIM=0: elem[0](low)=0xaaaaaaaaaaaaaaaa elem[1](high)=0x1111111111111111
vinsertd UIM=1: elem[0](low)=0x11aaaaaaaaaaaaaa elem[1](high)=0xbb11111111111111
vinsertd UIM=8: elem[0](low)=0x1111111111111111 elem[1](high)=0xbbbbbbbbbbbbbbbb
=== vextractd UIM behavior in LE mode ===
vextractd UIM=0 from base: elem[0](low)=0x0000000000000000 elem[1](high)=0xbbbbbbbbbbbbbbbb
vextractd UIM=1 from base: elem[0](low)=0x0000000000000000 elem[1](high)=0xbbbbbbbbbbbbbbaa
vextractd UIM=8 from base: elem[0](low)=0x0000000000000000 elem[1](high)=0xaaaaaaaaaaaaaaaa
=== MTVSRDD / MFVSRD / MFVSRLD reference ===
MTVSRDD(RA=0xAAAA, RB=0xBBBB):
result: elem[0](low)=0xbbbbbbbbbbbbbbbb elem[1](high)=0xaaaaaaaaaaaaaaaa
From base (elem[0]=0xAAAA, elem[1]=0xBBBB):
MFVSRD = 0xbbbbbbbbbbbbbbbb
MFVSRLD = 0xaaaaaaaaaaaaaaaa
=== vinsertw UIM behavior in LE mode ===
wbase: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x22222222 w[3]=0x33333333
wsrc: elem[0]=0x0000000000000000 elem[1]=0xffffffff00000000
vinsertw UIM= 0: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x22222222 w[3]=0x00000000
vinsertw UIM= 4: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x00000000 w[3]=0x33333333
vinsertw UIM= 8: w[0]=0x00000000 w[1]=0x00000000 w[2]=0x22222222 w[3]=0x33333333
vinsertw UIM=12: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x22222222 w[3]=0x33333333
=== vextractuw UIM behavior in LE mode ===
vextractuw UIM= 0: elem[0]=0x0000000000000000 elem[1]=0x0000000033333333
vextractuw UIM= 4: elem[0]=0x0000000000000000 elem[1]=0x0000000022222222
vextractuw UIM= 8: elem[0]=0x0000000000000000 elem[1]=0x0000000011111111
vextractuw UIM=12: elem[0]=0x0000000000000000 elem[1]=0x0000000000000000
=== vinserth UIM behavior in LE mode ===
hbase: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777
vinserth UIM= 0: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x0000
vinserth UIM= 2: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x0000 hw[7]=0x7777
vinserth UIM= 4: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x0000 hw[6]=0x7777 hw[7]=0x7777
vinserth UIM= 6: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x0000 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777
vinserth UIM= 8: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x0000 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777
vinserth UIM=10: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x0000 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777
vinserth UIM=12: hw[0]=0x2222 hw[1]=0x0000 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777
vinserth UIM=14: hw[0]=0x0000 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777
test_vextract2.c:
#include <stdio.h>
#include <stdint.h>
#include <altivec.h>
/*
 * 128-bit value that can be viewed either as an AltiVec vector or as arrays
 * of plain integer lanes.  The array members index lanes in memory order.
 * `v` is the first member, so a `{{0}}` initializer goes through the vector
 * view.
 */
typedef union {
vector unsigned char v; /* vector view; this is what the inline asm operands use */
uint64_t u64[2];        /* two 64-bit lanes */
uint32_t u32[4];        /* four 32-bit lanes */
uint16_t u16[8];        /* eight 16-bit lanes */
} v128;
int main() {
printf("=== vextractuh UIM behavior in LE mode ===\n");
v128 hbase;
hbase.u16[0] = 0x0000;
hbase.u16[1] = 0x1111;
hbase.u16[2] = 0x2222;
hbase.u16[3] = 0x3333;
hbase.u16[4] = 0x4444;
hbase.u16[5] = 0x5555;
hbase.u16[6] = 0x6666;
hbase.u16[7] = 0x7777;
printf("hbase: ");
for (int i = 0; i < 8; i++) printf("hw[%d]=0x%04x ", i, hbase.u16[i]);
printf("\n");
for (int uim = 0; uim <= 14; uim += 2) {
v128 result = {{0}};
switch(uim) {
case 0: __asm__ volatile("vextractuh %0, %1, 0" : "=v"(result.v) : "v"(hbase.v)); break;
case 2: __asm__ volatile("vextractuh %0, %1, 2" : "=v"(result.v) : "v"(hbase.v)); break;
case 4: __asm__ volatile("vextractuh %0, %1, 4" : "=v"(result.v) : "v"(hbase.v)); break;
case 6: __asm__ volatile("vextractuh %0, %1, 6" : "=v"(result.v) : "v"(hbase.v)); break;
case 8: __asm__ volatile("vextractuh %0, %1, 8" : "=v"(result.v) : "v"(hbase.v)); break;
case 10: __asm__ volatile("vextractuh %0, %1, 10" : "=v"(result.v) : "v"(hbase.v)); break;
case 12: __asm__ volatile("vextractuh %0, %1, 12" : "=v"(result.v) : "v"(hbase.v)); break;
case 14: __asm__ volatile("vextractuh %0, %1, 14" : "=v"(result.v) : "v"(hbase.v)); break;
}
printf("vextractuh UIM=%2d: result dw1(high)=0x%016lx\n", uim, result.u64[1]);
}
printf("\n=== Summary: UIM to x86 element mapping ===\n");
printf("vinsertd/vextractd:\n");
printf(" UIM=0 → LE elem[1] (x86 high qword)\n");
printf(" UIM=8 → LE elem[0] (x86 low qword)\n");
printf("vinsertw/vextractuw:\n");
printf(" UIM=0 → LE w[3] (x86 word 3, highest)\n");
printf(" UIM=4 → LE w[2] (x86 word 2)\n");
printf(" UIM=8 → LE w[1] (x86 word 1)\n");
printf(" UIM=12 → LE w[0] (x86 word 0, lowest)\n");
printf("\n=== What the vinsertw source actually extracts ===\n");
// The ISA says vinsertw takes "word 1 of VRB" in BE order
// Let's put different values in each word position and see what gets inserted
v128 wsrc_test;
wsrc_test.u32[0] = 0xAA000000;
wsrc_test.u32[1] = 0xBB111111;
wsrc_test.u32[2] = 0xCC222222;
wsrc_test.u32[3] = 0xDD333333;
printf("wsrc_test: w[0]=0x%08x w[1]=0x%08x w[2]=0x%08x w[3]=0x%08x\n",
wsrc_test.u32[0], wsrc_test.u32[1], wsrc_test.u32[2], wsrc_test.u32[3]);
v128 zero;
zero.u64[0] = 0; zero.u64[1] = 0;
{
v128 result = zero;
__asm__ volatile("vinsertw %0, %1, 12" : "+v"(result.v) : "v"(wsrc_test.v));
printf("vinsertw UIM=12 into zero: w[0]=0x%08x (shows which word of src is used)\n", result.u32[0]);
}
printf("\n=== What vinsertd source actually extracts ===\n");
v128 dsrc_test;
dsrc_test.u64[0] = 0xAAAAAAAAAAAAAAAAULL; // LE low = BE dw1
dsrc_test.u64[1] = 0xBBBBBBBBBBBBBBBBULL; // LE high = BE dw0
printf("dsrc_test: elem[0]=0xAAAA elem[1]=0xBBBB\n");
{
v128 result = zero;
__asm__ volatile("vinsertd %0, %1, 8" : "+v"(result.v) : "v"(dsrc_test.v));
printf("vinsertd UIM=8 into zero: elem[0]=0x%016lx (shows which dw of src)\n", result.u64[0]);
}
return 0;
}$ gcc -O0 -maltivec -mvsx -mcpu=power9 -o /tmp/test_vextract2 /tmp/test_vextract2.c && /tmp/test_vextract2
=== vextractuh UIM behavior in LE mode ===
hbase: hw[0]=0x0000 hw[1]=0x1111 hw[2]=0x2222 hw[3]=0x3333 hw[4]=0x4444 hw[5]=0x5555 hw[6]=0x6666 hw[7]=0x7777
vextractuh UIM= 0: result dw1(high)=0x0000000000007777
vextractuh UIM= 2: result dw1(high)=0x0000000000006666
vextractuh UIM= 4: result dw1(high)=0x0000000000005555
vextractuh UIM= 6: result dw1(high)=0x0000000000004444
vextractuh UIM= 8: result dw1(high)=0x0000000000003333
vextractuh UIM=10: result dw1(high)=0x0000000000002222
vextractuh UIM=12: result dw1(high)=0x0000000000001111
vextractuh UIM=14: result dw1(high)=0x0000000000000000
=== Summary: UIM to x86 element mapping ===
vinsertd/vextractd:
UIM=0 → LE elem[1] (x86 high qword)
UIM=8 → LE elem[0] (x86 low qword)
vinsertw/vextractuw:
UIM=0 → LE w[3] (x86 word 3, highest)
UIM=4 → LE w[2] (x86 word 2)
UIM=8 → LE w[1] (x86 word 1)
UIM=12 → LE w[0] (x86 word 0, lowest)
=== What the vinsertw source actually extracts ===
wsrc_test: w[0]=0xaa000000 w[1]=0xbb111111 w[2]=0xcc222222 w[3]=0xdd333333
vinsertw UIM=12 into zero: w[0]=0xcc222222 (shows which word of src is used)
=== What vinsertd source actually extracts ===
dsrc_test: elem[0]=0xAAAA elem[1]=0xBBBB
vinsertd UIM=8 into zero: elem[0]=0xbbbbbbbbbbbbbbbb (shows which dw of src)