Skip to content

Instantly share code, notes, and snippets.

@runlevel5
Created February 19, 2026 12:05
Show Gist options
  • Select an option

  • Save runlevel5/d3e851177ea1c72f4b60579e7f61b554 to your computer and use it in GitHub Desktop.

Select an option

Save runlevel5/d3e851177ea1c72f4b60579e7f61b554 to your computer and use it in GitHub Desktop.
Behaviour of vinsertd/vextractd (and the related word/halfword vector insert/extract instructions) on PPC64LE

test_vinsertd.c:

#include <stdio.h>
#include <stdint.h>
#include <altivec.h>
// 128-bit scratch value with several overlapping views of the same 16 bytes.
// The vector member is what gets bound to the inline-asm "v" operands; the
// integer arrays are used to set up inputs and to print results lane by lane.
// Array index 0 is always the lowest-addressed lane of each view.
typedef union {
    vector unsigned char v;   // operand form handed to the inline asm
    uint64_t u64[2];          // two 64-bit lanes ("elem[0]" / "elem[1]" in the output)
    uint32_t u32[4];          // four 32-bit lanes ("w[0]".."w[3]" in the output)
    uint8_t u8[16];           // byte view (not referenced by the code visible here)
    double d64[2];            // double view (not referenced by the code visible here)
} v128;
// Prints one line of the form "<label>: elem[0](low)=0x... elem[1](high)=0x...",
// showing the two 64-bit lanes of `val` as zero-padded 16-digit hex.
// elem[0] is val.u64[0] (lower address), elem[1] is val.u64[1].
void print_v128(const char* label, v128 val) {
    const uint64_t low_lane = val.u64[0];
    const uint64_t high_lane = val.u64[1];

    printf("%s: elem[0](low)=0x%016lx  elem[1](high)=0x%016lx\n",
           label, low_lane, high_lane);
}
// Prints the eight 16-bit lanes of `val` in array (memory) order as
// "hw[i]=0x.... " items, without a trailing newline.
// The lanes are read through a union view rather than a cast pointer so the
// access does not violate the strict-aliasing rules.
static void print_halfwords(v128 val) {
    union {
        v128 vec;
        uint16_t hw[8];
    } view = { .vec = val };

    for (int i = 0; i < 8; i++) {
        printf("hw[%d]=0x%04x ", i, view.hw[i]);
    }
}

// Exercises the POWER9 vector insert/extract instructions (vinsertd,
// vextractd, vinsertw, vextractuw, vinserth) plus mtvsrdd/mfvsrd/mfvsrld with
// known lane patterns and prints the resulting lanes, so the mapping between
// the UIM byte offset and the little-endian C array index can be read off the
// output. Every UIM is spelled as a literal in its own asm statement because
// the instruction needs an immediate; that is why the loops dispatch through a
// switch. Always returns 0.
int main(void) {
    printf("=== vinsertd UIM behavior in LE mode ===\n\n");
    v128 base;
    base.u64[0] = 0xAAAAAAAAAAAAAAAAULL;
    base.u64[1] = 0xBBBBBBBBBBBBBBBBULL;
    print_v128("base", base);
    // vinsertd source: ISA says it takes dw0 of VRB
    // On LE, dw0 = high addressed dw = elem[1] in C array
    v128 src;
    src.u64[0] = 0xDEADDEADDEADDEADULL;
    src.u64[1] = 0x1111111111111111ULL;
    print_v128("src (1111 in elem[1], DEAD in elem[0])", src);
    printf("\n--- Testing vinsertd with different UIM values ---\n");
    {
        v128 result = base;
        __asm__ volatile("vinsertd %0, %1, 0" : "+v"(result.v) : "v"(src.v));
        print_v128("vinsertd UIM=0", result);
    }
    {
        // UIM=1 is a deliberately unaligned byte offset.
        v128 result = base;
        __asm__ volatile("vinsertd %0, %1, 1" : "+v"(result.v) : "v"(src.v));
        print_v128("vinsertd UIM=1", result);
    }
    {
        v128 result = base;
        __asm__ volatile("vinsertd %0, %1, 8" : "+v"(result.v) : "v"(src.v));
        print_v128("vinsertd UIM=8", result);
    }
    printf("\n=== vextractd UIM behavior in LE mode ===\n");
    {
        v128 result = {{0}};
        __asm__ volatile("vextractd %0, %1, 0" : "=v"(result.v) : "v"(base.v));
        print_v128("vextractd UIM=0 from base", result);
    }
    {
        v128 result = {{0}};
        __asm__ volatile("vextractd %0, %1, 1" : "=v"(result.v) : "v"(base.v));
        print_v128("vextractd UIM=1 from base", result);
    }
    {
        v128 result = {{0}};
        __asm__ volatile("vextractd %0, %1, 8" : "=v"(result.v) : "v"(base.v));
        print_v128("vextractd UIM=8 from base", result);
    }
    printf("\n=== MTVSRDD / MFVSRD / MFVSRLD reference ===\n");
    {
        v128 result;
        uint64_t ra = 0xAAAAAAAAAAAAAAAAULL;
        uint64_t rb = 0xBBBBBBBBBBBBBBBBULL;
        // RA uses the "b" constraint (any GPR except r0): mtvsrdd treats an
        // RA field of 0 as the constant 0 rather than the contents of r0.
        __asm__ volatile("mtvsrdd %x0, %1, %2" : "=v"(result.v) : "b"(ra), "r"(rb));
        printf("MTVSRDD(RA=0xAAAA, RB=0xBBBB):\n");
        print_v128("  result", result);
    }
    {
        uint64_t hi, lo;
        __asm__ volatile("mfvsrd %0, %x2\n\tmfvsrld %1, %x2"
                         : "=r"(hi), "=r"(lo) : "v"(base.v));
        printf("From base (elem[0]=0xAAAA, elem[1]=0xBBBB):\n");
        printf("  MFVSRD  = 0x%016lx\n", hi);
        printf("  MFVSRLD = 0x%016lx\n", lo);
    }
    printf("\n=== vinsertw UIM behavior in LE mode ===\n");
    v128 wbase;
    wbase.u32[0] = 0x00000000;
    wbase.u32[1] = 0x11111111;
    wbase.u32[2] = 0x22222222;
    wbase.u32[3] = 0x33333333;
    printf("wbase: w[0]=0x%08x w[1]=0x%08x w[2]=0x%08x w[3]=0x%08x\n",
           wbase.u32[0], wbase.u32[1], wbase.u32[2], wbase.u32[3]);
    // vinsertw source: word element 1 of VRB in BE numbering, i.e. the LOW
    // 32 bits of dw0 (= the low half of u64[1] on LE).
    // NOTE(review): the pattern below puts 0xFFFFFFFF in the HIGH 32 bits of
    // dw0, so the word that actually gets inserted is 0x00000000. The value is
    // kept so the output stays comparable with earlier runs; use
    // 0x00000000FFFFFFFFULL to insert the 0xFFFFFFFF marker instead.
    v128 wsrc;
    wsrc.u64[0] = 0;
    wsrc.u64[1] = 0xFFFFFFFF00000000ULL;  // 0xFFFFFFFF in the high word of dw0
    printf("wsrc: elem[0]=0x%016lx  elem[1]=0x%016lx\n", wsrc.u64[0], wsrc.u64[1]);
    for (int uim = 0; uim <= 12; uim += 4) {
        v128 result = wbase;
        switch (uim) {
            case 0: __asm__ volatile("vinsertw %0, %1, 0" : "+v"(result.v) : "v"(wsrc.v)); break;
            case 4: __asm__ volatile("vinsertw %0, %1, 4" : "+v"(result.v) : "v"(wsrc.v)); break;
            case 8: __asm__ volatile("vinsertw %0, %1, 8" : "+v"(result.v) : "v"(wsrc.v)); break;
            case 12: __asm__ volatile("vinsertw %0, %1, 12" : "+v"(result.v) : "v"(wsrc.v)); break;
            default: break;  // unreachable: the loop only produces 0, 4, 8, 12
        }
        printf("vinsertw UIM=%2d: w[0]=0x%08x w[1]=0x%08x w[2]=0x%08x w[3]=0x%08x\n",
               uim, result.u32[0], result.u32[1], result.u32[2], result.u32[3]);
    }
    printf("\n=== vextractuw UIM behavior in LE mode ===\n");
    // Extract from wbase
    for (int uim = 0; uim <= 12; uim += 4) {
        v128 result = {{0}};
        switch (uim) {
            case 0: __asm__ volatile("vextractuw %0, %1, 0" : "=v"(result.v) : "v"(wbase.v)); break;
            case 4: __asm__ volatile("vextractuw %0, %1, 4" : "=v"(result.v) : "v"(wbase.v)); break;
            case 8: __asm__ volatile("vextractuw %0, %1, 8" : "=v"(result.v) : "v"(wbase.v)); break;
            case 12: __asm__ volatile("vextractuw %0, %1, 12" : "=v"(result.v) : "v"(wbase.v)); break;
            default: break;  // unreachable: the loop only produces 0, 4, 8, 12
        }
        printf("vextractuw UIM=%2d: elem[0]=0x%016lx  elem[1]=0x%016lx\n",
               uim, result.u64[0], result.u64[1]);
    }
    printf("\n=== vinserth UIM behavior in LE mode ===\n");
    // Both 64-bit lanes are assigned in full, so no separate clearing pass is
    // needed (and none may read the still-uninitialised union).
    v128 hbase;
    hbase.u64[0] = 0x3333333322222222ULL;  // hw: 2222 2222 3333 3333
    hbase.u64[1] = 0x7777777766666666ULL;  // hw: 6666 6666 7777 7777
    printf("hbase: ");
    print_halfwords(hbase);
    printf("\n");
    // vinserth source: halfword element 3 of VRB in BE numbering, i.e. the LOW
    // 16 bits of dw0 (= the low 16 bits of u64[1] on LE).
    // NOTE(review): the pattern below puts 0xFFFF in bits 32..47 of u64[1],
    // not in its low 16 bits, so the halfword actually inserted is 0x0000.
    // Kept so the output stays comparable with earlier runs; use
    // 0x000000000000FFFFULL to insert the 0xFFFF marker instead.
    v128 hsrc;
    hsrc.u64[0] = 0;
    hsrc.u64[1] = 0x0000FFFF00000000ULL;
    for (int uim = 0; uim <= 14; uim += 2) {
        v128 result = hbase;
        switch (uim) {
            case 0: __asm__ volatile("vinserth %0, %1, 0" : "+v"(result.v) : "v"(hsrc.v)); break;
            case 2: __asm__ volatile("vinserth %0, %1, 2" : "+v"(result.v) : "v"(hsrc.v)); break;
            case 4: __asm__ volatile("vinserth %0, %1, 4" : "+v"(result.v) : "v"(hsrc.v)); break;
            case 6: __asm__ volatile("vinserth %0, %1, 6" : "+v"(result.v) : "v"(hsrc.v)); break;
            case 8: __asm__ volatile("vinserth %0, %1, 8" : "+v"(result.v) : "v"(hsrc.v)); break;
            case 10: __asm__ volatile("vinserth %0, %1, 10" : "+v"(result.v) : "v"(hsrc.v)); break;
            case 12: __asm__ volatile("vinserth %0, %1, 12" : "+v"(result.v) : "v"(hsrc.v)); break;
            case 14: __asm__ volatile("vinserth %0, %1, 14" : "+v"(result.v) : "v"(hsrc.v)); break;
            default: break;  // unreachable: the loop only produces even values 0..14
        }
        printf("vinserth UIM=%2d: ", uim);
        print_halfwords(result);
        printf("\n");
    }
    return 0;
}
$ gcc -O0 -maltivec -mvsx -mcpu=power9 -o /tmp/test_vinsertd /tmp/test_vinsertd.c && /tmp/test_vinsertd

=== vinsertd UIM behavior in LE mode ===
base: elem[0](low)=0xaaaaaaaaaaaaaaaa  elem[1](high)=0xbbbbbbbbbbbbbbbb
src (1111 in elem[1], DEAD in elem[0]): elem[0](low)=0xdeaddeaddeaddead  elem[1](high)=0x1111111111111111
--- Testing vinsertd with different UIM values ---
vinsertd UIM=0: elem[0](low)=0xaaaaaaaaaaaaaaaa  elem[1](high)=0x1111111111111111
vinsertd UIM=1: elem[0](low)=0x11aaaaaaaaaaaaaa  elem[1](high)=0xbb11111111111111
vinsertd UIM=8: elem[0](low)=0x1111111111111111  elem[1](high)=0xbbbbbbbbbbbbbbbb
=== vextractd UIM behavior in LE mode ===
vextractd UIM=0 from base: elem[0](low)=0x0000000000000000  elem[1](high)=0xbbbbbbbbbbbbbbbb
vextractd UIM=1 from base: elem[0](low)=0x0000000000000000  elem[1](high)=0xbbbbbbbbbbbbbbaa
vextractd UIM=8 from base: elem[0](low)=0x0000000000000000  elem[1](high)=0xaaaaaaaaaaaaaaaa
=== MTVSRDD / MFVSRD / MFVSRLD reference ===
MTVSRDD(RA=0xAAAA, RB=0xBBBB):
  result: elem[0](low)=0xbbbbbbbbbbbbbbbb  elem[1](high)=0xaaaaaaaaaaaaaaaa
From base (elem[0]=0xAAAA, elem[1]=0xBBBB):
  MFVSRD  = 0xbbbbbbbbbbbbbbbb
  MFVSRLD = 0xaaaaaaaaaaaaaaaa
=== vinsertw UIM behavior in LE mode ===
wbase: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x22222222 w[3]=0x33333333
wsrc: elem[0]=0x0000000000000000  elem[1]=0xffffffff00000000
vinsertw UIM= 0: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x22222222 w[3]=0x00000000
vinsertw UIM= 4: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x00000000 w[3]=0x33333333
vinsertw UIM= 8: w[0]=0x00000000 w[1]=0x00000000 w[2]=0x22222222 w[3]=0x33333333
vinsertw UIM=12: w[0]=0x00000000 w[1]=0x11111111 w[2]=0x22222222 w[3]=0x33333333
=== vextractuw UIM behavior in LE mode ===
vextractuw UIM= 0: elem[0]=0x0000000000000000  elem[1]=0x0000000033333333
vextractuw UIM= 4: elem[0]=0x0000000000000000  elem[1]=0x0000000022222222
vextractuw UIM= 8: elem[0]=0x0000000000000000  elem[1]=0x0000000011111111
vextractuw UIM=12: elem[0]=0x0000000000000000  elem[1]=0x0000000000000000
=== vinserth UIM behavior in LE mode ===
hbase: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777 
vinserth UIM= 0: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x0000 
vinserth UIM= 2: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x0000 hw[7]=0x7777 
vinserth UIM= 4: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x0000 hw[6]=0x7777 hw[7]=0x7777 
vinserth UIM= 6: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x0000 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777 
vinserth UIM= 8: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x0000 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777 
vinserth UIM=10: hw[0]=0x2222 hw[1]=0x2222 hw[2]=0x0000 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777 
vinserth UIM=12: hw[0]=0x2222 hw[1]=0x0000 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777 
vinserth UIM=14: hw[0]=0x0000 hw[1]=0x2222 hw[2]=0x3333 hw[3]=0x3333 hw[4]=0x6666 hw[5]=0x6666 hw[6]=0x7777 hw[7]=0x7777

test_vextract2.c:

#include <stdio.h>
#include <stdint.h>
#include <altivec.h>
// 128-bit scratch value with several overlapping views of the same 16 bytes.
// The vector member is what gets bound to the inline-asm "v" operands; the
// integer arrays are used to fill in inputs and to read results lane by lane.
// Array index 0 is always the lowest-addressed lane of each view.
typedef union {
    vector unsigned char v;   // operand form handed to the inline asm
    uint64_t u64[2];          // two 64-bit lanes
    uint32_t u32[4];          // four 32-bit lanes ("w[0]".."w[3]" in the output)
    uint16_t u16[8];          // eight 16-bit lanes ("hw[0]".."hw[7]" in the output)
} v128;
int main() {
    printf("=== vextractuh UIM behavior in LE mode ===\n");
    v128 hbase;
    hbase.u16[0] = 0x0000;
    hbase.u16[1] = 0x1111;
    hbase.u16[2] = 0x2222;
    hbase.u16[3] = 0x3333;
    hbase.u16[4] = 0x4444;
    hbase.u16[5] = 0x5555;
    hbase.u16[6] = 0x6666;
    hbase.u16[7] = 0x7777;
    printf("hbase: ");
    for (int i = 0; i < 8; i++) printf("hw[%d]=0x%04x ", i, hbase.u16[i]);
    printf("\n");
    for (int uim = 0; uim <= 14; uim += 2) {
        v128 result = {{0}};
        switch(uim) {
            case 0: __asm__ volatile("vextractuh %0, %1, 0" : "=v"(result.v) : "v"(hbase.v)); break;
            case 2: __asm__ volatile("vextractuh %0, %1, 2" : "=v"(result.v) : "v"(hbase.v)); break;
            case 4: __asm__ volatile("vextractuh %0, %1, 4" : "=v"(result.v) : "v"(hbase.v)); break;
            case 6: __asm__ volatile("vextractuh %0, %1, 6" : "=v"(result.v) : "v"(hbase.v)); break;
            case 8: __asm__ volatile("vextractuh %0, %1, 8" : "=v"(result.v) : "v"(hbase.v)); break;
            case 10: __asm__ volatile("vextractuh %0, %1, 10" : "=v"(result.v) : "v"(hbase.v)); break;
            case 12: __asm__ volatile("vextractuh %0, %1, 12" : "=v"(result.v) : "v"(hbase.v)); break;
            case 14: __asm__ volatile("vextractuh %0, %1, 14" : "=v"(result.v) : "v"(hbase.v)); break;
        }
        printf("vextractuh UIM=%2d: result dw1(high)=0x%016lx\n", uim, result.u64[1]);
    }
    printf("\n=== Summary: UIM to x86 element mapping ===\n");
    printf("vinsertd/vextractd:\n");
    printf("  UIM=0 → LE elem[1] (x86 high qword)\n");
    printf("  UIM=8 → LE elem[0] (x86 low qword)\n");
    printf("vinsertw/vextractuw:\n");
    printf("  UIM=0  → LE w[3] (x86 word 3, highest)\n");
    printf("  UIM=4  → LE w[2] (x86 word 2)\n");
    printf("  UIM=8  → LE w[1] (x86 word 1)\n");
    printf("  UIM=12 → LE w[0] (x86 word 0, lowest)\n");
    printf("\n=== What the vinsertw source actually extracts ===\n");
    // The ISA says vinsertw takes "word 1 of VRB" in BE order
    // Let's put different values in each word position and see what gets inserted
    v128 wsrc_test;
    wsrc_test.u32[0] = 0xAA000000;
    wsrc_test.u32[1] = 0xBB111111;
    wsrc_test.u32[2] = 0xCC222222;
    wsrc_test.u32[3] = 0xDD333333;
    printf("wsrc_test: w[0]=0x%08x w[1]=0x%08x w[2]=0x%08x w[3]=0x%08x\n",
           wsrc_test.u32[0], wsrc_test.u32[1], wsrc_test.u32[2], wsrc_test.u32[3]);
    v128 zero;
    zero.u64[0] = 0; zero.u64[1] = 0;
    {
        v128 result = zero;
        __asm__ volatile("vinsertw %0, %1, 12" : "+v"(result.v) : "v"(wsrc_test.v));
        printf("vinsertw UIM=12 into zero: w[0]=0x%08x (shows which word of src is used)\n", result.u32[0]);
    }
    printf("\n=== What vinsertd source actually extracts ===\n");
    v128 dsrc_test;
    dsrc_test.u64[0] = 0xAAAAAAAAAAAAAAAAULL;  // LE low = BE dw1
    dsrc_test.u64[1] = 0xBBBBBBBBBBBBBBBBULL;  // LE high = BE dw0
    printf("dsrc_test: elem[0]=0xAAAA  elem[1]=0xBBBB\n");
    {
        v128 result = zero;
        __asm__ volatile("vinsertd %0, %1, 8" : "+v"(result.v) : "v"(dsrc_test.v));
        printf("vinsertd UIM=8 into zero: elem[0]=0x%016lx (shows which dw of src)\n", result.u64[0]);
    }
    return 0;
}
$ gcc -O0 -maltivec -mvsx -mcpu=power9 -o /tmp/test_vextract2 /tmp/test_vextract2.c && /tmp/test_vextract2

=== vextractuh UIM behavior in LE mode ===
hbase: hw[0]=0x0000 hw[1]=0x1111 hw[2]=0x2222 hw[3]=0x3333 hw[4]=0x4444 hw[5]=0x5555 hw[6]=0x6666 hw[7]=0x7777 
vextractuh UIM= 0: result dw1(high)=0x0000000000007777
vextractuh UIM= 2: result dw1(high)=0x0000000000006666
vextractuh UIM= 4: result dw1(high)=0x0000000000005555
vextractuh UIM= 6: result dw1(high)=0x0000000000004444
vextractuh UIM= 8: result dw1(high)=0x0000000000003333
vextractuh UIM=10: result dw1(high)=0x0000000000002222
vextractuh UIM=12: result dw1(high)=0x0000000000001111
vextractuh UIM=14: result dw1(high)=0x0000000000000000
=== Summary: UIM to x86 element mapping ===
vinsertd/vextractd:
  UIM=0 → LE elem[1] (x86 high qword)
  UIM=8 → LE elem[0] (x86 low qword)
vinsertw/vextractuw:
  UIM=0  → LE w[3] (x86 word 3, highest)
  UIM=4  → LE w[2] (x86 word 2)
  UIM=8  → LE w[1] (x86 word 1)
  UIM=12 → LE w[0] (x86 word 0, lowest)
=== What the vinsertw source actually extracts ===
wsrc_test: w[0]=0xaa000000 w[1]=0xbb111111 w[2]=0xcc222222 w[3]=0xdd333333
vinsertw UIM=12 into zero: w[0]=0xcc222222 (shows which word of src is used)
=== What vinsertd source actually extracts ===
dsrc_test: elem[0]=0xAAAA  elem[1]=0xBBBB
vinsertd UIM=8 into zero: elem[0]=0xbbbbbbbbbbbbbbbb (shows which dw of src)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment