Created
April 11, 2020 17:11
-
-
Save beezly/4fe7a086ad1a1a7b02b2616a8ce08c9b to your computer and use it in GitHub Desktop.
MMX/SSE demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Sample output from a run on a VM:
Allocating 134217728 bytes of RAM and writing a pattern to it.
allocate_ram: Took 15770 cycles
before first x86_add: Address 0xafd54010: 00000000000000000000000000000000
after first x86_add: Address 0xafd54010: 02020202020202020202020202020202
after last x86_add: Address 0xafd54010: cacacacacacacacacacacacacacacaca
x86_add: Took 13213002422 cycles
before first mmx_add: Address 0xafd54010: cacacacacacacacacacacacacacacaca
after first mmx add: Address 0xafd54010: cccccccccccccccccccccccccccccccc
after last mmx add: Address 0xafd54010: 94949494949494949494949494949494
mmx_add: Took 3867184649 cycles
mmx_add is 3.42 quicker than x86_add
before first sse_add: Address 0xafd54010: 94949494949494949494949494949494
after first sse_add: Address 0xafd54010: 96969696969696969696969696969696
after last sse_add: Address 0xafd54010: 5e5e5e5e5e5e5e5e5e5e5e5e5e5e5e5e
sse_add: Took 3817400474 cycles
sse_add is 3.46 quicker than x86_add
*/
#include <stdio.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <x86intrin.h> | |
// Compile with gcc -o demo -mmmx -msse4 ./demo.c

/* Size in bytes of the benchmark buffer (128 MiB). Parenthesised so the
 * macro behaves as a single value inside larger expressions. */
#define RAM_SIZE (16 * 1024 * 1024 * 8)

/* Number of times repeat() runs its statement. */
#define CYCLE_COUNT 100

/* Print "<task>: Took <end - start> cycles". The difference is cast to
 * unsigned long long so it matches %llu regardless of how uint64_t is
 * defined on the target. */
#define timing(task, start, end) \
    printf("%s: Took %llu cycles\n", (task), (unsigned long long)((end) - (start)))

/* Execute statement x CYCLE_COUNT times. */
#define repeat(x) for (int i = 0; i < CYCLE_COUNT; i++) { x; }
/*
 * Print one diagnostic line of the form
 *     "<msg>: Address <pointer>: <hex bytes>\n"
 * to stdout.
 *
 * msg    - label printed at the start of the line.
 * ram    - buffer to dump; its address is printed with %p.
 * length - number of bytes of `ram` to print, each as two hex digits.
 */
void display(char *msg, char *ram, int length) {
    /* Read the buffer as unsigned bytes; the explicit cast avoids an
     * implicit conversion between differently-signed char pointers. */
    const unsigned char *bytes = (const unsigned char *)ram;

    /* %p requires a void pointer argument. */
    printf("%s: Address %p: ", msg, (void *)ram);
    for (int i = 0; i < length; i++) {
        printf("%02hhx", bytes[i]);
    }
    printf("\n");
}
/*
 * Allocate `size` zero-initialised bytes with calloc().
 *
 * Returns the new buffer, or NULL if calloc() fails. The caller owns the
 * buffer and releases it with free().
 */
char *allocate_ram(size_t size) {
    return calloc(size, sizeof(char));
}
void x86_add(uint8_t v, char *ram) { | |
asm __volatile( | |
"mov $0, %%eax \n" | |
"1: \n" | |
"addb %2, (%0,%%eax) \n" | |
"inc %%eax \n" | |
"cmp %1, %%eax \n" | |
"jne 1b" | |
: | |
: "r" (ram), "i" (RAM_SIZE), "r" (v) | |
: "memory", "%eax" | |
); | |
} | |
void mmx_add(uint8_t v, char *ram) { | |
uint64_t vmmx; | |
vmmx=v; | |
vmmx=(vmmx << 8) + v; | |
vmmx=(vmmx << 8) + v; | |
vmmx=(vmmx << 8) + v; | |
vmmx=(vmmx << 8) + v; | |
vmmx=(vmmx << 8) + v; | |
vmmx=(vmmx << 8) + v; | |
vmmx=(vmmx << 8) + v; | |
asm volatile ( | |
"mov %0, %%esi \n" | |
"addl %1, %%esi \n" | |
"1: \n" | |
"movq (%2), %%mm0 \n" | |
"paddb (%0),%%mm0 \n" | |
"movq %%mm0, (%0) \n" | |
"add $8, %0 \n" | |
"cmp %%esi, %0 \n" | |
"jne 1b \n" | |
"emms \n" | |
: | |
: "r" (ram), "i" (RAM_SIZE), "r" (&vmmx) | |
: "memory", "%esi", "mm0" | |
); | |
} | |
void sse_add(uint8_t v, char *ram) { | |
asm __volatile( | |
"mov %1, %%eax \n" | |
"add %0, %%eax \n" // find our end point | |
"mov %2, %%bh \n" // fill xmm1 with our add value | |
"mov %2, %%bl \n" | |
"pinsrw $0, %%ebx, %%xmm1 \n" | |
"pinsrw $1, %%ebx, %%xmm1 \n" | |
"pinsrw $2, %%ebx, %%xmm1 \n" | |
"pinsrw $3, %%ebx, %%xmm1 \n" | |
"pinsrw $4, %%ebx, %%xmm1 \n" | |
"pinsrw $5, %%ebx, %%xmm1 \n" | |
"pinsrw $6, %%ebx, %%xmm1 \n" | |
"pinsrw $7, %%ebx, %%xmm1 \n" | |
"2: \n" | |
"vpaddb (%0), %%xmm1, %%xmm0 \n" // add xm1 to data @%2 and write into xm0 | |
"movq %%xmm0, (%0) \n" | |
"movhlps %%xmm0, %%xmm0 \n" //shift top half of xmm0 into bottom half. | |
"add $8, %0 \n" // skip 8 bytes | |
"movq %%xmm0, (%0) \n" | |
"add $8, %0 \n" // skip 8 bytes | |
"cmp %0, %%eax \n" | |
"jne 2b" | |
: | |
: "r" (ram), "i" (RAM_SIZE), "r" (v) | |
: "memory", "%eax", "%ebx", "%xmm1", "%xmm0" | |
); | |
} | |
int main(int argc, char *argv) { | |
char *ram; | |
uint64_t start_time, end_time; | |
printf("Allocating %i bytes of RAM and writing a pattern to it.\n", RAM_SIZE); | |
start_time=__rdtsc(); | |
ram=allocate_ram(RAM_SIZE); | |
end_time=__rdtsc(); | |
timing("allocate_ram",start_time,end_time); | |
display("before first x86_add", ram,16); | |
start_time=__rdtsc(); | |
x86_add(2,ram); | |
display(" after first x86_add",ram,16); | |
repeat(x86_add(2,ram)); | |
end_time=__rdtsc(); | |
display(" after last x86_add",ram,16); | |
timing("x86_add",start_time,end_time); | |
unsigned long long x86_add_time = end_time-start_time; | |
display("before first mmx_add",ram,16); | |
start_time=__rdtsc(); | |
mmx_add(2,ram); | |
display(" after first mmx add",ram,16); | |
repeat(mmx_add(2,ram)); | |
end_time=__rdtsc(); | |
display(" after last mmx add",ram,16); | |
timing("mmx_add",start_time,end_time); | |
unsigned long long mmx_add_time = end_time-start_time; | |
printf("mmx_add is %.2f quicker than x86_add\n", (float) x86_add_time / (float) mmx_add_time); | |
//sse | |
display("before first sse_add",ram,16); | |
start_time=__rdtsc(); | |
sse_add(2,ram); | |
display(" after first sse_add",ram,16); | |
repeat(sse_add(2,ram)); | |
end_time=__rdtsc(); | |
display(" after last sse_add",ram,16); | |
timing("sse_add",start_time,end_time); | |
unsigned long long sse_add_time = end_time-start_time; | |
printf("sse_add is %.2f quicker than x86_add\n", (float) x86_add_time / (float) sse_add_time); | |
exit(0); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment