-
-
Save CryptoManiac/7dd1e7076b99e1018b398cb5d79f5d22 to your computer and use it in GitHub Desktop.
#include <sys/timeb.h> | |
#include <sys/types.h> | |
#ifdef _WIN32 | |
# include <windows.h> | |
#else | |
# include <sys/time.h> | |
# include <unistd.h> | |
#endif | |
#include <iostream> | |
#include <iomanip> | |
#include <cstdint> | |
// Qualifier used for the threaded-code slot types of the direct-threaded
// interpreter. It expands to `volatile` everywhere except on clang and the
// Intel compiler, where it expands to nothing.
// NOTE(review): `__volatile__` is a reserved identifier (GCC also treats it as
// an alternate keyword); a differently named macro would be safer, but every
// user in this file would have to be renamed together.
#if !defined(__clang__) && !defined(__INTEL_COMPILER)
# define __volatile__ volatile
#else
# define __volatile__
#endif

using namespace std;
/**
 * Returns the current time of day in seconds as a double.
 *
 * The value is first reduced to whole milliseconds (sub-millisecond digits
 * are truncated) and then divided by 1000.0, so the resolution is 1 ms.
 * Windows builds read the clock with _ftime64_s(), all other builds with
 * gettimeofday(). The status returned by either call is not checked.
 */
double time_s()
{
    int64_t millis;
#if defined(_WIN32)
    struct __timeb64 now;
    _ftime64_s(&now);
    millis = static_cast<int64_t>(now.time) * 1000 + now.millitm;
#else
    struct timeval now;
    gettimeofday(&now, nullptr);
    // tv_usec holds microseconds; keep only the whole milliseconds.
    millis = static_cast<int64_t>(now.tv_sec) * 1000 + now.tv_usec / 1000;
#endif
    return millis / 1000.0;
}
// Opcode numbers of the toy virtual machine. They double as indices into the
// four-entry handler tables filled in by the interpreters.
#define HALT      0  // stop the interpreter
#define SET_X     1  // load the following program word into x
#define DEC_X     2  // decrement x
#define JUMP_X_NZ 3  // jump to the address in the following word if x != 0
// Dispatch primitives for the threaded interpreters.
//
//   STORE(index, label)  writes the address of `label` into data[index];
//                        expects a local pointer named `data`.
//   JUMP()               jumps to the address held in *ip and advances `ip`;
//                        expects a local pointer named `ip`.
//   JUMP_INDIRECT()      reads an opcode number from *code, advances `code`
//                        and jumps to data[opcode]; expects locals `data`
//                        and `code`.
//
// 32-bit MSVC (when it is not the Intel compiler front end) gets an
// inline-assembly implementation; 64-bit MSVC is rejected. Every other
// compiler uses the GNU "labels as values" extension (&&label / goto *ptr).
#if defined(_MSC_VER) && !defined(__INTEL_COMPILER_BUILD_DATE)
# ifdef _WIN64
#  error "MSVC64 isn't supported, please use clang or intel c++ compiler"
# else
#  define STORE(index,label) __asm lea eax, label __asm mov edx, data\
    __asm mov [edx][index * TYPE data],eax
#  define JUMP() { void* addr = *(ip++); __asm jmp addr }
#  define JUMP_INDIRECT() { void* addr = data[*(code++)]; __asm jmp addr }
# endif
#else
# define STORE(index,label) data[index] = &&label
# define JUMP() goto **(ip++)
# define JUMP_INDIRECT() goto *data[*(code++)]
#endif
// Values of the `operation` argument of execute_direct() and
// execute_directcall().
#define GET_OPCODES 0  // fill the caller's array with the handler addresses
#define RUN         1  // interpret the program passed in
void execute_direct(__volatile__ void** data, int operation) | |
{ | |
int x = 0; | |
__volatile__ void** ip = data; | |
if (operation == GET_OPCODES) goto get_opcodes; | |
JUMP(); | |
op_halt: | |
return; | |
op_set_x: | |
x = *((int*)(ip++)); | |
JUMP(); | |
op_dec_x: | |
--x; | |
JUMP(); | |
op_jump_x_nz: | |
{ | |
auto target = *(ip++); | |
if (x) { | |
ip = (__volatile__ void**)target; | |
} | |
JUMP(); | |
} | |
get_opcodes: | |
STORE(HALT, op_halt); | |
STORE(SET_X, op_set_x); | |
STORE(DEC_X, op_dec_x); | |
STORE(JUMP_X_NZ, op_jump_x_nz); | |
} | |
// Signature of a direct-call handler: it receives the instruction pointer by
// reference (and moves it), the register x, and the flag that keeps the
// interpreter loop running.
using opcode = void (*)(void**& ip, int* x, bool* stopflag);
// SET_X handler (direct-call variant): copies the operand word that follows
// the instruction into *x, then steps over instruction and operand.
void __op_set_x(void**& ip, int* x, bool* stopflag) {
    (void)stopflag;  // not used by this handler
    *x = static_cast<int>(reinterpret_cast<size_t>(ip[1]));
    ip += 2;
}
// DEC_X handler (direct-call variant): decrements *x and moves on to the
// next instruction word.
void __op_dec_x(void**& ip, int* x, bool* stopflag) {
    (void)stopflag;  // not used by this handler
    *x -= 1;
    ++ip;
}
// JUMP_X_NZ handler (direct-call variant): when *x is non-zero, continues at
// the address stored in the operand word; otherwise steps over instruction
// and operand.
void __op_jump_x_nz(void**& ip, int* x, bool* stopflag) {
    (void)stopflag;  // not used by this handler
    ip = (*x != 0) ? static_cast<void**>(ip[1]) : ip + 2;
}
// HALT handler (direct-call variant): clears the flag the interpreter loop
// tests, which ends the run. The instruction pointer and x are left alone.
void __op_halt(void**& ip, int* x, bool* stopflag) {
    (void)ip;
    (void)x;
    *stopflag = false;
}
void execute_directcall(void** data, int operation) { | |
int x = 0; | |
bool stopflag = true; | |
auto ip = data; | |
if (operation == GET_OPCODES) goto get_opcodes; | |
for (; stopflag ;) ((opcode)*ip)(ip, &x, &stopflag); | |
get_opcodes: | |
data[HALT] = (void*)&__op_halt; | |
data[SET_X] = (void*)&__op_set_x; | |
data[DEC_X] = (void*)&__op_dec_x; | |
data[JUMP_X_NZ] = (void*)&__op_jump_x_nz; | |
} | |
// Signature of an indirect-call handler: same contract as the direct-call
// handlers, but the program consists of size_t words (opcode numbers and
// operands) instead of pointers.
using iopcode = void (*)(size_t*& ip, int* x, bool* stopflag);
// SET_X handler (indirect-call variant): copies the operand word that follows
// the opcode into *x, then steps over opcode and operand.
void __iop_set_x(size_t*& ip, int* x, bool* stopflag) {
    (void)stopflag;  // not used by this handler
    *x = static_cast<int>(ip[1]);
    ip += 2;
}
// DEC_X handler (indirect-call variant): decrements *x and moves on to the
// next program word.
void __iop_dec_x(size_t*& ip, int* x, bool* stopflag) {
    (void)stopflag;  // not used by this handler
    *x -= 1;
    ++ip;
}
// JUMP_X_NZ handler (indirect-call variant): when *x is non-zero, continues
// at the address stored (as an integer) in the operand word; otherwise steps
// over opcode and operand.
void __iop_jump_x_nz(size_t*& ip, int* x, bool* stopflag) {
    (void)stopflag;  // not used by this handler
    ip = (*x != 0) ? reinterpret_cast<size_t*>(ip[1]) : ip + 2;
}
// HALT handler (indirect-call variant): clears the flag the interpreter loop
// tests, which ends the run. The instruction pointer and x are left alone.
void __iop_halt(size_t*& ip, int* x, bool* stopflag) {
    (void)ip;
    (void)x;
    *stopflag = false;
}
void execute_indirectcall(size_t* code) { | |
void* buf[4]; | |
buf[HALT] = (void*)&__iop_halt; | |
buf[SET_X] = (void*)&__iop_set_x; | |
buf[DEC_X] = (void*)&__iop_dec_x; | |
buf[JUMP_X_NZ] = (void*)&__iop_jump_x_nz; | |
int x = 0; | |
bool stopflag = true; | |
for (; stopflag ;) ((iopcode)buf[*code])(code, &x, &stopflag); | |
} | |
void execute_indirect(size_t* code) | |
{ | |
volatile int x = 0; | |
void* buf[4]; | |
void** data = &buf[0]; | |
STORE(HALT, op_halt); | |
STORE(SET_X, op_set_x); | |
STORE(DEC_X, op_dec_x); | |
STORE(JUMP_X_NZ, op_jump_x_nz); | |
JUMP_INDIRECT(); | |
op_halt: | |
return; | |
op_set_x: | |
x = *((int*)(code++)); | |
JUMP_INDIRECT(); | |
op_dec_x: | |
--x; | |
JUMP_INDIRECT(); | |
op_jump_x_nz: | |
{ | |
auto target = *(code++); | |
if (x) { | |
code = (size_t*)target; | |
} | |
JUMP_INDIRECT(); | |
} | |
} | |
void execute_switch(size_t* data) { | |
volatile int x = 0; | |
auto ip = data; | |
while (true) { | |
auto op = *data; | |
switch (op) { | |
case HALT: | |
return; | |
case SET_X: | |
x = (int)*(data + 1); | |
break; | |
case DEC_X: | |
x--; | |
break; | |
case JUMP_X_NZ: | |
if (x) { | |
data = (size_t*)*(data + 1); | |
continue; | |
} | |
} | |
data++; | |
} | |
} | |
// Times the direct-threaded interpreter on a countdown from 2,000,000,000
// and prints the elapsed seconds.
void test_direct() {
    // Ask the interpreter for the addresses of its handler labels.
    __volatile__ void* opcodes[4];
    execute_direct(opcodes, GET_OPCODES);

    __volatile__ void* program[6] = {
        opcodes[SET_X],      (void*)2000000000,  // x = 2000000000
        opcodes[DEC_X],                          // loop: --x
        opcodes[JUMP_X_NZ],  &program[2],        // back to DEC_X while x != 0
        opcodes[HALT]
    };

    const double start_time = time_s();
    execute_direct(program, RUN);
    const double end_time = time_s();
    cout << "Direct threaded done in " << (end_time - start_time) << " seconds." << endl;
}
// Times the direct-call interpreter on a countdown from 2,000,000,000 and
// prints the elapsed seconds.
void test_directcall() {
    // Ask the interpreter for the addresses of its handler functions.
    void* opcodes[4];
    execute_directcall(opcodes, GET_OPCODES);

    void* program[6] = {
        opcodes[SET_X],      (void*)2000000000,  // x = 2000000000
        opcodes[DEC_X],                          // loop: --x
        opcodes[JUMP_X_NZ],  &program[2],        // back to DEC_X while x != 0
        opcodes[HALT]
    };

    const double start_time = time_s();
    execute_directcall(program, RUN);
    const double end_time = time_s();
    cout << "Direct call threaded done in " << (end_time - start_time) << " seconds." << endl;
}
// Times the indirect-threaded interpreter on a countdown from 2,000,000,000
// and prints the elapsed seconds.
void test_indirect() {
    size_t program[6] = {
        SET_X,      2000000000,            // x = 2000000000
        DEC_X,                             // loop: --x
        JUMP_X_NZ,  (size_t)&program[2],   // back to DEC_X while x != 0
        HALT
    };

    const double start_time = time_s();
    execute_indirect(program);
    const double end_time = time_s();
    cout << "Indirect threaded done in " << (end_time - start_time) << " seconds." << endl;
}
// Times the indirect-call interpreter on a countdown from 2,000,000,000 and
// prints the elapsed seconds.
void test_indirectcall() {
    size_t program[6] = {
        SET_X,      2000000000,            // x = 2000000000
        DEC_X,                             // loop: --x
        JUMP_X_NZ,  (size_t)&program[2],   // back to DEC_X while x != 0
        HALT
    };

    const double start_time = time_s();
    execute_indirectcall(program);
    const double end_time = time_s();
    cout << "Indirect call threaded done in " << (end_time - start_time) << " seconds." << endl;
}
// Times the switch-based interpreter on a countdown from 2,000,000,000 and
// prints the elapsed seconds.
void test_switch() {
    size_t program[6] = {
        SET_X,      2000000000,            // x = 2000000000
        DEC_X,                             // loop: --x
        JUMP_X_NZ,  (size_t)&program[2],   // back to DEC_X while x != 0
        HALT
    };

    const double start_time = time_s();
    execute_switch(program);
    const double end_time = time_s();
    cout << "Switch done in " << (end_time - start_time) << " seconds." << endl;
}
// Runs the five dispatch benchmarks one after another; each prints its own
// timing line.
int main() {
    test_direct();
    test_directcall();
    test_indirect();
    test_indirectcall();
    test_switch();
    return 0;
}
$ /opt/intel/bin/icpc -std=c++11 dispatchtest.cxx -O3
$ ./a.out
Direct threaded done in 7.717 seconds.
Direct call threaded done in 15.674 seconds.
Indirect threaded done in 7.518 seconds.
Indirect call threaded done in 16.571 seconds.
Switch done in 6.055 seconds.
$ ./a.out
Direct threaded done in 7.724 seconds.
Direct call threaded done in 15.965 seconds.
Indirect threaded done in 7.409 seconds.
Indirect call threaded done in 16.209 seconds.
Switch done in 5.977 seconds.
$ ./a.out
Direct threaded done in 7.524 seconds.
Direct call threaded done in 15.41 seconds.
Indirect threaded done in 7.333 seconds.
Indirect call threaded done in 16.744 seconds.
Switch done in 6.314 seconds.
Direct threaded done in 0 seconds.
It looks like g++ with the -O3 flag simply removes the execution loop from the direct-threaded test code. Funnily enough, even the volatile modifier doesn't prevent this behaviour. I'd say -O3 isn't worth using here anyway, since its results are very similar to those obtained with the -O2 flag.
model name : ARMv7 Processor rev 0 (v7l)
BogoMIPS : 3394.86
$ g++ -O2 dispatchtest.cxx -std=c++11
$ ./a.out
Direct threaded done in 12.235 seconds.
Direct call threaded done in 62.327 seconds.
Indirect threaded done in 25.473 seconds.
Indirect call threaded done in 75.263 seconds.
Switch done in 56.447 seconds.
$ ./a.out
Direct threaded done in 12.313 seconds.
Direct call threaded done in 62.379 seconds.
Indirect threaded done in 25.487 seconds.
Indirect call threaded done in 75.353 seconds.
Switch done in 56.522 seconds.
$ ./a.out
Direct threaded done in 12.17 seconds.
Direct call threaded done in 62.329 seconds.
Indirect threaded done in 25.472 seconds.
Indirect call threaded done in 75.267 seconds.
Switch done in 56.45 seconds.
$ /opt/llvm-3.8.0/bin/clang++ -O2 dispatchtest.cxx -std=c++11
dispatchtest.cxx:74:5: error: passing 'volatile void *' to parameter of incompatible type 'const void *'
JUMP();
^~~~~~
dispatchtest.cxx:60:22: note: expanded from macro 'JUMP'
# define JUMP() goto **(ip++)
/opt/llvm-3.8.0/bin/clang++ -O2 dispatchtest.cxx -std=c++11
$ ./a.out
Direct threaded done in 15.064 seconds.
Direct call threaded done in 59.977 seconds.
Indirect threaded done in 27.338 seconds.
Indirect call threaded done in 70.56 seconds.
Switch done in 55.273 seconds.
$ ./a.out
Direct threaded done in 15.029 seconds.
Direct call threaded done in 59.977 seconds.
Indirect threaded done in 27.349 seconds.
Indirect call threaded done in 70.56 seconds.
Switch done in 55.273 seconds.
$ ./a.out
Direct threaded done in 15.083 seconds.
Direct call threaded done in 60.033 seconds.
Indirect threaded done in 27.377 seconds.
Indirect call threaded done in 70.608 seconds.
Switch done in 55.306 seconds.
We don't need volatile for clang; it doesn't like the conversion from volatile void* to const void*. I think it's fixed now.
$ icpc -std=c++11 -O3 -ipo dispatch.cxx
$ ./a.out
Direct threaded done in 3.748 seconds.
Direct call threaded done in 11.005 seconds.
Indirect threaded done in 4.951 seconds.
Indirect call threaded done in 11.581 seconds.
Switch done in 6.249 seconds.
$ icpc -std=c++11 -O3 -xW -ipo dispatch.cxx
$ ./a.out
Direct threaded done in 3.751 seconds.
Direct call threaded done in 11.107 seconds.
Indirect threaded done in 4.959 seconds.
Indirect call threaded done in 11.593 seconds.
Switch done in 6.254 seconds.
$ icpc -std=c++11 -O3 -xP -ipo dispatch.cxx
$ ./a.out
Direct threaded done in 3.749 seconds.
Direct call threaded done in 11.012 seconds.
Indirect threaded done in 4.95 seconds.
Indirect call threaded done in 11.011 seconds.
Switch done in 6.249 seconds.
$ icpc -std=c++11 -O3 -xT -ipo dispatch.cxx
$ ./a.out
Direct threaded done in 4.998 seconds.
Direct call threaded done in 10.974 seconds.
Indirect threaded done in 5.235 seconds.
Indirect call threaded done in 10.944 seconds.
Switch done in 6.25 seconds.
$ icpc -std=c++11 -O3 -xS -ipo dispatch.cxx
$ ./a.out
Direct threaded done in 4.998 seconds.
Direct call threaded done in 11.008 seconds.
Indirect threaded done in 5.235 seconds.
Indirect call threaded done in 11.022 seconds.
Switch done in 6.301 seconds.
$ icpc -std=c++11 -O3 -xSSE3_ATOM -ipo dispatch.cxx
$ ./a.out
Direct threaded done in 4.376 seconds.
Direct call threaded done in 12.005 seconds.
Indirect threaded done in 4.388 seconds.
Indirect call threaded done in 10.877 seconds.
Switch done in 9.374 seconds.
$ icpc -std=c++11 -O3 -xK -ipo dispatch.cxx
$ ./a.out
Direct threaded done in 3.749 seconds.
Direct call threaded done in 10.948 seconds.
Indirect threaded done in 4.957 seconds.
Indirect call threaded done in 11.372 seconds.
Switch done in 6.252 seconds.
$ x86_64-w64-mingw32-g++ -static ./dispatchtest.cxx -D_WIN32=1 -std=c++11 -O2
Xeon E5410 @2.33GHz
c:\Tmp>a.exe
Direct threaded done in 7.497 seconds.
Direct call threaded done in 16.963 seconds.
Indirect threaded done in 9.592 seconds.
Indirect call threaded done in 16.017 seconds.
Switch done in 11.215 seconds.
c:\Tmp>a.exe
Direct threaded done in 7.532 seconds.
Direct call threaded done in 17.008 seconds.
Indirect threaded done in 9.643 seconds.
Indirect call threaded done in 15.967 seconds.
Switch done in 11.35 seconds.
c:\Tmp>a.exe
Direct threaded done in 7.579 seconds.
Direct call threaded done in 17.243 seconds.
Indirect threaded done in 9.603 seconds.
Indirect call threaded done in 15.94 seconds.
Switch done in 11.281 seconds.
model name : Intel(R) Core(TM) i7-4702MQ CPU @ 2.20GHz
$ g++ -O3 dispatchtest.cxx -std=c++11
$ ./a.out
Direct threaded done in 0 seconds.
Direct call threaded done in 15.859 seconds.
Indirect threaded done in 7.594 seconds.
Indirect call threaded done in 16.924 seconds.
Switch done in 6.122 seconds.
$ ./a.out
Direct threaded done in 0 seconds.
Direct call threaded done in 16.033 seconds.
Indirect threaded done in 7.57 seconds.
Indirect call threaded done in 16.792 seconds.
Switch done in 6.296 seconds.
$ ./a.out
Direct threaded done in 0 seconds.
Direct call threaded done in 16.099 seconds.
Indirect threaded done in 7.649 seconds.
Indirect call threaded done in 16.983 seconds.
Switch done in 6.29 seconds.
$ clang++ -O3 dispatchtest.cxx -std=c++11
$ ./a.out
Direct threaded done in 5.891 seconds.
Direct call threaded done in 34.885 seconds.
Indirect threaded done in 6.762 seconds.
Indirect call threaded done in 39.831 seconds.
Switch done in 27.484 seconds.
$ ./a.out
Direct threaded done in 5.821 seconds.
Direct call threaded done in 35.662 seconds.
Indirect threaded done in 6.809 seconds.
Indirect call threaded done in 39.82 seconds.
Switch done in 27.662 seconds.
$ ./a.out
Direct threaded done in 5.931 seconds.
Direct call threaded done in 34.912 seconds.
Indirect threaded done in 6.637 seconds.
Indirect call threaded done in 38.887 seconds.
Switch done in 27.691 seconds.