The actual disassembly of the two is below for the sake of it existing somewhere, but basically the homequeue version looks like:
START:
lock()
result = deq(thing)
if result == 0:
goto FAIL
packets_processed++
getFingerprint()
FAIL:
unlock()
not_yet_done = checkAlarm()
if not_yet_done != 0:
goto START
pthread_exit()
but the lockfree version looks like:
START:
result = deq(thing)
if result != 0:
goto SUCCEED
not_yet_done = checkAlarm()
if not_yet_done != 0:
goto START
pthread_exit()
SUCCEED:
packets_processed++
getFingerprint()
goto START
This is worrying because, tl;dr (I don't know if you've covered this; we learned it in architecture), the processor has to guess in advance whether or not an `if` branch is going to be taken, and if it guesses wrong it hurts pretty badly --- it's like a cache miss (but less expensive, I think). You can read more here.
Basically it doesn't matter how many `if`s you have as long as they follow a regular pattern (e.g. TTTTTTT... or FFFFFFF... or TFTFTF..., etc. --- the actual details of what an i7 processor might be able to learn are super proprietary, so it's hard to give 100% accurate examples). I have to guess that the homequeue version probably has a more regular pattern to its branching, because locking/releasing will give time for the dispatcher to push a packet onto the queue. Also, maybe some atomic magic happens that forces writes from the dispatcher to be visible to the dequeuer more quickly than they otherwise would be.
So maybe the homequeue implementation only gets `NULL` from `deq` every 10th call, but the lockfree implementation gets `NULL`s scattered randomly every 2 or 3 calls. Now your branch predictor is sad and your code is slower. I remember Andrew Chien (Intel man) at one point saying that there's a difference statistically between branches like `if result == 0: goto FAIL` and `if result != 0: goto SUCCEED`, so the fact that gcc has flipped them is worrying.
But this is really nuts to suggest, so I don't know.
Lockfree:
void *work(void *void_args) {
402083: 53 push %rbx
402084: 48 89 fb mov %rdi,%rbx
402087: 48 83 ec 28 sub $0x28,%rsp
/* Waits, spins until */
workArgs *args = (workArgs*)void_args;
Alarm alarm = args->alarm;
40208b: 48 8b 47 08 mov 0x8(%rdi),%rax
40208f: 48 89 04 24 mov %rax,(%rsp)
402093: 48 8b 47 10 mov 0x10(%rdi),%rax
402097: 48 89 44 24 08 mov %rax,0x8(%rsp)
40209c: 48 8b 47 18 mov 0x18(%rdi),%rax
4020a0: 48 89 44 24 10 mov %rax,0x10(%rsp)
long packets_processed = 0;
long fingerprint = 0;
Packet_t *tmp;
while(checkAlarm(&alarm)) {
4020a5: eb 17 jmp 4020be <work+0x3e>
4020a7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
4020ae: 00 00
if ((tmp = deq(args->q))) {
4020b0: 48 8b 7b 20 mov 0x20(%rbx),%rdi
4020b4: e8 e7 fa ff ff callq 401ba0 <deq>
4020b9: 48 85 c0 test %rax,%rax
4020bc: 75 1a jne 4020d8 <work+0x58>
Alarm alarm = args->alarm;
long packets_processed = 0;
long fingerprint = 0;
Packet_t *tmp;
while(checkAlarm(&alarm)) {
4020be: 48 89 e7 mov %rsp,%rdi
4020c1: e8 2a 14 00 00 callq 4034f0 <checkAlarm>
4020c6: 85 c0 test %eax,%eax
4020c8: 75 e6 jne 4020b0 <work+0x30>
packets_processed ++;
} else {
}
}
pthread_exit((void*)packets_processed);
4020ca: 48 89 ef mov %rbp,%rdi
4020cd: e8 ae ea ff ff callq 400b80 <pthread_exit@plt>
4020d2: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
long fingerprint = 0;
Packet_t *tmp;
while(checkAlarm(&alarm)) {
if ((tmp = deq(args->q))) {
fingerprint += getFingerprint(tmp->iterations, tmp->seed);
4020d8: 48 8b 70 08 mov 0x8(%rax),%rsi
4020dc: 48 8b 38 mov (%rax),%rdi
packets_processed ++;
4020df: 48 83 c5 01 add $0x1,%rbp
long fingerprint = 0;
Packet_t *tmp;
while(checkAlarm(&alarm)) {
if ((tmp = deq(args->q))) {
fingerprint += getFingerprint(tmp->iterations, tmp->seed);
4020e3: e8 c8 15 00 00 callq 4036b0 <getFingerprint>
4020e8: eb d4 jmp 4020be <work+0x3e>
4020ea: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
VS home queue:
void *work_hq(void *void_args) {
4023b0: 55 push %rbp
/* Waits, spins until */
workArgsHQ *args = (workArgsHQ*)void_args;
LockedQueue *lq = args->lq;
Alarm alarm = args->alarm;
long packets_processed = 0;
4023b1: 31 ed xor %ebp,%ebp
return packets_processed;
}
void *work_hq(void *void_args) {
4023b3: 53 push %rbx
4023b4: 48 83 ec 28 sub $0x28,%rsp
/* Waits, spins until */
workArgsHQ *args = (workArgsHQ*)void_args;
LockedQueue *lq = args->lq;
Alarm alarm = args->alarm;
4023b8: 48 8b 47 08 mov 0x8(%rdi),%rax
void *work_hq(void *void_args) {
/* Waits, spins until */
workArgsHQ *args = (workArgsHQ*)void_args;
LockedQueue *lq = args->lq;
4023bc: 48 8b 5f 20 mov 0x20(%rdi),%rbx
Alarm alarm = args->alarm;
4023c0: 48 89 04 24 mov %rax,(%rsp)
4023c4: 48 8b 47 10 mov 0x10(%rdi),%rax
4023c8: 48 89 44 24 08 mov %rax,0x8(%rsp)
4023cd: 48 8b 47 18 mov 0x18(%rdi),%rax
4023d1: 48 89 44 24 10 mov %rax,0x10(%rsp)
long packets_processed = 0;
long fingerprint = 0;
Packet_t *tmp;
while(checkAlarm(&alarm)) {
4023d6: eb 39 jmp 402411 <work_hq+0x61>
4023d8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
4023df: 00
lq->lock->lock(lq->lock);
4023e0: 48 8b 43 08 mov 0x8(%rbx),%rax
4023e4: 48 89 c7 mov %rax,%rdi
4023e7: ff 50 08 callq *0x8(%rax)
if ((tmp = deq(lq->q))) {
4023ea: 48 8b 3b mov (%rbx),%rdi
4023ed: e8 ae f7 ff ff callq 401ba0 <deq>
4023f2: 48 85 c0 test %rax,%rax
4023f5: 74 10 je 402407 <work_hq+0x57>
fingerprint += getFingerprint(tmp->iterations, tmp->seed);
4023f7: 48 8b 70 08 mov 0x8(%rax),%rsi
4023fb: 48 8b 38 mov (%rax),%rdi
packets_processed ++;
4023fe: 48 83 c5 01 add $0x1,%rbp
Packet_t *tmp;
while(checkAlarm(&alarm)) {
lq->lock->lock(lq->lock);
if ((tmp = deq(lq->q))) {
fingerprint += getFingerprint(tmp->iterations, tmp->seed);
402402: e8 a9 12 00 00 callq 4036b0 <getFingerprint>
packets_processed ++;
} else {
}
lq->lock->unlock(lq->lock);
402407: 48 8b 43 08 mov 0x8(%rbx),%rax
40240b: 48 89 c7 mov %rax,%rdi
40240e: ff 50 10 callq *0x10(%rax)
Alarm alarm = args->alarm;
long packets_processed = 0;
long fingerprint = 0;
Packet_t *tmp;
while(checkAlarm(&alarm)) {
402411: 48 89 e7 mov %rsp,%rdi
402414: e8 d7 10 00 00 callq 4034f0 <checkAlarm>
402419: 85 c0 test %eax,%eax
40241b: 75 c3 jne 4023e0 <work_hq+0x30>
} else {
}
lq->lock->unlock(lq->lock);
}
pthread_exit((void*)packets_processed);
40241d: 48 89 ef mov %rbp,%rdi
402420: e8 5b e7 ff ff callq 400b80 <pthread_exit@plt>
402425: 66 66 2e 0f 1f 84 00 data32 nopw %cs:0x0(%rax,%rax,1)
40242c: 00 00 00 00