Skip to content

Instantly share code, notes, and snippets.

@pscollins
Last active February 19, 2016 07:56
Show Gist options
  • Save pscollins/3be7853d8c265527110a to your computer and use it in GitHub Desktop.
Save pscollins/3be7853d8c265527110a to your computer and use it in GitHub Desktop.
kira does assembly

The full disassembly of both versions is included below for reference, but in essence the homequeue version looks like:

START:
lock()
result = deq(thing)
if result == 0:
    goto FAIL
packets_processed++
getFingerprint()

FAIL:
unlock()
not_yet_done = checkAlarm()
if not_yet_done != 0:
    goto START
pthread_exit()

but the lockfree version looks like:

START:
result = deq(thing)
if result != 0:
    goto SUCCEED
not_yet_done = checkAlarm()
if not_yet_done != 0:
    goto START
pthread_exit()

SUCCEED:
packets_processed++
getFingerprint()
goto START

This is worrying because (tl;dr --- I don't know if you've covered this; we learned it in architecture) the processor has to guess in advance whether or not a conditional branch is going to be taken, and if it guesses wrong it hurts pretty badly --- it's like a cache miss (but less expensive, I think). You can read more here.

Basically, it doesn't matter how many ifs you have as long as they follow a regular pattern (e.g. TTTTTTT... or FFFFFFF... or TFTFTF..., etc. --- the actual details of what an i7 processor might be able to learn are super proprietary, so it's hard to give 100% accurate examples). I have to guess that the homequeue version probably has a more regular pattern to its branching, because locking/releasing will give time for the dispatcher to push a packet onto the queue. Also, maybe some atomic magic happens that forces writes from the dispatcher to become visible to the dequeuer more quickly than they otherwise would.

So maybe the homequeue implementation only gets NULL from deq every 10th call, but the lock-free implementation gets NULLs scattered randomly every 2 or 3 calls. Now your branch predictor is sad and your code is slower. I remember Andrew Chien (Intel man) at one point saying that there's a statistical difference between branches like `if result == 0: goto FAIL` and `if result != 0: goto SUCCEED`, so the fact that gcc has flipped them is worrying.

But this is really nuts to suggest, so I don't know.

Lockfree:

void *work(void *void_args) {
  402083:       53                      push   %rbx
  402084:       48 89 fb                mov    %rdi,%rbx
  402087:       48 83 ec 28             sub    $0x28,%rsp
    /* Waits, spins until */
    workArgs *args = (workArgs*)void_args;
    Alarm alarm = args->alarm;
  40208b:       48 8b 47 08             mov    0x8(%rdi),%rax
  40208f:       48 89 04 24             mov    %rax,(%rsp)
  402093:       48 8b 47 10             mov    0x10(%rdi),%rax
  402097:       48 89 44 24 08          mov    %rax,0x8(%rsp)
  40209c:       48 8b 47 18             mov    0x18(%rdi),%rax
  4020a0:       48 89 44 24 10          mov    %rax,0x10(%rsp)
    long packets_processed = 0;
    long fingerprint = 0;
    Packet_t *tmp;

    while(checkAlarm(&alarm)) {
  4020a5:       eb 17                   jmp    4020be <work+0x3e>
  4020a7:       66 0f 1f 84 00 00 00    nopw   0x0(%rax,%rax,1)
  4020ae:       00 00 
        if ((tmp = deq(args->q))) {
  4020b0:       48 8b 7b 20             mov    0x20(%rbx),%rdi
  4020b4:       e8 e7 fa ff ff          callq  401ba0 <deq>
  4020b9:       48 85 c0                test   %rax,%rax
  4020bc:       75 1a                   jne    4020d8 <work+0x58>
    Alarm alarm = args->alarm;
    long packets_processed = 0;
    long fingerprint = 0;
    Packet_t *tmp;

    while(checkAlarm(&alarm)) {
  4020be:       48 89 e7                mov    %rsp,%rdi
  4020c1:       e8 2a 14 00 00          callq  4034f0 <checkAlarm>
  4020c6:       85 c0                   test   %eax,%eax
  4020c8:       75 e6                   jne    4020b0 <work+0x30>
            packets_processed ++;
        } else {
        }
    }

    pthread_exit((void*)packets_processed);
  4020ca:       48 89 ef                mov    %rbp,%rdi
  4020cd:       e8 ae ea ff ff          callq  400b80 <pthread_exit@plt>
  4020d2:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
    long fingerprint = 0;
    Packet_t *tmp;

    while(checkAlarm(&alarm)) {
        if ((tmp = deq(args->q))) {
            fingerprint += getFingerprint(tmp->iterations, tmp->seed);
  4020d8:       48 8b 70 08             mov    0x8(%rax),%rsi
  4020dc:       48 8b 38                mov    (%rax),%rdi
            packets_processed ++;
  4020df:       48 83 c5 01             add    $0x1,%rbp
    long fingerprint = 0;
    Packet_t *tmp;

    while(checkAlarm(&alarm)) {
        if ((tmp = deq(args->q))) {
            fingerprint += getFingerprint(tmp->iterations, tmp->seed);
  4020e3:       e8 c8 15 00 00          callq  4036b0 <getFingerprint>
  4020e8:       eb d4                   jmp    4020be <work+0x3e>
  4020ea:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

VS home queue:

void *work_hq(void *void_args) {
  4023b0:       55                      push   %rbp
    /* Waits, spins until */
    workArgsHQ *args = (workArgsHQ*)void_args;
    LockedQueue *lq = args->lq;
    Alarm alarm = args->alarm;
    long packets_processed = 0;
  4023b1:       31 ed                   xor    %ebp,%ebp

    return packets_processed;
}


void *work_hq(void *void_args) {
  4023b3:       53                      push   %rbx
  4023b4:       48 83 ec 28             sub    $0x28,%rsp
    /* Waits, spins until */
    workArgsHQ *args = (workArgsHQ*)void_args;
    LockedQueue *lq = args->lq;
    Alarm alarm = args->alarm;
  4023b8:       48 8b 47 08             mov    0x8(%rdi),%rax


void *work_hq(void *void_args) {
    /* Waits, spins until */
    workArgsHQ *args = (workArgsHQ*)void_args;
    LockedQueue *lq = args->lq;
  4023bc:       48 8b 5f 20             mov    0x20(%rdi),%rbx
    Alarm alarm = args->alarm;
  4023c0:       48 89 04 24             mov    %rax,(%rsp)
  4023c4:       48 8b 47 10             mov    0x10(%rdi),%rax
  4023c8:       48 89 44 24 08          mov    %rax,0x8(%rsp)
  4023cd:       48 8b 47 18             mov    0x18(%rdi),%rax
  4023d1:       48 89 44 24 10          mov    %rax,0x10(%rsp)
    long packets_processed = 0;
    long fingerprint = 0;
    Packet_t *tmp;

    while(checkAlarm(&alarm)) {
  4023d6:       eb 39                   jmp    402411 <work_hq+0x61>
  4023d8:       0f 1f 84 00 00 00 00    nopl   0x0(%rax,%rax,1)
  4023df:       00 
        lq->lock->lock(lq->lock);
  4023e0:       48 8b 43 08             mov    0x8(%rbx),%rax
  4023e4:       48 89 c7                mov    %rax,%rdi
  4023e7:       ff 50 08                callq  *0x8(%rax)
        if ((tmp = deq(lq->q))) {
  4023ea:       48 8b 3b                mov    (%rbx),%rdi
  4023ed:       e8 ae f7 ff ff          callq  401ba0 <deq>
  4023f2:       48 85 c0                test   %rax,%rax
  4023f5:       74 10                   je     402407 <work_hq+0x57>
            fingerprint += getFingerprint(tmp->iterations, tmp->seed);
  4023f7:       48 8b 70 08             mov    0x8(%rax),%rsi
  4023fb:       48 8b 38                mov    (%rax),%rdi
            packets_processed ++;
  4023fe:       48 83 c5 01             add    $0x1,%rbp
    Packet_t *tmp;

    while(checkAlarm(&alarm)) {
        lq->lock->lock(lq->lock);
        if ((tmp = deq(lq->q))) {
            fingerprint += getFingerprint(tmp->iterations, tmp->seed);
  402402:       e8 a9 12 00 00          callq  4036b0 <getFingerprint>
            packets_processed ++;
        } else {
        }
        lq->lock->unlock(lq->lock);
  402407:       48 8b 43 08             mov    0x8(%rbx),%rax
  40240b:       48 89 c7                mov    %rax,%rdi
  40240e:       ff 50 10                callq  *0x10(%rax)
    Alarm alarm = args->alarm;
    long packets_processed = 0;
    long fingerprint = 0;
    Packet_t *tmp;

    while(checkAlarm(&alarm)) {
  402411:       48 89 e7                mov    %rsp,%rdi
  402414:       e8 d7 10 00 00          callq  4034f0 <checkAlarm>
  402419:       85 c0                   test   %eax,%eax
  40241b:       75 c3                   jne    4023e0 <work_hq+0x30>
        } else {
        }
        lq->lock->unlock(lq->lock);
    }

    pthread_exit((void*)packets_processed);
  40241d:       48 89 ef                mov    %rbp,%rdi
  402420:       e8 5b e7 ff ff          callq  400b80 <pthread_exit@plt>
  402425:       66 66 2e 0f 1f 84 00    data32 nopw %cs:0x0(%rax,%rax,1)
  40242c:       00 00 00 00 
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment