nddrylliog · August 29, 2015 14:02 · nddrylliog · May 31, 2014
diff --git a/00_fun.c b/00_fun.c
 #include <assert.h>
 #include <stdio.h>

 float takes_a_vec3(float *v) { /* returns the sum of the three consecutive floats v[0], v[1], v[2] */
    return v[0]+v[1]+v[2]; /* reads 12 bytes starting at v, so v must point at three readable floats */
 }

 float takes_three_floats(float x, float y, float z) { /* x and y are never referenced by name */
    // &z is the address of ONE float, yet takes_a_vec3 also reads v[1] and v[2] past it: undefined behavior.
    return takes_a_vec3(&z); /* the sum therefore depends on whatever the compiler placed right after z */
 }

 int main(int argc, char **argv) { /* argc and argv are unused */
    // not guaranteed to pass: the callee reads past &z (undefined behavior), so the result depends on the compiler and optimization level.
    assert(takes_three_floats(1,2,3) == 6);
    puts("All good."); /* reached only if the assert above did not abort */

    return 0;
 }
diff --git a/01_prereqs.md b/01_prereqs.md
diff --git a/fun-O0.S b/fun-O0.S
 fun:
 (__TEXT,__text) section

 # float takes_a_vec3(float *v)
 _takes_a_vec3:
 ## prelude - save %rbp (current stack frame)
 0000000100000e20	pushq	%rbp
 0000000100000e21	movq	%rsp, %rbp
 ## save %rdi (pointer register, our argument) on the stack!
 0000000100000e24	movq	%rdi, -0x8(%rbp)
 ## ..nevermind, load it back, we need it (typical non-optimized code)
 0000000100000e28	movq	-0x8(%rbp), %rdi
 ## take the first float pointed to by %rdi, put it in %xmm0 (floating point register)
 0000000100000e2c	movss	(%rdi), %xmm0
 ## reload %rdi from its stack slot (nothing is saved here; -O0 code re-reads v before every access)
 0000000100000e30	movq	-0x8(%rbp), %rdi
 ## add the second float, at 0x4(%rdi), into %xmm0 (so, 3.0 + 2.0: %rdi points at z, and y sits 4 bytes above it)
 0000000100000e34	addss	0x4(%rdi), %xmm0
 ## another redundant reload of %rdi from the stack
 0000000100000e39	movq	-0x8(%rbp), %rdi
 ## and add the third float, at 0x8(%rdi) (5.0 + 1.0 = 6.0: that slot holds x)
 0000000100000e3d	addss	0x8(%rdi), %xmm0
 ## restore %rbp
 0000000100000e42	popq	%rbp
 ## and return. the result is in %xmm0, which is expected because the function
 ## returns a float (and we return by register here)
 0000000100000e43	ret
 ## never executed (it sits after the ret) and it doesn't touch %rax: it's a multi-byte nop, padding so that the next function starts at 0x100000e50.
 0000000100000e44	nopw	%cs:(%rax,%rax)

 # float takes_three_floats(float x, float y, float z)
 _takes_three_floats:
 ## prelude - save rbp
 0000000100000e50	pushq	%rbp
 0000000100000e51	movq	%rsp, %rbp
 ## allocate 16 bytes to store our local variables in.
 ## the stack grows down - we subtract from %rsp to allocate
 0000000100000e54	subq	$0x10, %rsp
 ## first arg is at -0x4(%rbp), second is at -0x8, third is at -0xc
 ## take the address of the third arg, put it in %rdi
 0000000100000e58	leaq	-0xc(%rbp), %rdi
 ## spill the three arguments, which arrived in %xmm0, %xmm1 and %xmm2, into those stack slots.
 0000000100000e5c	movss	%xmm0, -0x4(%rbp)
 0000000100000e61	movss	%xmm1, -0x8(%rbp)
 0000000100000e66	movss	%xmm2, -0xc(%rbp)
 ## call takes_a_vec3, which will only use %rdi.
 0000000100000e6b	callq	_takes_a_vec3
 ## free memory, restore stack frame, return.
 ## return value is in %xmm0, which was modified in takes_a_vec3.
 0000000100000e70	addq	$0x10, %rsp
 0000000100000e74	popq	%rbp
 0000000100000e75	ret
 0000000100000e76	nopw	%cs:(%rax,%rax)

 # main - you should know your prototypes!
 _main:
 ## same shit, different prelude
 0000000100000e80	pushq	%rbp
 0000000100000e81	movq	%rsp, %rbp
 ## allocate 32 bytes (jolly!)
 0000000100000e84	subq	$0x20, %rsp
 ## @paniq used integers, so 'integer 1' is moved into the %rax register
 0000000100000e88	movabsq	$0x1, %rax
 ## and promptly converted to single-precision floating point, put into %xmm0
 0000000100000e92	cvtsi2ssq	%rax, %xmm0
 ## same goes for 'integer 2' and 'integer 3'.
 0000000100000e97	movabsq	$0x2, %rax
 0000000100000ea1	cvtsi2ssq	%rax, %xmm1
 0000000100000ea6	movabsq	$0x3, %rax
 0000000100000eb0	cvtsi2ssq	%rax, %xmm2
 ## prepare a variable that'll store the result,
 ## we'll store in the first (-0x4(%rbp)) stack slot we have
 0000000100000eb5	movl	$0x0, -0x4(%rbp)
 ## save %edi and %rsi in the second and third slots
 0000000100000ebc	movl	%edi, -0x8(%rbp)
 0000000100000ebf	movq	%rsi, -0x10(%rbp)
 ## finally, call!
 0000000100000ec3	callq	_takes_three_floats
 ## at this point, we have the result in %xmm0
 ## move 'integer 6' into %rax, convert it, store it in %xmm1
 0000000100000ec8	movabsq	$0x6, %rax
 0000000100000ed2	cvtsi2ssq	%rax, %xmm1
 ## compare %xmm0 (function return value) and %xmm1 (6.0f constant)
 0000000100000ed7	ucomiss	%xmm1, %xmm0
 ## this is convoluted code to test the value ucomiss returned
 ## basically, if the two values are equal, some flags will be set
 ## to 1, 0, 0 - this code sets %rax to 0 if it's true.
 0000000100000eda	sete	%cl
 0000000100000edd	setnp	%dl
 0000000100000ee0	andb	%cl, %dl
 0000000100000ee2	xorb	$0x1, %dl
 0000000100000ee5	andb	$0x1, %dl
 0000000100000ee8	movzbl	%dl, %edi
 0000000100000eeb	movslq	%edi, %rax
 ## if %rax is 0...
 0000000100000eee	cmpq	$0x0, %rax
 ## ...then jump past the ___assert_rtn call, towards the puts call
 0000000100000ef4	je	0x100000f19
 ## otherwise, prepare to call ___assert_rtn with info on
 ## where the assert was that failed so the programmer knows.
 0000000100000efa	leaq	0x6b(%rip), %rdi ## literal pool for: "main"
 0000000100000f01	leaq	0x69(%rip), %rsi ## literal pool for: "fun.c"
 0000000100000f08	movl	$0xf, %edx
 0000000100000f0d	leaq	0x63(%rip), %rcx ## literal pool for: "takes_three_floats(1,2,3) == 6"
 0000000100000f14	callq	0x100000f3a ## symbol stub for: ___assert_rtn
 0000000100000f19	jmpq	0x100000f1e
 ## that's where we jumped if they were equal
 0000000100000f1e	leaq	0x71(%rip), %rdi ## literal pool for: "All good."
 0000000100000f25	callq	0x100000f40 ## symbol stub for: _puts
 ## and we're good! *phew*
 0000000100000f2a	movl	$0x0, %ecx
 0000000100000f2f	movl	%eax, -0x14(%rbp)
 0000000100000f32	movl	%ecx, %eax
 0000000100000f34	addq	$0x20, %rsp
 0000000100000f38	popq	%rbp
 0000000100000f39	ret
diff --git a/fun-O1.s b/fun-O1.s
 fun:
 (__TEXT,__text) section

 # float takes_a_vec3(float *v)
 _takes_a_vec3:
 ## prelude, yadda yadda
 0000000100000ea0	pushq	%rbp
 0000000100000ea1	movq	%rsp, %rbp
 ## take the first float argument, store it in single-precision FP register %xmm0
 0000000100000ea4	movss	(%rdi), %xmm0
 ## add the second argument to it
 0000000100000ea8	addss	0x4(%rdi), %xmm0
 ## then the third argument to that
 0000000100000ead	addss	0x8(%rdi), %xmm0
 ## then return - sum is in %xmm0
 0000000100000eb2	popq	%rbp
 0000000100000eb3	ret
 0000000100000eb4	nopw	%cs:(%rax,%rax)

 # float takes_three_floats(float x, float y, float z)
 _takes_three_floats:
 0000000100000ec0	pushq	%rbp
 0000000100000ec1	movq	%rsp, %rbp
 ## allocate 16 bytes
 0000000100000ec4	subq	$0x10, %rsp
 ## arguments were passed through %xmm0, %xmm1, %xmm2 registers,
 ## not through the stack! hence, '%xmm2' is our 'float z' here.
 ## take it and store it in our first 'local variable' stack slot
 0000000100000ec8	movss	%xmm2, -0x4(%rbp)
 ## take the address of that local variable, store it in the pointer register
 0000000100000ecd	leaq	-0x4(%rbp), %rdi
 ## .. you know where this is going. _takes_a_vec3 now has the address
 ## of a local variable (float, equal to 3.0f) - and there's *nothing good*
 ## near that local variable on the stack. Definitely not 2.0f and 1.0f.
 ## in fact, the program should crash soon after - but it doesn't, because
 ## we don't go outside our program's virtual memory area. It just reads garbage
 ## instead and the result of the addition is (most likely) not 6.0f.
 0000000100000ed1	callq	_takes_a_vec3
 ## free memory & return.
 0000000100000ed6	addq	$0x10, %rsp
 0000000100000eda	popq	%rbp
 0000000100000edb	ret
 0000000100000edc	nopl	(%rax)

 # main(blah)
 _main:
 ## prelude
 0000000100000ee0	pushq	%rbp
 0000000100000ee1	movq	%rsp, %rbp
 ## don't even bother allocating any memory this time - all registers baby
 ## look at this compiler. It knows you're not using the first two arguments (x and y)
 ## so it'll pass only the third, into %xmm2. Note that it didn't change takes_three_floats's prototype.
 ## if it wasn't in the same module, it wouldn't know that and would be forced to pass all three arguments,
 ## not knowing what's inside. In fact, moving _takes_three_floats to another compilation unit would force
 ## it to pass all args, as long as no LTO (link-time optimization) is enabled.
 0000000100000ee4	movss	0x78(%rip), %xmm2
 0000000100000eec	callq	_takes_three_floats
 ## well, *that* compare is going to fail...
 0000000100000ef1	ucomiss	0x70(%rip), %xmm0
 ## hence, this jump *will* happen
 0000000100000ef8	jne	0x100000efc
 0000000100000efa	jnp	0x100000f24
 ## and we get a nice error message.
 0000000100000efc	leaq	0x69(%rip), %rax ## literal pool for: "main"
 0000000100000f03	leaq	0x67(%rip), %rcx ## literal pool for: "fun.c"
 0000000100000f0a	leaq	0x66(%rip), %r8 ## literal pool for: "takes_three_floats(1,2,3) == 6"
 0000000100000f11	movl	$0xf, %edx
 0000000100000f16	movq	%rax, %rdi
 0000000100000f19	movq	%rcx, %rsi
 0000000100000f1c	movq	%r8, %rcx
 0000000100000f1f	callq	0x100000f34 ## symbol stub for: ___assert_rtn
 0000000100000f24	leaq	0x6b(%rip), %rdi ## literal pool for: "All good."
 0000000100000f2b	callq	0x100000f3a ## symbol stub for: _puts
 0000000100000f30	xorl	%eax, %eax
 0000000100000f32	popq	%rbp
 0000000100000f33	ret
diff --git a/fun-O2.S b/fun-O2.S
 fun:
 (__TEXT,__text) section

 # float takes_a_vec3(float *v)
 _takes_a_vec3:
 ## this one is instruction-for-instruction identical to the O1 version.
 ## no stack allocation, just pure register goodness.
 0000000100000ea0	pushq	%rbp
 0000000100000ea1	movq	%rsp, %rbp
 0000000100000ea4	movss	(%rdi), %xmm0
 0000000100000ea8	addss	0x4(%rdi), %xmm0
 0000000100000ead	addss	0x8(%rdi), %xmm0
 0000000100000eb2	popq	%rbp
 0000000100000eb3	ret
 0000000100000eb4	nopw	%cs:(%rax,%rax)

 # float takes_three_floats(float x, float y, float z)
 _takes_three_floats:
 ## okay, this one is fun. prelude
 0000000100000ec0	pushq	%rbp
 0000000100000ec1	movq	%rsp, %rbp
 ## %xmm0 is x, %xmm1 is y, %xmm2 is z
 ## z = z + x = 3.0 + 1.0 = 4.0
 0000000100000ec4	addss	%xmm0, %xmm2
 ## z = z + x = 4.0 + 1.0 = 5.0
 0000000100000ec8	addss	%xmm0, %xmm2
 ## x = z
 0000000100000ecc	movaps	%xmm2, %xmm0
 ## return x
 ##
 ## ...
 ##
 ## Wait, WHAT?
 ## we didn't call takes_a_vec3 because we've inlined it.
 ## it didn't crash because constant folding worked as if everything was passed on the stack
 ## ..except not because the result is 5.0 where it should be 6.0.
 ## so I have no idea what it did here. (keep reading)
 0000000100000ecf	popq	%rbp
 0000000100000ed0	ret
 0000000100000ed1	nopw	%cs:(%rax,%rax)

 # main
 ## but none of this matters! because main calls neither takes_a_vec3
 ## nor does it call takes_three_floats
 _main:
 ## prelude, yada yada
 0000000100000ee0	pushq	%rbp
 0000000100000ee1	movq	%rsp, %rbp
 ## let's add some FP constant to %xmm0 (note: this is an addss, not a load, so it adds to whatever %xmm0 already held)
 0000000100000ee4	addss	0x78(%rip), %xmm0
 ## let's add it to itself (effectively multiplying it by 2)
 0000000100000eec	addss	%xmm0, %xmm0
 ## then let's compare it to some other constant
 0000000100000ef0	ucomiss	0x71(%rip), %xmm0
 ## if it's not good, jump to ___assert_rtn stuff
 0000000100000ef7	jne	0x100000efb
 ## otherwise jump to all good
 0000000100000ef9	jnp	0x100000f23
 0000000100000efb	leaq	0x6a(%rip), %rax ## literal pool for: "main"
 0000000100000f02	leaq	0x68(%rip), %rcx ## literal pool for: "fun.c"
 0000000100000f09	leaq	0x67(%rip), %r8 ## literal pool for: "takes_three_floats(1,2,3) == 6"
 0000000100000f10	movl	$0xf, %edx
 0000000100000f15	movq	%rax, %rdi
 0000000100000f18	movq	%rcx, %rsi
 0000000100000f1b	movq	%r8, %rcx
 0000000100000f1e	callq	0x100000f34 ## symbol stub for: ___assert_rtn
 ## aaaaand we're good.
 0000000100000f23	leaq	0x6c(%rip), %rdi ## literal pool for: "All good."
 0000000100000f2a	callq	0x100000f3a ## symbol stub for: _puts
 0000000100000f2f	xorl	%eax, %eax
 0000000100000f31	popq	%rbp
 0000000100000f32	ret
diff --git a/fun-Z_epilogue.md b/fun-Z_epilogue.md
	#include <assert.h>
	#include <stdio.h>

	float takes_a_vec3(float *v) { /* returns the sum of the three consecutive floats v[0], v[1], v[2] */
	return v[0]+v[1]+v[2]; /* reads 12 bytes starting at v, so v must point at three readable floats */
	}

	float takes_three_floats(float x, float y, float z) { /* x and y are never referenced by name */
	// &z is the address of ONE float, yet takes_a_vec3 also reads v[1] and v[2] past it: undefined behavior.
	return takes_a_vec3(&z); /* the sum therefore depends on whatever the compiler placed right after z */
	}

	int main(int argc, char **argv) { /* argc and argv are unused */
	// not guaranteed to pass: the callee reads past &z (undefined behavior), so the result depends on the compiler and optimization level.
	assert(takes_three_floats(1,2,3) == 6);
	puts("All good."); /* reached only if the assert above did not abort */

	return 0;
	}
	fun:
	(__TEXT,__text) section

	# float takes_a_vec3(float *v)
	_takes_a_vec3:
	## prelude - save %rbp (current stack frame)
	0000000100000e20 pushq %rbp
	0000000100000e21 movq %rsp, %rbp
	## save %rdi (pointer register, our argument) on the stack!
	0000000100000e24 movq %rdi, -0x8(%rbp)
	## ..nevermind, load it back, we need it (typical non-optimized code)
	0000000100000e28 movq -0x8(%rbp), %rdi
	## take the first float pointed to by %rdi, put it in %xmm0 (floating point register)
	0000000100000e2c movss (%rdi), %xmm0
	## reload %rdi from its stack slot (nothing is saved here; -O0 code re-reads v before every access)
	0000000100000e30 movq -0x8(%rbp), %rdi
	## add the second float, at 0x4(%rdi), into %xmm0 (so, 3.0 + 2.0: %rdi points at z, and y sits 4 bytes above it)
	0000000100000e34 addss 0x4(%rdi), %xmm0
	## another redundant reload of %rdi from the stack
	0000000100000e39 movq -0x8(%rbp), %rdi
	## and add the third float, at 0x8(%rdi) (5.0 + 1.0 = 6.0: that slot holds x)
	0000000100000e3d addss 0x8(%rdi), %xmm0
	## restore %rbp
	0000000100000e42 popq %rbp
	## and return. the result is in %xmm0, which is expected because the function
	## returns a float (and we return by register here)
	0000000100000e43 ret
	## never executed (it sits after the ret) and it doesn't touch %rax: it's a multi-byte nop, padding so that the next function starts at 0x100000e50.
	0000000100000e44 nopw %cs:(%rax,%rax)

	# float takes_three_floats(float x, float y, float z)
	_takes_three_floats:
	## prelude - save rbp
	0000000100000e50 pushq %rbp
	0000000100000e51 movq %rsp, %rbp
	## allocate 16 bytes to store our local variables in.
	## the stack grows down - we subtract from %rsp to allocate
	0000000100000e54 subq $0x10, %rsp
	## first arg is at -0x4(%rbp), second is at -0x8, third is at -0xc
	## take the address of the third arg, put it in %rdi
	0000000100000e58 leaq -0xc(%rbp), %rdi
	## spill the three arguments, which arrived in %xmm0, %xmm1 and %xmm2, into those stack slots.
	0000000100000e5c movss %xmm0, -0x4(%rbp)
	0000000100000e61 movss %xmm1, -0x8(%rbp)
	0000000100000e66 movss %xmm2, -0xc(%rbp)
	## call takes_a_vec3, which will only use %rdi.
	0000000100000e6b callq _takes_a_vec3
	## free memory, restore stack frame, return.
	## return value is in %xmm0, which was modified in takes_a_vec3.
	0000000100000e70 addq $0x10, %rsp
	0000000100000e74 popq %rbp
	0000000100000e75 ret
	0000000100000e76 nopw %cs:(%rax,%rax)

	# main - you should know your prototypes!
	_main:
	## same shit, different prelude
	0000000100000e80 pushq %rbp
	0000000100000e81 movq %rsp, %rbp
	## allocate 32 bytes (jolly!)
	0000000100000e84 subq $0x20, %rsp
	## @paniq used integers, so 'integer 1' is moved into the %rax register
	0000000100000e88 movabsq $0x1, %rax
	## and promptly converted to single-precision floating point, put into %xmm0
	0000000100000e92 cvtsi2ssq %rax, %xmm0
	## same goes for 'integer 2' and 'integer 3'.
	0000000100000e97 movabsq $0x2, %rax
	0000000100000ea1 cvtsi2ssq %rax, %xmm1
	0000000100000ea6 movabsq $0x3, %rax
	0000000100000eb0 cvtsi2ssq %rax, %xmm2
	## prepare a variable that'll store the result,
	## we'll store in the first (-0x4(%rbp)) stack slot we have
	0000000100000eb5 movl $0x0, -0x4(%rbp)
	## save %edi and %rsi in the second and third slots
	0000000100000ebc movl %edi, -0x8(%rbp)
	0000000100000ebf movq %rsi, -0x10(%rbp)
	## finally, call!
	0000000100000ec3 callq _takes_three_floats
	## at this point, we have the result in %xmm0
	## move 'integer 6' into %rax, convert it, store it in %xmm1
	0000000100000ec8 movabsq $0x6, %rax
	0000000100000ed2 cvtsi2ssq %rax, %xmm1
	## compare %xmm0 (function return value) and %xmm1 (6.0f constant)
	0000000100000ed7 ucomiss %xmm1, %xmm0
	## this is convoluted code to test the value ucomiss returned
	## basically, if the two values are equal, some flags will be set
	## to 1, 0, 0 - this code sets %rax to 0 if it's true.
	0000000100000eda sete %cl
	0000000100000edd setnp %dl
	0000000100000ee0 andb %cl, %dl
	0000000100000ee2 xorb $0x1, %dl
	0000000100000ee5 andb $0x1, %dl
	0000000100000ee8 movzbl %dl, %edi
	0000000100000eeb movslq %edi, %rax
	## if %rax is 0...
	0000000100000eee cmpq $0x0, %rax
	## ...then jump past the ___assert_rtn call, towards the puts call
	0000000100000ef4 je 0x100000f19
	## otherwise, prepare to call ___assert_rtn with info on
	## where the assert was that failed so the programmer knows.
	0000000100000efa leaq 0x6b(%rip), %rdi ## literal pool for: "main"
	0000000100000f01 leaq 0x69(%rip), %rsi ## literal pool for: "fun.c"
	0000000100000f08 movl $0xf, %edx
	0000000100000f0d leaq 0x63(%rip), %rcx ## literal pool for: "takes_three_floats(1,2,3) == 6"
	0000000100000f14 callq 0x100000f3a ## symbol stub for: ___assert_rtn
	0000000100000f19 jmpq 0x100000f1e
	## that's where we jumped if they were equal
	0000000100000f1e leaq 0x71(%rip), %rdi ## literal pool for: "All good."
	0000000100000f25 callq 0x100000f40 ## symbol stub for: _puts
	## and we're good! phew
	0000000100000f2a movl $0x0, %ecx
	0000000100000f2f movl %eax, -0x14(%rbp)
	0000000100000f32 movl %ecx, %eax
	0000000100000f34 addq $0x20, %rsp
	0000000100000f38 popq %rbp
	0000000100000f39 ret