lydonchandra · January 1, 2016 08:29
diff --git a/parameter passing pointer and __m256.cpp b/parameter passing pointer and __m256.cpp
 /*
  * test_parameterPassing_pointer - small probe used to observe how a compiler
  * passes pointer arguments and a by-value __m256 argument.
  *
  * d1p      pointer to a double; dereferenced twice and summed into mul1.
  * m256_1p  pointer to a __m256, reinterpreted as double*; element [0] is read.
  * m128_1p  pointer to a __m128, reinterpreted as double*; element [1] is read.
  * m256_1   __m256 passed by value; its storage is reinterpreted as double*
  *          and element [2] is read.
  *
  * Prints ((double*)&m256_1)[2] / (m256_dp[0] * mul1 - m128_dp[1]) with printf.
  * Returns nothing.
  */
 void test_parameterPassing_pointer( double *d1p, 
                                     __m256 *m256_1p, __m128 *m128_1p, 
                                     __m256 m256_1) {

 	// mul1 = 2 * (*d1p)
 	double mul1 = (*d1p) + (*d1p);
 	
 	// View the vector storage behind both pointers as arrays of double.
 	// NOTE(review): __m256/__m128 are float vectors; reading them as doubles
 	// looks deliberate (it forces loads at byte offsets 0 and 8) - confirm.
 	double *m256_dp = ((double*)m256_1p);
 	double *m128_dp = ((double*)m128_1p);
 	
 	double mul2 = m256_dp[0] * mul1 - m128_dp[1];
 	// [2] selects bytes 16..23 of the by-value 32-byte argument.
 	double div1 = ((double*)&m256_1)[2] / mul2;
 	
 	// NOTE: the label in the format string says "mul2" but the value printed is div1.
 	printf("mul2=%f\n", div1);
 }


 On OS X/Linux x64 (System V AMD64 calling convention) and using gcc, all four parameters of this function are passed in registers.
 Pointer parameters are passed like any other integer argument, in the general-purpose registers %rdi, %rsi, %rdx, %rcx, %r8, %r9 (in that order).
 __m256, which is 32 bytes in size, is passed by value in a %ymm register (%ymm0 here), as can be seen below:

 ## void test_parameterPassing_pointer(double *d1p, __m256 *m256_1p,
 ##                                    __m128 *m128_1p, __m256 m256_1)
 ## AT&T syntax (op src, dst). Register use visible in this listing:
 ##   %rdi  = d1p       (read as a double)
 ##   %rsi  = m256_1p   (first 8 bytes read as a double)
 ##   %rdx  = m128_1p   (bytes 8..15 read as a double)
 ##   %ymm0 = m256_1    (passed by value)
 ##   %xmm1 accumulates mul2; %xmm0 ends up holding div1 for printf.
 _test_parameterPassing_pointer:         ## @test_parameterPassing_pointer
 	.cfi_startproc
 ## BB#0:
 	## Frame-pointer prologue.
 	pushq	%rbp
 Ltmp24:
 	.cfi_def_cfa_offset 16
 Ltmp25:
 	.cfi_offset %rbp, -16
 	movq	%rsp, %rbp
 Ltmp26:
 	.cfi_def_cfa_register %rbp
 	## xmm1 = *d1p
 	vmovsd	(%rdi), %xmm1
 	## xmm1 = mul1 = *d1p + *d1p
 	vaddsd	%xmm1, %xmm1, %xmm1
 	## xmm1 = m256_dp[0] * mul1
 	vmulsd	(%rsi), %xmm1, %xmm1
 	## xmm1 = mul2 = m256_dp[0] * mul1 - m128_dp[1]   (offset 8 = double index 1)
 	vsubsd	8(%rdx), %xmm1, %xmm1
 	## Take the upper 128 bits of the by-value argument; their low 64 bits
 	## are bytes 16..23 of m256_1, i.e. ((double*)&m256_1)[2].
 	vextractf128	$1, %ymm0, %xmm0	# <<---- our __m256 param
 	//xmm0 == ymm0[128:255]
 	
 	## The shuffle/insert pair below copies 32-bit element 1 of xmm0 into xmm2
 	## and writes it back into element 1 of xmm0, so, per the element maps in
 	## the trailing comments, xmm0 keeps the value it already had.
 	vpshufd	$1, %xmm0, %xmm2        ## xmm2 = xmm0[1,0,0,0]
 	
 	vinsertps	$16, %xmm2, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
 	//$16 == 0001 0000
 	//source 00 = 0, %xmm2[31:0]
 	//dest	 01 = 1, %xmm0[63:32]
 	//mask 0000 
 	//xmm0 == xmm0[127:96], xmm0[95:64], xmm2[31:0], xmm0[31:0]
 	
 	## xmm0 = div1 = m256_1[2] / mul2; xmm0 is also the first FP argument register.
 	vdivsd	%xmm1, %xmm0, %xmm0
 	## rdi = first printf argument (L_.str5 is presumably the "mul2=%f\n" literal;
 	## its definition is not part of this listing).
 	leaq	L_.str5(%rip), %rdi
 	## Variadic call: al = number of vector registers carrying arguments (one: xmm0).
 	movb	$1, %al
 	## Tear the frame down first so that printf returns directly to our caller.
 	popq	%rbp
 	## Clear the upper halves of the ymm registers before leaving AVX code.
 	vzeroupper
 	jmp	_printf                 ## TAILCALL
 	



 On Windows x64, the standard x64 calling convention (a __fastcall-style convention) applies, which means at most 4 parameters are passed in registers.
 Our last param, m256_1 above, is passed by reference, not by value (because it is more than 64 bits long): r9 carries its address, as the listing below shows.
 ; Compiler listing (Intel syntax: op dst, src) with instruction encodings,
 ; for the Windows x64 calling convention. The ";;;" lines echo the C source.
 ; The listing shown here stops after the call to printf; the epilogue is not shown.
 test_parameterPassing_pointer	PROC 
 ; parameter 1(d1p): rcx
 ; parameter 2(m256_1p): rdx
 ; parameter 3(m128_1p): r8
 ; parameter 4(m256_1): [r9]
 .B3.1::                         ; Preds .B3.0

 ;;; void test_parameterPassing_pointer(double *d1p, __m256 *m256_1p, __m128 *m128_1p, __m256 m256_1) {

 $LN290:
 ; Reserve 40 bytes: 32 bytes of shadow space for the printf call plus 8 bytes
 ; that bring rsp back to 16-byte alignment at the call.
  00000 48 83 ec 28      sub rsp, 40                            ; main.c:148.98
 $LN291:

 ;;; 
 ;;; 	double mul1 = (*d1p) + (*d1p);

 ; xmm0 = *d1p
  00004 f2 0f 10 01      movsd xmm0, QWORD PTR [rcx]            ; main.c:150.18
 $LN292:

 ;;; 	double *m256_dp = ((double*)m256_1p);
 ;;; 	double *m128_dp = ((double*)m128_1p);
 ;;; 	double mul2 = m256_dp[0] * mul1 - m128_dp[1];
 ;;; 	double div1 = ((double*)&m256_1)[2] / mul2;
 ;;; 	printf("mul2=%f\n", div1);

 ; rcx (d1p) has already been consumed, so it is reused for printf's first
 ; argument: the address of the format-string literal.
  00008 48 8d 0d 00 00 
        00 00            lea rcx, QWORD PTR [??_C@_08A@mul2?$DN?$CFf?6?$AA@] ; main.c:155.2
 $LN293:
 ; xmm1 = ((double*)&m256_1)[2]: byte offset 16 through the pointer in r9.
  0000f f2 41 0f 10 49 
        10               movsd xmm1, QWORD PTR [16+r9]          ; main.c:155.2 <<----- our __m256 param
 $LN294:
 ; xmm0 = mul1 = *d1p + *d1p
  00015 f2 0f 58 c0      addsd xmm0, xmm0                       ; main.c:155.2
 $LN295:
 ; xmm0 = m256_dp[0] * mul1
  00019 f2 0f 59 02      mulsd xmm0, QWORD PTR [rdx]            ; main.c:155.2
 $LN296:
 ; Stores r9 at [rsp+72], i.e. 32 bytes above the value rsp had on entry
 ; (72 - 40); presumably the caller-provided home slot of the 4th argument.
  0001d 4c 89 4c 24 48   mov QWORD PTR [72+rsp], r9             ; main.c:148.98  
 $LN297:
 ; xmm0 = mul2 = m256_dp[0] * mul1 - m128_dp[1]   (offset 8 = double index 1)
  00022 f2 41 0f 5c 40 
        08               subsd xmm0, QWORD PTR [8+r8]           ; main.c:155.2
 $LN298:
 ; xmm1 = div1 = m256_1[2] / mul2; xmm1 is the FP register of argument slot 2.
  00028 f2 0f 5e c8      divsd xmm1, xmm0                       ; main.c:155.2
 $LN299:
 ; Copy the same 64 bits into rdx, the integer register of argument slot 2:
 ; printf is variadic, and this convention expects floating-point variadic
 ; arguments to be present in the integer register as well.
  0002c 66 48 0f 7e ca   movd rdx, xmm1                         ; main.c:155.2
 $LN300:
 ; Indirect call through the import entry for printf.
  00031 ff 15 fc ff ff 
        ff               call QWORD PTR [__imp_printf]          ; main.c:155.2
 $LN301:
                                ; LOE rbx rbp rsi rdi r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
 .B3.2::                         ; Preds .B3.1

 ;;; }
	/*
	 * test_parameterPassing_pointer - small probe used to observe how a compiler
	 * passes pointer arguments and a by-value __m256 argument.
	 *
	 * d1p      pointer to a double; dereferenced twice and summed into mul1.
	 * m256_1p  pointer to a __m256, reinterpreted as double*; element [0] is read.
	 * m128_1p  pointer to a __m128, reinterpreted as double*; element [1] is read.
	 * m256_1   __m256 passed by value; its storage is reinterpreted as double*
	 *          and element [2] is read.
	 *
	 * Prints ((double*)&m256_1)[2] / (m256_dp[0] * mul1 - m128_dp[1]) with printf.
	 * Returns nothing.
	 */
	void test_parameterPassing_pointer( double *d1p,
	                                    __m256 *m256_1p, __m128 *m128_1p,
	                                    __m256 m256_1) {

		/* mul1 = 2 * (*d1p); d1p must be dereferenced, not added as a pointer. */
		double mul1 = (*d1p) + (*d1p);

		/* View the vector storage behind both pointers as arrays of double. */
		double *m256_dp = ((double*)m256_1p);
		double *m128_dp = ((double*)m128_1p);

		double mul2 = m256_dp[0] * mul1 - m128_dp[1];
		/* [2] selects bytes 16..23 of the by-value 32-byte argument. */
		double div1 = ((double*)&m256_1)[2] / mul2;

		/* NOTE: the label in the format string says "mul2" but the value printed is div1. */
		printf("mul2=%f\n", div1);
	}


	On OS X/Linux x64 (System V AMD64 calling convention) and using gcc, all four parameters of this function are passed in registers.
	Pointer parameters are passed like any other integer argument, in the general-purpose registers %rdi, %rsi, %rdx, %rcx, %r8, %r9 (in that order).
	__m256, which is 32 bytes in size, is passed by value in a %ymm register (%ymm0 here), as can be seen below:

	## void test_parameterPassing_pointer(double *d1p, __m256 *m256_1p,
	##                                    __m128 *m128_1p, __m256 m256_1)
	## AT&T syntax (op src, dst). Register use visible in this listing:
	##   %rdi  = d1p       (read as a double)
	##   %rsi  = m256_1p   (first 8 bytes read as a double)
	##   %rdx  = m128_1p   (bytes 8..15 read as a double)
	##   %ymm0 = m256_1    (passed by value)
	##   %xmm1 accumulates mul2; %xmm0 ends up holding div1 for printf.
	_test_parameterPassing_pointer: ## @test_parameterPassing_pointer
	.cfi_startproc
	## BB#0:
	## Frame-pointer prologue.
	pushq %rbp
	Ltmp24:
	.cfi_def_cfa_offset 16
	Ltmp25:
	.cfi_offset %rbp, -16
	movq %rsp, %rbp
	Ltmp26:
	.cfi_def_cfa_register %rbp
	## xmm1 = *d1p
	vmovsd (%rdi), %xmm1
	## xmm1 = mul1 = *d1p + *d1p
	vaddsd %xmm1, %xmm1, %xmm1
	## xmm1 = m256_dp[0] * mul1
	vmulsd (%rsi), %xmm1, %xmm1
	## xmm1 = mul2 = m256_dp[0] * mul1 - m128_dp[1]   (offset 8 = double index 1)
	vsubsd 8(%rdx), %xmm1, %xmm1
	## Take the upper 128 bits of the by-value argument; their low 64 bits
	## are bytes 16..23 of m256_1, i.e. ((double*)&m256_1)[2].
	vextractf128 $1, %ymm0, %xmm0 # <<---- our __m256 param
	//xmm0 == ymm0[128:255]

	## The shuffle/insert pair below copies 32-bit element 1 of xmm0 into xmm2
	## and writes it back into element 1 of xmm0, so, per the element maps in
	## the trailing comments, xmm0 keeps the value it already had.
	vpshufd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0,0,0]

	vinsertps $16, %xmm2, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
	//$16 == 0001 0000
	//source 00 = 0, %xmm2[31:0]
	//dest 01 = 1, %xmm0[63:32]
	//mask 0000
	//xmm0 == xmm0[127:96], xmm0[95:64], xmm2[31:0], xmm0[31:0]

	## xmm0 = div1 = m256_1[2] / mul2; xmm0 is also the first FP argument register.
	vdivsd %xmm1, %xmm0, %xmm0
	## rdi = first printf argument (L_.str5 is presumably the "mul2=%f\n" literal;
	## its definition is not part of this listing).
	leaq L_.str5(%rip), %rdi
	## Variadic call: al = number of vector registers carrying arguments (one: xmm0).
	movb $1, %al
	## Tear the frame down first so that printf returns directly to our caller.
	popq %rbp
	## Clear the upper halves of the ymm registers before leaving AVX code.
	vzeroupper
	jmp _printf ## TAILCALL




	On Windows x64, the standard x64 calling convention (a __fastcall-style convention) applies, which means at most 4 parameters are passed in registers.
	Our last param, m256_1 above, is passed by reference, not by value (because it is more than 64 bits long): r9 carries its address, as the listing below shows.
	; Compiler listing (Intel syntax: op dst, src) with instruction encodings,
	; for the Windows x64 calling convention. The ";;;" lines echo the C source.
	; The listing shown here stops after the call to printf; the epilogue is not shown.
	test_parameterPassing_pointer PROC
	; parameter 1(d1p): rcx
	; parameter 2(m256_1p): rdx
	; parameter 3(m128_1p): r8
	; parameter 4(m256_1): [r9]
	.B3.1:: ; Preds .B3.0

	;;; void test_parameterPassing_pointer(double *d1p, __m256 *m256_1p, __m128 *m128_1p, __m256 m256_1) {

	$LN290:
	; Reserve 40 bytes: 32 bytes of shadow space for the printf call plus 8 bytes
	; that bring rsp back to 16-byte alignment at the call.
	00000 48 83 ec 28 sub rsp, 40 ; main.c:148.98
	$LN291:

	;;;
	;;; double mul1 = (*d1p) + (*d1p);

	; xmm0 = *d1p
	00004 f2 0f 10 01 movsd xmm0, QWORD PTR [rcx] ; main.c:150.18
	$LN292:

	;;; double *m256_dp = ((double*)m256_1p);
	;;; double *m128_dp = ((double*)m128_1p);
	;;; double mul2 = m256_dp[0] * mul1 - m128_dp[1];
	;;; double div1 = ((double*)&m256_1)[2] / mul2;
	;;; printf("mul2=%f\n", div1);

	; rcx (d1p) has already been consumed, so it is reused for printf's first
	; argument: the address of the format-string literal.
	00008 48 8d 0d 00 00
	00 00 lea rcx, QWORD PTR [??_C@_08A@mul2?$DN?$CFf?6?$AA@] ; main.c:155.2
	$LN293:
	; xmm1 = ((double*)&m256_1)[2]: byte offset 16 through the pointer in r9.
	0000f f2 41 0f 10 49
	10 movsd xmm1, QWORD PTR [16+r9] ; main.c:155.2 <<----- our __m256 param
	$LN294:
	; xmm0 = mul1 = *d1p + *d1p
	00015 f2 0f 58 c0 addsd xmm0, xmm0 ; main.c:155.2
	$LN295:
	; xmm0 = m256_dp[0] * mul1
	00019 f2 0f 59 02 mulsd xmm0, QWORD PTR [rdx] ; main.c:155.2
	$LN296:
	; Stores r9 at [rsp+72], i.e. 32 bytes above the value rsp had on entry
	; (72 - 40); presumably the caller-provided home slot of the 4th argument.
	0001d 4c 89 4c 24 48 mov QWORD PTR [72+rsp], r9 ; main.c:148.98
	$LN297:
	; xmm0 = mul2 = m256_dp[0] * mul1 - m128_dp[1]   (offset 8 = double index 1)
	00022 f2 41 0f 5c 40
	08 subsd xmm0, QWORD PTR [8+r8] ; main.c:155.2
	$LN298:
	; xmm1 = div1 = m256_1[2] / mul2; xmm1 is the FP register of argument slot 2.
	00028 f2 0f 5e c8 divsd xmm1, xmm0 ; main.c:155.2
	$LN299:
	; Copy the same 64 bits into rdx, the integer register of argument slot 2:
	; printf is variadic, and this convention expects floating-point variadic
	; arguments to be present in the integer register as well.
	0002c 66 48 0f 7e ca movd rdx, xmm1 ; main.c:155.2
	$LN300:
	; Indirect call through the import entry for printf.
	00031 ff 15 fc ff ff
	ff call QWORD PTR [__imp_printf] ; main.c:155.2
	$LN301:
	; LOE rbx rbp rsi rdi r12 r13 r14 r15 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
	.B3.2:: ; Preds .B3.1

	;;; }