RyanMarcus · July 13, 2015 21:46
diff --git a/gistfile1.txt b/gistfile1.txt
 %define loopcount 800000


 global  main                          
 extern  printf                        
 extern 	srand
 extern	 rand
 	                             
 	
 segment .data

 	string 	db 'Hello world!', 10, 0x00      
 	fmt 	db '%d counts / %d samples for a value of pi=%f', 10, 0x00
 	loopfmt	db 'Iterations remaining: %d (current sum: %d / %d)', 10, 0x00	
 	val1 	dd  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0
 	val2 	dd 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0
 	;;   1/2147483647  * 2, the inverse of rand max times two
 	invmax  dd  9.3132257504915e-10
 	offset	dd  1.0

 	align 32	
 	result   dd 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

 	
 	
 segment	.text
 main:

 	;; push	rbp		
        ;; mov     rdi, string
 	;; mov     rax, 0                  ; set this to 0 because we are passing 0 arguments to printf's ...		
        ;; call    printf		
 	;; pop 	rbp

 	;; fld	qword [val1]
 	;; fsqrt
 	;; fst	qword [result]


 	
 	;; push	rbp
 	;; mov 	rdi, 42
 	;; call	srand
 	;; pop rbp


 	vzeroall 		; zero all ymm* registers to make sure we are in a good state

 	mov r12d, 0				; reset the sum counter
 	mov r11, loopcount			; loop counter
 	mov eax, 42				; LCG seed
 	

 	vbroadcastss	ymm15, [invmax] 	; load invmax into ymm15
 	vbroadcastss	ymm14, [offset] 	; load offset into ymm14


 	mov r8d, 1103515245			; LCG multiplier
 	mov r9d, 12345				; LCG increment
 	mov r10d, 2147483647			; modulo


 loopStart:

 loadRandomIntoYmm1:	

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm0, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm1, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm2, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm3, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm4, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm5, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm6, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm7, rax

 	vmovss [result], xmm0
 	vmovss [result+4], xmm1
 	vmovss [result+8], xmm2
 	vmovss [result+12], xmm3
 	vmovss [result+16], xmm4
 	vmovss [result+20], xmm5
 	vmovss [result+24], xmm6
 	vmovss [result+28], xmm7


 	vmovaps	ymm1, [result]
 	
 loadRandomIntoYmm2:	
 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm0, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm1, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm2, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm3, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm4, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm5, rax

 	mul	r8d
 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm6, rax
 	mul	r8d

 	add	eax, r9d
 	and 	eax, r10d
 	cvtsi2ss xmm7, rax

 	vmovss [result], xmm0
 	vmovss [result+4], xmm1
 	vmovss [result+8], xmm2
 	vmovss [result+12], xmm3
 	vmovss [result+16], xmm4
 	vmovss [result+20], xmm5
 	vmovss [result+24], xmm6
 	vmovss [result+28], xmm7
 	
 	vmovaps	ymm2, [result]

 	




 	
 computeDistanceToOrigin:
 	;; at this point, ymm1 and ymm2 contain random x and y values. we need to compute the (squared)
 	;; distance to the origin

 	vfmsub132ps	ymm1, ymm14, ymm15 ; ymm1 = ymm1 * ymm15 - ymm14 (random numbers between 0 and 1)
 	vfmsub132ps	ymm2, ymm14, ymm15 ; ymm2 = ymm2 * ymm15 - ymm14 (random numbers between 0 and 1)

 	
 	mov 	r15d, eax 	; save the last random seed into r15
 	
 	;; y = y^2, then x = x * x + y
 	vmulps	ymm2, ymm2, ymm2
 	vfmadd213ps	ymm1, ymm1, ymm2


 	
 	vcmpgeps ymm1, ymm14 	; compare against the radius, which happens to be equal to the offset
 	vandnps	ymm1, ymm14	; make positives 1 and negatives 0


 	vhaddps ymm1, ymm1, ymm1
 	vhaddps ymm1, ymm1, ymm1
 	
 	
 tabulate:	
 	vmovaps	 [result], ymm1   ; move the result into the result array

 	;; now we need to add [result+0] and [result+16]
 	cvttss2si	eax, [result]
 	cvttss2si	ebx, [result+16]
 	add	eax, ebx
 	add 	r12d, eax


 	;; push 	r12
 	;; push	rax
 	;; push 	r11
 	;; push 	rbp
 	
 	;; mov 	rdi, loopfmt
 	;; mov	esi, r11d
 	;; mov 	edx, r12d
 	;; mov	ecx, eax
 	;; mov	rax, 0
 	
 	;; call	printf
 	
 	;; pop 	rbp
 	;; pop 	r11
 	;; pop 	rax
 	;; pop	r12
 	
 	dec 	r11
 	
 	cmp	r11, 0
 	mov	eax, r15d
 	jne loopStart
 	
 printResults:

 	;; store the total number of loop counts into eax
 	mov	eax, loopcount
 	mov	rdi, 8
 	mul	rdi
 	mov	edx, eax 	; store the number of samples in rdx

 	mov 	[result], eax
 	mov	[result+4], r12d

 	;; divide and multiply by 4
 	finit
 	fild	dword [result+4]
 	fild	dword [result]
 	fdiv
 	fadd	ST0, ST0
 	fadd	ST0, ST0
 	fstp	dword [result]
 	
 	
 	push 	rbp		; preserve rdp on the stack
 	mov 	rdi, fmt	; load the format string into the first non-floating argument
 	mov	esi, r12d	; load the sum into the first argument
 	;;  number of samples already loaded into edx
 	cvtss2sd xmm0, dword [result] ; load value into printf floating point parameter
 		   
 	mov 	 rax, 1 	; 1 floating point argument
 	call 	 printf	; call printf
 	pop 	 rbp		; get back the old rbp register
 		   
 	mov      rax, 0	; set return value to 0 for success
        ret                	; return
	%define loopcount 800000


	global main
	extern printf
	extern srand
	extern rand


	segment .data

	string db 'Hello world!', 10, 0x00
	fmt db '%d counts / %d samples for a value of pi=%f', 10, 0x00
	loopfmt db 'Iterations remaining: %d (current sum: %d / %d)', 10, 0x00
	val1 dd 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
	val2 dd 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0
	;; 1/2147483647 * 2, the inverse of rand max times two
	invmax dd 9.3132257504915e-10
	offset dd 1.0

	align 32
	result dd 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0



	segment .text
	main:

	;; push rbp
	;; mov rdi, string
	;; mov rax, 0 ; set this to 0 because we are passing 0 arguments to printf's ...
	;; call printf
	;; pop rbp

	;; fld qword [val1]
	;; fsqrt
	;; fst qword [result]



	;; push rbp
	;; mov rdi, 42
	;; call srand
	;; pop rbp


	vzeroall ; zero all ymm* registers to make sure we are in a good state

	mov r12d, 0 ; reset the sum counter
	mov r11, loopcount ; loop counter
	mov eax, 42 ; LCG seed


	vbroadcastss ymm15, [invmax] ; load invmax into ymm15
	vbroadcastss ymm14, [offset] ; load offset into ymm14


	mov r8d, 1103515245 ; LCG multiplier
	mov r9d, 12345 ; LCG increment
	mov r10d, 2147483647 ; modulo


	loopStart:

	loadRandomIntoYmm1:

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm0, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm1, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm2, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm3, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm4, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm5, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm6, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm7, rax

	vmovss [result], xmm0
	vmovss [result+4], xmm1
	vmovss [result+8], xmm2
	vmovss [result+12], xmm3
	vmovss [result+16], xmm4
	vmovss [result+20], xmm5
	vmovss [result+24], xmm6
	vmovss [result+28], xmm7


	vmovaps ymm1, [result]

	loadRandomIntoYmm2:
	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm0, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm1, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm2, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm3, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm4, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm5, rax

	mul r8d
	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm6, rax
	mul r8d

	add eax, r9d
	and eax, r10d
	cvtsi2ss xmm7, rax

	vmovss [result], xmm0
	vmovss [result+4], xmm1
	vmovss [result+8], xmm2
	vmovss [result+12], xmm3
	vmovss [result+16], xmm4
	vmovss [result+20], xmm5
	vmovss [result+24], xmm6
	vmovss [result+28], xmm7

	vmovaps ymm2, [result]







	computeDistanceToOrigin:
	;; at this point, ymm1 and ymm2 contain random x and y values. we need to compute the (squared)
	;; distance to the origin

	vfmsub132ps ymm1, ymm14, ymm15 ; ymm1 = ymm1 * ymm15 - ymm14 (random numbers between 0 and 1)
	vfmsub132ps ymm2, ymm14, ymm15 ; ymm2 = ymm2 * ymm15 - ymm14 (random numbers between 0 and 1)


	mov r15d, eax ; save the last random seed into r15

	;; y = y^2, then x = x * x + y
	vmulps ymm2, ymm2, ymm2
	vfmadd213ps ymm1, ymm1, ymm2



	vcmpgeps ymm1, ymm14 ; compare against the radius, which happens to be equal to the offset
	vandnps ymm1, ymm14 ; make positives 1 and negatives 0


	vhaddps ymm1, ymm1, ymm1
	vhaddps ymm1, ymm1, ymm1


	tabulate:
	vmovaps [result], ymm1 ; move the result into the result array

	;; now we need to add [result+0] and [result+16]
	cvttss2si eax, [result]
	cvttss2si ebx, [result+16]
	add eax, ebx
	add r12d, eax


	;; push r12
	;; push rax
	;; push r11
	;; push rbp

	;; mov rdi, loopfmt
	;; mov esi, r11d
	;; mov edx, r12d
	;; mov ecx, eax
	;; mov rax, 0

	;; call printf

	;; pop rbp
	;; pop r11
	;; pop rax
	;; pop r12

	dec r11

	cmp r11, 0
	mov eax, r15d
	jne loopStart

	printResults:

	;; store the total number of loop counts into eax
	mov eax, loopcount
	mov rdi, 8
	mul rdi
	mov edx, eax ; store the number of samples in rdx

	mov [result], eax
	mov [result+4], r12d

	;; divide and multiply by 4
	finit
	fild dword [result+4]
	fild dword [result]
	fdiv
	fadd ST0, ST0
	fadd ST0, ST0
	fstp dword [result]


	push rbp ; preserve rdp on the stack
	mov rdi, fmt ; load the format string into the first non-floating argument
	mov esi, r12d ; load the sum into the first argument
	;; number of samples already loaded into edx
	cvtss2sd xmm0, dword [result] ; load value into printf floating point parameter

	mov rax, 1 ; 1 floating point argument
	call printf ; call printf
	pop rbp ; get back the old rbp register

	mov rax, 0 ; set return value to 0 for success
	ret ; return