RyanMarcus · July 13, 2015 21:45
diff --git a/gistfile1.asm b/gistfile1.asm
 %define loopcount 800000


 global  main                          
 extern  printf                        
 extern 	srand
 extern	 rand
 	                             
 	
 section .data
 	align 16

 	;; printf format strings; each ends with a newline (10) and a NUL byte.
 	;; fmt is passed to printf by main; loopfmt is not referenced by the
 	;; code visible in this file.
 	fmt:		db '%d counts / %d samples for a value of pi=%f', 10, 0
 	loopfmt:	db 'Iterations remaining: %d (current sum: %d / %d)', 10, 0

 	;; Two vectors of eight floats; not referenced by the code visible in this file.
 	val1:		dd  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0
 	val2:		dd 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0

 	;; Scalars that main broadcasts into ymm registers.
 	invmax:		dd 9.3132257504915e-10	; approximately 2 / (2^31 - 1)
 	offset:		dd 1.0

 	;; Eight dwords of scratch space: main stores a whole ymm register here
 	;; and later reuses the first dwords for the final division.
 	result:		times 8 dd 0.0

 	
 	
 segment	.text
 ;; ---------------------------------------------------------------------------
 ;; main -- Monte Carlo estimate of pi, eight samples per loop iteration
 ;;         (uses AVX, AVX2 and FMA instructions).
 ;;
 ;; Each iteration draws 16 values from a linear congruential generator,
 ;; turns them into eight (x, y) points with coordinates of roughly -1 .. 1,
 ;; and counts the points whose squared distance to the origin is not >= 1.0.
 ;; After loopcount iterations it prints hits, samples and 4 * hits / samples.
 ;;
 ;; Calling convention: System V AMD64 is assumed (printf receives its
 ;; arguments in rdi / esi / edx / xmm0, with al = number of vector arguments).
 ;;
 ;; Register roles:
 ;;   eax    LCG state, live across the whole loop
 ;;   r11    loop iterations remaining
 ;;   r10d   running count of points inside the unit circle
 ;;   ecx    scratch while tabulating the lane sums
 ;;   edx    scratch while tabulating; total sample count for printf afterwards
 ;;   ymm0   staging area: four converted randoms are shifted into its low lane
 ;;   ymm1   x coordinates, then squared distance, hit mask and lane sums
 ;;   ymm2   y coordinates
 ;;   ymm14  1.0 in every lane (coordinate offset and circle radius)
 ;;   ymm15  invmax in every lane
 ;;
 ;; Only caller-saved general-purpose and vector registers are modified, so
 ;; nothing has to be saved for the caller. Returns 0 in eax.
 ;; ---------------------------------------------------------------------------
 %define LCG_SEED	42
 %define LCG_MULTIPLIER	1103515245
 %define LCG_INCREMENT	12345
 %define LCG_MASK	2147483647		; keeps the state in 0 .. 2^31 - 1

 main:
 	vzeroall				; start from a clean vector register state

 	xor	r10d, r10d			; hits = 0
 	mov	r11, loopcount			; iterations remaining
 	mov	eax, LCG_SEED			; LCG state

 	;; loop-invariant constants, loaded once
 	vbroadcastss	ymm15, [rel invmax]
 	vbroadcastss	ymm14, [rel offset]

 loopStart:

 loadRandomIntoYmm1:
 	;; Draw four randoms. Each new value is written to element 0 of xmm0;
 	;; the shift then moves everything up by one dword to make room.
 	;; The VEX-encoded conversion also clears the upper lane of ymm0.
 	%rep 3
 	imul	eax, eax, LCG_MULTIPLIER	; low 32 bits of state * multiplier
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax
 	vpslldq	ymm0, ymm0, 4
 	%endrep
 	imul	eax, eax, LCG_MULTIPLIER
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax

 	vextractf128	xmm1, ymm0, 0		; xmm1 = the four values, upper lane of ymm1 cleared
 	vperm2f128	ymm1, ymm1, ymm1, 1	; move them into the upper lane of ymm1

 	;; Four more values. The first conversion overwrites element 0, which
 	;; has already been copied out above.
 	%rep 3
 	imul	eax, eax, LCG_MULTIPLIER
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax
 	vpslldq	ymm0, ymm0, 4
 	%endrep
 	imul	eax, eax, LCG_MULTIPLIER
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax

 	vperm2f128	ymm1, ymm1, ymm0, 18	; low lane <- ymm0 low lane, upper lane of ymm1 kept

 loadRandomIntoYmm2:
 	;; Same procedure for the y coordinates.
 	%rep 3
 	imul	eax, eax, LCG_MULTIPLIER
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax
 	vpslldq	ymm0, ymm0, 4
 	%endrep
 	imul	eax, eax, LCG_MULTIPLIER
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax

 	vextractf128	xmm2, ymm0, 0		; xmm2 = the four values, upper lane of ymm2 cleared
 	vperm2f128	ymm2, ymm2, ymm2, 1	; move them into the upper lane of ymm2

 	%rep 3
 	imul	eax, eax, LCG_MULTIPLIER
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax
 	vpslldq	ymm0, ymm0, 4
 	%endrep
 	imul	eax, eax, LCG_MULTIPLIER
 	add	eax, LCG_INCREMENT
 	and	eax, LCG_MASK
 	vcvtsi2ss	xmm0, xmm0, rax

 	vperm2f128	ymm2, ymm2, ymm0, 18	; low lane <- ymm0 low lane, upper lane of ymm2 kept

 computeDistanceToOrigin:
 	;; ymm1 / ymm2 hold eight raw x / y values in 0 .. 2^31 - 1 (as floats).
 	;; Scale by invmax and subtract 1.0: coordinates of roughly -1 .. 1.
 	vfmsub132ps	ymm1, ymm14, ymm15	; x = x * invmax - 1.0
 	vfmsub132ps	ymm2, ymm14, ymm15	; y = y * invmax - 1.0

 	vmulps		ymm2, ymm2, ymm2	; y = y * y
 	vfmadd213ps	ymm1, ymm1, ymm2	; x = x * x + y  (squared distance)

 	;; The radius equals the offset (1.0), so ymm14 serves as both.
 	vcmpgeps	ymm1, ymm1, ymm14	; all-ones where distance >= 1.0 (outside)
 	vandnps		ymm1, ymm1, ymm14	; 1.0 where inside, 0.0 where outside

 	vhaddps		ymm1, ymm1, ymm1
 	vhaddps		ymm1, ymm1, ymm1	; every element of a lane = hits in that lane

 tabulate:
 	vmovups	[rel result], ymm1		; spill both lane sums
 	vcvttss2si	ecx, dword [rel result]		; hits in the low lane
 	vcvttss2si	edx, dword [rel result + 16]	; hits in the upper lane
 	add	ecx, edx
 	add	r10d, ecx

 	dec	r11
 	jnz	loopStart

 printResults:
 	mov	edx, (loopcount) * 8		; total samples: 8 per iteration (printf arg 3)

 	mov	[rel result], edx
 	mov	[rel result + 4], r10d

 	;; pi estimate = 4 * hits / samples, computed on the x87 stack
 	finit
 	fild	dword [rel result + 4]		; st0 = hits
 	fild	dword [rel result]		; st0 = samples, st1 = hits
 	fdivp	st1, st0			; st0 = hits / samples
 	fadd	st0, st0
 	fadd	st0, st0			; doubled twice = times 4
 	fstp	dword [rel result]

 	vzeroupper				; leave the dirty-upper AVX state before calling into libc

 	push	rbp				; keeps rsp 16-byte aligned at the call
 	lea	rdi, [rel fmt]			; printf arg 1: format string
 	mov	esi, r10d			; printf arg 2: hits
 	;; edx already holds the number of samples (printf arg 3)
 	cvtss2sd	xmm0, dword [rel result]	; printf vector arg: pi estimate as double

 	mov	eax, 1				; al = 1 vector register argument
 	call	printf
 	pop	rbp

 	xor	eax, eax			; return 0 for success
 	ret
	%define loopcount 800000


	global main
	extern printf
	extern srand
	extern rand


	section .data
	align 16

	;; printf format strings; each ends with a newline (10) and a NUL byte.
	;; fmt is passed to printf by main; loopfmt is not referenced by the
	;; code visible in this file.
	fmt:		db '%d counts / %d samples for a value of pi=%f', 10, 0
	loopfmt:	db 'Iterations remaining: %d (current sum: %d / %d)', 10, 0

	;; Two vectors of eight floats; not referenced by the code visible in this file.
	val1:		dd  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0
	val2:		dd 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0

	;; Scalars that main broadcasts into ymm registers.
	invmax:		dd 9.3132257504915e-10	; approximately 2 / (2^31 - 1)
	offset:		dd 1.0

	;; Eight dwords of scratch space: main stores a whole ymm register here
	;; and later reuses the first dwords for the final division.
	result:		times 8 dd 0.0



	segment .text
	;; ---------------------------------------------------------------------------
	;; main -- Monte Carlo estimate of pi, eight samples per loop iteration
	;;         (uses AVX, AVX2 and FMA instructions).
	;;
	;; Each iteration draws 16 values from a linear congruential generator,
	;; turns them into eight (x, y) points with coordinates of roughly -1 .. 1,
	;; and counts the points whose squared distance to the origin is not >= 1.0.
	;; After loopcount iterations it prints hits, samples and 4 * hits / samples.
	;;
	;; Calling convention: System V AMD64 is assumed (printf receives its
	;; arguments in rdi / esi / edx / xmm0, with al = number of vector arguments).
	;;
	;; Register roles:
	;;   eax    LCG state, live across the whole loop
	;;   r11    loop iterations remaining
	;;   r10d   running count of points inside the unit circle
	;;   ecx    scratch while tabulating the lane sums
	;;   edx    scratch while tabulating; total sample count for printf afterwards
	;;   ymm0   staging area: four converted randoms are shifted into its low lane
	;;   ymm1   x coordinates, then squared distance, hit mask and lane sums
	;;   ymm2   y coordinates
	;;   ymm14  1.0 in every lane (coordinate offset and circle radius)
	;;   ymm15  invmax in every lane
	;;
	;; Only caller-saved general-purpose and vector registers are modified, so
	;; nothing has to be saved for the caller. Returns 0 in eax.
	;; ---------------------------------------------------------------------------
	%define LCG_SEED	42
	%define LCG_MULTIPLIER	1103515245
	%define LCG_INCREMENT	12345
	%define LCG_MASK	2147483647		; keeps the state in 0 .. 2^31 - 1

	main:
		vzeroall				; start from a clean vector register state

		xor	r10d, r10d			; hits = 0
		mov	r11, loopcount			; iterations remaining
		mov	eax, LCG_SEED			; LCG state

		;; loop-invariant constants, loaded once
		vbroadcastss	ymm15, [rel invmax]
		vbroadcastss	ymm14, [rel offset]

	loopStart:

	loadRandomIntoYmm1:
		;; Draw four randoms. Each new value is written to element 0 of xmm0;
		;; the shift then moves everything up by one dword to make room.
		;; The VEX-encoded conversion also clears the upper lane of ymm0.
		%rep 3
		imul	eax, eax, LCG_MULTIPLIER	; low 32 bits of state * multiplier
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax
		vpslldq	ymm0, ymm0, 4
		%endrep
		imul	eax, eax, LCG_MULTIPLIER
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax

		vextractf128	xmm1, ymm0, 0		; xmm1 = the four values, upper lane of ymm1 cleared
		vperm2f128	ymm1, ymm1, ymm1, 1	; move them into the upper lane of ymm1

		;; Four more values. The first conversion overwrites element 0, which
		;; has already been copied out above.
		%rep 3
		imul	eax, eax, LCG_MULTIPLIER
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax
		vpslldq	ymm0, ymm0, 4
		%endrep
		imul	eax, eax, LCG_MULTIPLIER
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax

		vperm2f128	ymm1, ymm1, ymm0, 18	; low lane <- ymm0 low lane, upper lane of ymm1 kept

	loadRandomIntoYmm2:
		;; Same procedure for the y coordinates.
		%rep 3
		imul	eax, eax, LCG_MULTIPLIER
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax
		vpslldq	ymm0, ymm0, 4
		%endrep
		imul	eax, eax, LCG_MULTIPLIER
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax

		vextractf128	xmm2, ymm0, 0		; xmm2 = the four values, upper lane of ymm2 cleared
		vperm2f128	ymm2, ymm2, ymm2, 1	; move them into the upper lane of ymm2

		%rep 3
		imul	eax, eax, LCG_MULTIPLIER
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax
		vpslldq	ymm0, ymm0, 4
		%endrep
		imul	eax, eax, LCG_MULTIPLIER
		add	eax, LCG_INCREMENT
		and	eax, LCG_MASK
		vcvtsi2ss	xmm0, xmm0, rax

		vperm2f128	ymm2, ymm2, ymm0, 18	; low lane <- ymm0 low lane, upper lane of ymm2 kept

	computeDistanceToOrigin:
		;; ymm1 / ymm2 hold eight raw x / y values in 0 .. 2^31 - 1 (as floats).
		;; Scale by invmax and subtract 1.0: coordinates of roughly -1 .. 1.
		vfmsub132ps	ymm1, ymm14, ymm15	; x = x * invmax - 1.0
		vfmsub132ps	ymm2, ymm14, ymm15	; y = y * invmax - 1.0

		vmulps		ymm2, ymm2, ymm2	; y = y * y
		vfmadd213ps	ymm1, ymm1, ymm2	; x = x * x + y  (squared distance)

		;; The radius equals the offset (1.0), so ymm14 serves as both.
		vcmpgeps	ymm1, ymm1, ymm14	; all-ones where distance >= 1.0 (outside)
		vandnps		ymm1, ymm1, ymm14	; 1.0 where inside, 0.0 where outside

		vhaddps		ymm1, ymm1, ymm1
		vhaddps		ymm1, ymm1, ymm1	; every element of a lane = hits in that lane

	tabulate:
		vmovups	[rel result], ymm1		; spill both lane sums
		vcvttss2si	ecx, dword [rel result]		; hits in the low lane
		vcvttss2si	edx, dword [rel result + 16]	; hits in the upper lane
		add	ecx, edx
		add	r10d, ecx

		dec	r11
		jnz	loopStart

	printResults:
		mov	edx, (loopcount) * 8		; total samples: 8 per iteration (printf arg 3)

		mov	[rel result], edx
		mov	[rel result + 4], r10d

		;; pi estimate = 4 * hits / samples, computed on the x87 stack
		finit
		fild	dword [rel result + 4]		; st0 = hits
		fild	dword [rel result]		; st0 = samples, st1 = hits
		fdivp	st1, st0			; st0 = hits / samples
		fadd	st0, st0
		fadd	st0, st0			; doubled twice = times 4
		fstp	dword [rel result]

		vzeroupper				; leave the dirty-upper AVX state before calling into libc

		push	rbp				; keeps rsp 16-byte aligned at the call
		lea	rdi, [rel fmt]			; printf arg 1: format string
		mov	esi, r10d			; printf arg 2: hits
		;; edx already holds the number of samples (printf arg 3)
		cvtss2sd	xmm0, dword [rel result]	; printf vector arg: pi estimate as double

		mov	eax, 1				; al = 1 vector register argument
		call	printf
		pop	rbp

		xor	eax, eax			; return 0 for success
		ret