; Monte Carlo Calculator with 0 memory roundtrips
%define loopcount 800000
global main
extern printf
extern srand
extern rand
;; ---------------------------------------------------------------------------
;; Initialized, writable data: printf format strings, float constants and a
;; small float buffer.
;; ---------------------------------------------------------------------------
segment .data
;; Only the item that immediately follows is guaranteed to be 16-byte aligned;
;; later items simply follow back to back.
align 16
;; NUL-terminated printf format for the final report line:
;; two integers (%d, %d) and one floating-point value (%f), then a newline (10).
fmt db '%d counts / %d samples for a value of pi=%f', 10, 0x00
;; NUL-terminated printf format taking three integers, then a newline.
;; NOTE(review): no reference to loopfmt appears in the code shown here.
loopfmt db 'Iterations remaining: %d (current sum: %d / %d)', 10, 0x00
;; Two tables of eight single-precision floats each.
;; NOTE(review): no reference to val1/val2 appears in the code shown here.
val1 dd 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
val2 dd 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0
;; Single-precision scale factor, numerically 2 / (2^31 - 1). The code
;; broadcasts it to a YMM register and multiplies the raw random integers by it.
invmax dd 9.3132257504915e-10
;; Single-precision 1.0. The code broadcasts it to a YMM register and uses it
;; both as the value subtracted after scaling and as the comparison radius.
offset dd 1.0
;; Eight zero-initialized single-precision slots (32 bytes) of writable storage.
result dd 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
segment .text

;-----------------------------------------------------------------------
; lcg_next: advance the LCG held in eax and drop the new value, converted
;           to single precision, into element 0 of xmm0.
;   state = (state * r8d + r9d) & r10d
; In:    eax = LCG state, r8d = multiplier, r9d = increment, r10d = mask
; Out:   eax = new state, xmm0[31:0] = (float)state, xmm0[127:32] unchanged
; Clobb: edx (high half of the 32x32 multiply), flags
; The VEX form is used on purpose: it keeps bits 127:32 of xmm0 like the
; legacy form does, but avoids SSE/AVX transition stalls.
;-----------------------------------------------------------------------
%macro lcg_next 0
        mul     r8d
        add     eax, r9d
        and     eax, r10d                   ; rax is now a non-negative 31-bit value
        vcvtsi2ss xmm0, xmm0, rax
%endmacro

;-----------------------------------------------------------------------
; gather4: leave the four most recent LCG outputs in xmm0, oldest in the
;          highest element. The first lcg_next overwrites element 0; each
;          following step shifts the lane up by one float and then fills
;          element 0, so any older contents are shifted out.
; Clobb: as lcg_next
;-----------------------------------------------------------------------
%macro gather4 0
        lcg_next
    %rep 3
        vpslldq xmm0, xmm0, 4               ; move every float up one slot
        lcg_next
    %endrep
%endmacro

;-----------------------------------------------------------------------
; random8 ymmN, xmmN: fill ymmN with eight consecutive LCG outputs as
;          floats. The first four end up in the upper 128 bits, the next
;          four in the lower 128 bits.
; %1 = destination ymm register, %2 = the xmm register aliasing its low half
; Clobb: xmm0/ymm0 and everything lcg_next clobbers
;-----------------------------------------------------------------------
%macro random8 2
        gather4
        vmovaps %2, xmm0                    ; park the first four values
        gather4
        vinsertf128 %1, ymm0, %2, 1         ; low = newest four, high = parked four
%endmacro

;-----------------------------------------------------------------------
; int main(void)
; Estimates pi by Monte Carlo sampling: draws `loopcount` batches of eight
; (x, y) points in [-1, 1] x [-1, 1], counts those with x^2 + y^2 < 1 and
; prints hits, samples and 4 * hits / samples through printf.
;
; ABI:   SysV AMD64. Requires AVX and FMA3.
; Out:   eax = 0
; Register roles (all caller-saved, so nothing has to be preserved):
;   eax   LCG state            r8d/r9d/r10d  LCG multiplier/increment/mask
;   r11   batches remaining    esi           running hit count
;   ymm14 broadcast 1.0        ymm15         broadcast invmax
;   ymm1  x values             ymm2          y values
; No call is made inside the loop, so the caller-saved registers stay live.
;-----------------------------------------------------------------------
main:
        vzeroall                            ; known-clean vector state (xmm0 starts at 0)
        xor     esi, esi                    ; hits = 0
        mov     r11, loopcount              ; batches remaining
        mov     eax, 42                     ; LCG seed
        vbroadcastss ymm15, [rel invmax]    ; scale: 2 / (2^31 - 1)
        vbroadcastss ymm14, [rel offset]    ; 1.0: shift and radius
        mov     r8d, 1103515245             ; LCG multiplier
        mov     r9d, 12345                  ; LCG increment
        mov     r10d, 2147483647            ; 2^31 - 1 mask (acts as the modulo)

loopStart:
        ;; eight raw random values for x, then eight for y
        random8 ymm1, xmm1
        random8 ymm2, xmm2

        ;; map raw values from [0, 2^31 - 1] to [-1, 1]: v = v * invmax - 1.0
        vfmsub132ps ymm1, ymm14, ymm15
        vfmsub132ps ymm2, ymm14, ymm15

        ;; squared distance to the origin: ymm1 = x * x + y * y
        vmulps  ymm2, ymm2, ymm2
        vfmadd213ps ymm1, ymm1, ymm2

        ;; all-ones where the point lies outside the unit circle (d2 >= 1.0),
        ;; then invert-and-mask so that points inside become 1.0, outside 0.0
        vcmpgeps ymm1, ymm1, ymm14
        vandnps ymm1, ymm1, ymm14

        ;; two horizontal adds leave each 128-bit half holding its own sum;
        ;; combine both halves in registers, without touching memory
        vhaddps ymm1, ymm1, ymm1
        vhaddps ymm1, ymm1, ymm1
        vextractf128 xmm3, ymm1, 1
        vaddss  xmm1, xmm1, xmm3            ; hits in this batch (0..8, exact in float)
        vcvttss2si ecx, xmm1
        add     esi, ecx

        dec     r11
        jnz     loopStart

        ;; report: esi already holds the hit count (printf argument 2)
        mov     edx, (loopcount) * 8        ; samples = batches * 8 (printf argument 3)

        ;; pi ~= 4 * hits / samples, computed in double precision
        vcvtsi2sd xmm0, xmm0, esi
        vcvtsi2sd xmm1, xmm1, edx
        vdivsd  xmm0, xmm0, xmm1
        vaddsd  xmm0, xmm0, xmm0            ; * 2
        vaddsd  xmm0, xmm0, xmm0            ; * 4

        vzeroupper                          ; clean upper halves before entering libc
        sub     rsp, 8                      ; entry rsp is 8 mod 16; realign for the call
        lea     rdi, [rel fmt]              ; printf argument 1: format string
        mov     eax, 1                      ; variadic call: one vector register in use
        call    printf
        add     rsp, 8

        xor     eax, eax                    ; return 0
        ret
