Created
July 13, 2015 21:45
-
-
Save RyanMarcus/88293e81d529b6a387c2 to your computer and use it in GitHub Desktop.
Monte Carlo Calculator with 0 memory roundtrips
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; Monte Carlo estimate of pi: assembler preamble and static data (NASM syntax, x86-64).

%define loopcount 800000        ; outer-loop iterations; main draws 8 (x, y) samples per iteration

global  main
extern  printf
extern  srand                   ; declared, but main below never calls it
extern  rand                    ; declared, but main below never calls it

segment .data
        align   16
        ; Final report: hit count, total sample count, estimate of pi.
fmt     db      '%d counts / %d samples for a value of pi=%f', 10, 0x00
        ; Progress format string; not referenced by main below.
loopfmt db      'Iterations remaining: %d (current sum: %d / %d)', 10, 0x00
        ; Sample vectors; not referenced by main below.
val1    dd      1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
val2    dd      10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0
        ; Scale factor applied to the 31-bit LCG output. The literal rounds to
        ; 2^-30 in single precision, so value * invmax lies in [0, 2].
invmax  dd      9.3132257504915e-10
        ; Subtracted after scaling (mapping samples to [-1, 1]); main also uses
        ; it as the circle radius in the inside/outside comparison.
offset  dd      1.0
        ; 32 bytes of scratch used by main (vector spill and x87 integer/float staging).
result  dd      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

segment .text
;-----------------------------------------------------------------------
; int main(void)
; Syntax: NASM (Intel operand order).  ABI: System V AMD64.
; Needs:  AVX and FMA3.
;
; Estimates pi by Monte Carlo.  Every loop iteration draws 8 (x, y)
; points in [-1, 1] x [-1, 1] from an inline linear congruential
; generator, counts the points with x^2 + y^2 < 1, and after
; `loopcount` iterations prints hits, samples and 4 * hits / samples
; through printf.
;
; Register roles inside the loop:
;   eax            LCG state (edx is clobbered by every `mul`)
;   r8d/r9d/r10d   LCG multiplier / increment / 2^31-1 mask
;   r11            iterations remaining
;   ecx            running hit count
;   ymm14 / ymm15  broadcast `offset` (1.0) / broadcast `invmax`
;   xmm0           128-bit lane currently being filled with randoms
;   ymm1 / ymm2    x values / y values
;
; Out:    eax = 0
; Clobb:  caller-saved state only: rax, rcx, rdx, rsi, rdi, r8-r11,
;         ymm0-ymm15, x87 stack, flags.  No callee-saved register
;         (rbx, rbp, r12-r15) is touched.
; Memory: writes the first 8 bytes of `result` as x87 staging space.
;-----------------------------------------------------------------------

; Advance the LCG held in eax and convert the new 31-bit state to a float
; in element 0 of xmm0.  Elements 1-3 of xmm0 are preserved.
%macro LCG_NEXT_FLOAT 0
        mul     r8d                     ; edx:eax = state * multiplier (only eax is kept)
        add     eax, r9d
        and     eax, r10d               ; reduce modulo 2^31, so the value is non-negative
        vcvtsi2ss xmm0, xmm0, eax       ; VEX form: no SSE/AVX transition stall
%endmacro

; Leave four fresh random floats in xmm0 (newest draw in element 0).
; Whatever xmm0 held before is shifted out or overwritten.
%macro FILL_LANE 0
        LCG_NEXT_FLOAT
    %rep 3
        vpslldq xmm0, xmm0, 4           ; slide earlier draws up by one element
        LCG_NEXT_FLOAT
    %endrep
%endmacro

; Fill ymm register %1 with eight fresh random floats; %2 must be the xmm
; alias of %1.  Uses xmm0 as the staging lane.
%macro RANDOM_YMM 2
        FILL_LANE
        vmovaps %2, xmm0                ; park draws 1-4
        FILL_LANE
        vinsertf128 %1, ymm0, %2, 1     ; low lane = draws 5-8, high lane = draws 1-4
%endmacro

main:
        sub     rsp, 8                  ; rsp is 8 mod 16 on entry; realign for the printf call

        vzeroall                        ; start from a known-zero vector state
        ; Loop-invariant constants are loaded once, outside the loop.
        vbroadcastss ymm15, dword [rel invmax]
        vbroadcastss ymm14, dword [rel offset]
        mov     r8d, 1103515245         ; LCG multiplier
        mov     r9d, 12345              ; LCG increment
        mov     r10d, 2147483647        ; LCG modulus mask (2^31 - 1)
        mov     r11, loopcount          ; iterations remaining
        xor     ecx, ecx                ; hit count = 0
        mov     eax, 42                 ; LCG seed

.sample:
        RANDOM_YMM ymm1, xmm1           ; eight raw x values in [0, 2^31]
        RANDOM_YMM ymm2, xmm2           ; eight raw y values in [0, 2^31]

        ;; Map the raw values to [-1, 1]: v = v * invmax - offset.
        vfmsub132ps ymm1, ymm14, ymm15  ; ymm1 = ymm1 * ymm15 - ymm14
        vfmsub132ps ymm2, ymm14, ymm15  ; ymm2 = ymm2 * ymm15 - ymm14

        ;; Squared distance to the origin: y = y*y, then x = x*x + y.
        vmulps  ymm2, ymm2, ymm2
        vfmadd213ps ymm1, ymm1, ymm2    ; ymm1 = ymm1 * ymm1 + ymm2

        ;; Turn each element into 1.0 (inside the unit circle) or 0.0 (outside).
        vcmpgeps ymm1, ymm1, ymm14      ; all-ones where distance^2 >= 1 (radius == offset == 1.0)
        vandnps ymm1, ymm1, ymm14       ; ~mask & 1.0  ->  1.0 only where distance^2 < 1

        ;; Horizontal sum: after two vhaddps every element of a 128-bit lane
        ;; holds that lane's total; add the two lane totals in registers.
        vhaddps ymm1, ymm1, ymm1
        vhaddps ymm1, ymm1, ymm1
        vextractf128 xmm2, ymm1, 1      ; high-lane total
        vaddss  xmm1, xmm1, xmm2        ; hits this iteration (0..8, exact in float)
        vcvttss2si edi, xmm1
        add     ecx, edi

        dec     r11
        jnz     .sample

        ;; ---- report ----
        mov     edx, (loopcount) * 8    ; total samples      -> printf argument 3
        mov     esi, ecx                ; total hits         -> printf argument 2

        ;; pi ~= 4 * hits / samples, computed on the x87 stack and rounded to
        ;; single precision through `result`.
        mov     [rel result], edx
        mov     [rel result + 4], esi
        finit
        fild    dword [rel result + 4]  ; st0 = hits
        fild    dword [rel result]      ; st0 = samples, st1 = hits
        fdivp   st1, st0                ; st0 = hits / samples
        fadd    st0, st0                ; * 2
        fadd    st0, st0                ; * 4
        fstp    dword [rel result]

        vcvtss2sd xmm0, xmm0, dword [rel result]   ; variadic floats are passed as double
        vzeroupper                      ; leave AVX state clean before calling into libc
        lea     rdi, [rel fmt]          ; printf argument 1 (position-independent)
        mov     eax, 1                  ; al = number of vector registers used by the call
        call    printf wrt ..plt

        add     rsp, 8
        xor     eax, eax                ; return 0
        ret
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment