Created
July 13, 2015 21:46
-
-
Save RyanMarcus/8809e1b775d3bdb5228a to your computer and use it in GitHub Desktop.
Monte Carlo Calculator with no vector crossings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%define loopcount 800000 | |
global main | |
extern printf | |
extern srand | |
extern rand | |
segment .data | |
string db 'Hello world!', 10, 0x00 | |
fmt db '%d counts / %d samples for a value of pi=%f', 10, 0x00 | |
loopfmt db 'Iterations remaining: %d (current sum: %d / %d)', 10, 0x00 | |
val1 dd 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0 | |
val2 dd 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0 | |
;; 1/2147483647 * 2, the inverse of rand max times two | |
invmax dd 9.3132257504915e-10 | |
offset dd 1.0 | |
align 32 | |
result dd 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 | |
segment .text | |
main: | |
;; push rbp | |
;; mov rdi, string | |
;; mov rax, 0 ; set this to 0 because we are passing 0 arguments to printf's ... | |
;; call printf | |
;; pop rbp | |
;; fld qword [val1] | |
;; fsqrt | |
;; fst qword [result] | |
;; push rbp | |
;; mov rdi, 42 | |
;; call srand | |
;; pop rbp | |
vzeroall ; zero all ymm* registers to make sure we are in a good state | |
mov r12d, 0 ; reset the sum counter | |
mov r11, loopcount ; loop counter | |
mov eax, 42 ; LCG seed | |
vbroadcastss ymm15, [invmax] ; load invmax into ymm15 | |
vbroadcastss ymm14, [offset] ; load offset into ymm14 | |
mov r8d, 1103515245 ; LCG multiplier | |
mov r9d, 12345 ; LCG increment | |
mov r10d, 2147483647 ; modulo | |
loopStart: | |
loadRandomIntoYmm1: | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm0, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm1, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm2, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm3, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm4, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm5, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm6, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm7, rax | |
vmovss [result], xmm0 | |
vmovss [result+4], xmm1 | |
vmovss [result+8], xmm2 | |
vmovss [result+12], xmm3 | |
vmovss [result+16], xmm4 | |
vmovss [result+20], xmm5 | |
vmovss [result+24], xmm6 | |
vmovss [result+28], xmm7 | |
vmovaps ymm1, [result] | |
loadRandomIntoYmm2: | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm0, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm1, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm2, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm3, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm4, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm5, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm6, rax | |
mul r8d | |
add eax, r9d | |
and eax, r10d | |
cvtsi2ss xmm7, rax | |
vmovss [result], xmm0 | |
vmovss [result+4], xmm1 | |
vmovss [result+8], xmm2 | |
vmovss [result+12], xmm3 | |
vmovss [result+16], xmm4 | |
vmovss [result+20], xmm5 | |
vmovss [result+24], xmm6 | |
vmovss [result+28], xmm7 | |
vmovaps ymm2, [result] | |
computeDistanceToOrigin: | |
;; at this point, ymm1 and ymm2 contain random x and y values. we need to compute the (squared) | |
;; distance to the origin | |
vfmsub132ps ymm1, ymm14, ymm15 ; ymm1 = ymm1 * ymm15 - ymm14 (random numbers between 0 and 1) | |
vfmsub132ps ymm2, ymm14, ymm15 ; ymm2 = ymm2 * ymm15 - ymm14 (random numbers between 0 and 1) | |
mov r15d, eax ; save the last random seed into r15 | |
;; y = y^2, then x = x * x + y | |
vmulps ymm2, ymm2, ymm2 | |
vfmadd213ps ymm1, ymm1, ymm2 | |
vcmpgeps ymm1, ymm14 ; compare against the radius, which happens to be equal to the offset | |
vandnps ymm1, ymm14 ; make positives 1 and negatives 0 | |
vhaddps ymm1, ymm1, ymm1 | |
vhaddps ymm1, ymm1, ymm1 | |
tabulate: | |
vmovaps [result], ymm1 ; move the result into the result array | |
;; now we need to add [result+0] and [result+16] | |
cvttss2si eax, [result] | |
cvttss2si ebx, [result+16] | |
add eax, ebx | |
add r12d, eax | |
;; push r12 | |
;; push rax | |
;; push r11 | |
;; push rbp | |
;; mov rdi, loopfmt | |
;; mov esi, r11d | |
;; mov edx, r12d | |
;; mov ecx, eax | |
;; mov rax, 0 | |
;; call printf | |
;; pop rbp | |
;; pop r11 | |
;; pop rax | |
;; pop r12 | |
dec r11 | |
cmp r11, 0 | |
mov eax, r15d | |
jne loopStart | |
printResults: | |
;; store the total number of loop counts into eax | |
mov eax, loopcount | |
mov rdi, 8 | |
mul rdi | |
mov edx, eax ; store the number of samples in rdx | |
mov [result], eax | |
mov [result+4], r12d | |
;; divide and multiply by 4 | |
finit | |
fild dword [result+4] | |
fild dword [result] | |
fdiv | |
fadd ST0, ST0 | |
fadd ST0, ST0 | |
fstp dword [result] | |
push rbp ; preserve rdp on the stack | |
mov rdi, fmt ; load the format string into the first non-floating argument | |
mov esi, r12d ; load the sum into the first argument | |
;; number of samples already loaded into edx | |
cvtss2sd xmm0, dword [result] ; load value into printf floating point parameter | |
mov rax, 1 ; 1 floating point argument | |
call printf ; call printf | |
pop rbp ; get back the old rbp register | |
mov rax, 0 ; set return value to 0 for success | |
ret ; return | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment