Skip to content

Instantly share code, notes, and snippets.

@RyanMarcus
Created July 13, 2015 21:46
Show Gist options
  • Save RyanMarcus/8809e1b775d3bdb5228a to your computer and use it in GitHub Desktop.
Save RyanMarcus/8809e1b775d3bdb5228a to your computer and use it in GitHub Desktop.
Monte Carlo Calculator with no vector crossings
%define loopcount 800000
global main
extern printf
extern srand
extern rand
segment .data
string db 'Hello world!', 10, 0x00
fmt db '%d counts / %d samples for a value of pi=%f', 10, 0x00
loopfmt db 'Iterations remaining: %d (current sum: %d / %d)', 10, 0x00
val1 dd 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
val2 dd 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0
;; 1/2147483647 * 2, the inverse of rand max times two
invmax dd 9.3132257504915e-10
offset dd 1.0
align 32
result dd 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
segment .text
main:
;; push rbp
;; mov rdi, string
;; mov rax, 0 ; set this to 0 because we are passing 0 arguments to printf's ...
;; call printf
;; pop rbp
;; fld qword [val1]
;; fsqrt
;; fst qword [result]
;; push rbp
;; mov rdi, 42
;; call srand
;; pop rbp
vzeroall ; zero all ymm* registers to make sure we are in a good state
mov r12d, 0 ; reset the sum counter
mov r11, loopcount ; loop counter
mov eax, 42 ; LCG seed
vbroadcastss ymm15, [invmax] ; load invmax into ymm15
vbroadcastss ymm14, [offset] ; load offset into ymm14
mov r8d, 1103515245 ; LCG multiplier
mov r9d, 12345 ; LCG increment
mov r10d, 2147483647 ; modulo
loopStart:
loadRandomIntoYmm1:
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm0, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm1, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm2, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm3, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm4, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm5, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm6, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm7, rax
vmovss [result], xmm0
vmovss [result+4], xmm1
vmovss [result+8], xmm2
vmovss [result+12], xmm3
vmovss [result+16], xmm4
vmovss [result+20], xmm5
vmovss [result+24], xmm6
vmovss [result+28], xmm7
vmovaps ymm1, [result]
loadRandomIntoYmm2:
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm0, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm1, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm2, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm3, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm4, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm5, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm6, rax
mul r8d
add eax, r9d
and eax, r10d
cvtsi2ss xmm7, rax
vmovss [result], xmm0
vmovss [result+4], xmm1
vmovss [result+8], xmm2
vmovss [result+12], xmm3
vmovss [result+16], xmm4
vmovss [result+20], xmm5
vmovss [result+24], xmm6
vmovss [result+28], xmm7
vmovaps ymm2, [result]
computeDistanceToOrigin:
;; at this point, ymm1 and ymm2 contain random x and y values. we need to compute the (squared)
;; distance to the origin
vfmsub132ps ymm1, ymm14, ymm15 ; ymm1 = ymm1 * ymm15 - ymm14 (random numbers between 0 and 1)
vfmsub132ps ymm2, ymm14, ymm15 ; ymm2 = ymm2 * ymm15 - ymm14 (random numbers between 0 and 1)
mov r15d, eax ; save the last random seed into r15
;; y = y^2, then x = x * x + y
vmulps ymm2, ymm2, ymm2
vfmadd213ps ymm1, ymm1, ymm2
vcmpgeps ymm1, ymm14 ; compare against the radius, which happens to be equal to the offset
vandnps ymm1, ymm14 ; make positives 1 and negatives 0
vhaddps ymm1, ymm1, ymm1
vhaddps ymm1, ymm1, ymm1
tabulate:
vmovaps [result], ymm1 ; move the result into the result array
;; now we need to add [result+0] and [result+16]
cvttss2si eax, [result]
cvttss2si ebx, [result+16]
add eax, ebx
add r12d, eax
;; push r12
;; push rax
;; push r11
;; push rbp
;; mov rdi, loopfmt
;; mov esi, r11d
;; mov edx, r12d
;; mov ecx, eax
;; mov rax, 0
;; call printf
;; pop rbp
;; pop r11
;; pop rax
;; pop r12
dec r11
cmp r11, 0
mov eax, r15d
jne loopStart
printResults:
;; store the total number of loop counts into eax
mov eax, loopcount
mov rdi, 8
mul rdi
mov edx, eax ; store the number of samples in rdx
mov [result], eax
mov [result+4], r12d
;; divide and multiply by 4
finit
fild dword [result+4]
fild dword [result]
fdiv
fadd ST0, ST0
fadd ST0, ST0
fstp dword [result]
push rbp ; preserve rdp on the stack
mov rdi, fmt ; load the format string into the first non-floating argument
mov esi, r12d ; load the sum into the first argument
;; number of samples already loaded into edx
cvtss2sd xmm0, dword [result] ; load value into printf floating point parameter
mov rax, 1 ; 1 floating point argument
call printf ; call printf
pop rbp ; get back the old rbp register
mov rax, 0 ; set return value to 0 for success
ret ; return
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment