Skip to content

Instantly share code, notes, and snippets.

@RyanMarcus
Created July 13, 2015 21:45
Show Gist options
  • Save RyanMarcus/88293e81d529b6a387c2 to your computer and use it in GitHub Desktop.
Save RyanMarcus/88293e81d529b6a387c2 to your computer and use it in GitHub Desktop.
Monte Carlo Calculator with 0 memory roundtrips
; Number of passes through the sampling loop in `main`.
%define loopcount 800000

        global  main
        extern  printf
        extern  srand
        extern  rand

section .data
        align   16

; printf format for the final report: two integers followed by one double.
fmt:            db      '%d counts / %d samples for a value of pi=%f', 10, 0

; Progress-report format; not referenced by the code in this listing.
loopfmt:        db      'Iterations remaining: %d (current sum: %d / %d)', 10, 0

; Sample float vectors; not referenced by the code in this listing.
val1:           dd      1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0
val2:           dd      10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0

; Scale factor applied to each raw random integer (approximately 2^-30).
invmax:         dd      9.3132257504915e-10

; Subtracted after scaling; `main` also uses it as the radius to compare against.
offset:         dd      1.0

; 32 bytes (eight zeroed dwords) of scratch space written by `main`.
result:         times 8 dd 0.0
segment .text

; Linear congruential generator: state' = (state * MULTIPLIER + INCREMENT) & MASK.
; The mask keeps 31 bits, so the state is always a non-negative 32-bit integer.
%define LCG_MULTIPLIER   1103515245
%define LCG_INCREMENT    12345
%define LCG_MASK         0x7FFFFFFF
; One (x, y) sample per 32-bit lane of a ymm register.
%define SAMPLES_PER_ITER 8

; Advance the LCG state in eax and write it, converted to float, into lane 0
; of xmm0. Lanes 1-3 of xmm0 are kept; bits 128-255 of ymm0 are zeroed by the
; VEX encoding. Clobbers flags.
%macro LCG_NEXT_TO_LANE0 0
        imul    eax, eax, LCG_MULTIPLIER        ; low 32 bits only; edx is left alone
        add     eax, LCG_INCREMENT
        and     eax, LCG_MASK
        vcvtsi2ss xmm0, xmm0, eax
%endmacro

; Fill all four lanes of xmm0 with fresh random floats, using xmm0 as a shift
; register: each new value enters at lane 0 and older values move up one lane.
%macro LOAD_FOUR_RANDOMS 0
    %rep 3
        LCG_NEXT_TO_LANE0
        vpslldq xmm0, xmm0, 4                   ; shift lanes up by one float
    %endrep
        LCG_NEXT_TO_LANE0
%endmacro

; Fill a ymm register with eight random floats.
;   %1 = destination ymm register, %2 = the xmm alias of that same register.
; The first four values end up in the upper 128 bits, the last four in the lower.
%macro LOAD_EIGHT_RANDOMS 2
        LOAD_FOUR_RANDOMS
        vmovaps %2, xmm0                        ; park the first four values
        LOAD_FOUR_RANDOMS
        vinsertf128 %1, ymm0, %2, 1             ; low = newest four, high = parked four
%endmacro

;-----------------------------------------------------------------------
; int main(void)
; Monte Carlo estimate of pi: draws loopcount * 8 pseudo-random points,
; counts those with x^2 + y^2 < 1, and prints 4 * hits / samples.
;
; ABI:    System V AMD64, ELF64 (NASM syntax). Needs AVX, FMA3 and POPCNT.
; Out:    eax = 0
; Roles:  eax   = LCG state (seed 42)
;         rcx   = iterations remaining
;         esi   = running hit count (already printf's 2nd argument)
;         edx   = per-iteration scratch, then total sample count (3rd argument)
;         ymm0  = shift register used while generating randoms
;         ymm1  = x coordinates, ymm2 = y coordinates
;         ymm14 = broadcast of `offset`, ymm15 = broadcast of `invmax`
; Clobb:  rax, rcx, rdx, rsi, rdi, all ymm registers, x87 state, flags,
;         and the first 8 bytes of `result` (used as x87 scratch).
;         No callee-saved register is modified.
;-----------------------------------------------------------------------
main:
        push    rbp                             ; rsp % 16 == 8 on entry; one push realigns it for the call

        vzeroall                                ; start from a clean vector state
        vbroadcastss ymm15, [rel invmax]        ; loop-invariant: scale factor
        vbroadcastss ymm14, [rel offset]        ; loop-invariant: offset, doubles as the radius
        xor     esi, esi                        ; hits = 0
        mov     rcx, loopcount
        mov     eax, 42                         ; LCG seed

.sample_loop:
        LOAD_EIGHT_RANDOMS ymm1, xmm1           ; raw x values
        LOAD_EIGHT_RANDOMS ymm2, xmm2           ; raw y values

        ;; Scale the raw values: v = v * invmax - offset, which maps the
        ;; 31-bit integers onto roughly [-1, 1].
        vfmsub132ps ymm1, ymm14, ymm15          ; x = x * invmax - offset
        vfmsub132ps ymm2, ymm14, ymm15          ; y = y * invmax - offset

        ;; Squared distance to the origin: x*x + y*y.
        vmulps  ymm2, ymm2, ymm2                ; y = y^2
        vfmadd213ps ymm1, ymm1, ymm2            ; x = x*x + y^2

        ;; Count the lanes that fall inside the circle without touching memory:
        ;; the compare yields an all-ones lane per hit, vmovmskps collects the
        ;; lane sign bits, and popcnt counts them.
        vcmpltps ymm1, ymm1, ymm14              ; dist^2 < radius (radius == offset)
        vmovmskps edx, ymm1
        popcnt  edx, edx
        add     esi, edx                        ; hits += hits in this batch

        dec     rcx
        jnz     .sample_loop

        vzeroupper                              ; leave 256-bit state before x87/libc code

        ;; pi ~= 4 * hits / samples. Computed on the x87 stack and rounded to
        ;; single precision through `result`, then widened for printf.
        mov     edx, (loopcount) * SAMPLES_PER_ITER ; total samples (3rd printf argument)
        mov     [rel result], edx
        mov     [rel result + 4], esi
        finit
        fild    dword [rel result + 4]          ; st0 = hits
        fild    dword [rel result]              ; st0 = samples, st1 = hits
        fdivp   st1, st0                        ; st0 = hits / samples
        fadd    st0, st0                        ; * 2
        fadd    st0, st0                        ; * 4
        fstp    dword [rel result]

        lea     rdi, [rel fmt]                  ; 1st argument: format string
        ;; esi (hits) and edx (samples) are already in place.
        vcvtss2sd xmm0, xmm0, dword [rel result] ; 1st floating-point argument
        mov     eax, 1                          ; al = number of vector registers used (variadic call)
        call    printf wrt ..plt

        xor     eax, eax                        ; return 0
        pop     rbp
        ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment