Last active
September 17, 2017 10:47
-
-
Save mratsim/ea8c3fe07aaeffd41a9bb7fbdf2a1d16 to your computer and use it in GitHub Desktop.
Call cuda from Nim (alternative version)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## Even easier, without the cu / cuh | |
| ## Note: VS Code properly syntax-highlights the emit part, yeah! | |
| import nimcuda/[cuda_runtime_api, driver_types, nimcuda] | |
| import sequtils, future | |
type
  GpuArray[T: SomeReal] = object
    ## Pairs a pointer obtained from `cudaMalloc` with its element count.
    data: ref[ptr T] ## Ref-wrapped pointer; `newGpuArray` attaches the finalizer to this ref.
    len: int ## Number of `T` elements (not bytes).
{.emit: """
// Element-wise square: d_out[i] = d_in[i] * d_in[i].
// The index is global across the grid (block offset + thread offset), so a
// launch with several blocks covers bpg * tpb distinct elements instead of
// every block rewriting the first tpb elements.
__global__ void square(float * d_out, float * d_in){
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  float f = d_in[idx];
  d_out[idx] = f * f;
}

// Host-side launcher: bpg blocks per grid, tpb threads per block.
// No bounds check is performed: the caller must make sure bpg * tpb does
// not exceed the number of elements in d_out and d_in.
void cuda_square(int bpg, int tpb, float * d_out, float * d_in){
  square<<<bpg,tpb>>>(d_out, d_in);
}
""".}
proc cuda_square(bpg, tpb: cint,
                 y, x: ptr cfloat) {.importc.}
  ## Squares the elements of `x` and stores the results in `y`.
  ##
  ## * `bpg`: blocks per grid
  ## * `tpb`: threads per block
proc cudaMalloc[T](size: int): ptr T {.noSideEffect.} =
  ## Requests room for `size` elements of `T` from the runtime
  ## `cudaMalloc` and returns the pointer it fills in.
  ## The status returned by the runtime call is passed to `check`.
  let byteCount = size * sizeof(T)
  # The runtime call writes the allocated address straight into `result`.
  let slot = cast[ptr pointer](addr result)
  check cudaMalloc(slot, byteCount)
proc deallocCuda[T](p: ref[ptr T]) {.noSideEffect.} =
  ## Releases the pointer wrapped by `p` with `cudaFree`.
  ## Does nothing when the wrapped pointer is nil.
  if p[].isNil:
    return
  check cudaFree(p[])
proc newGpuArray[T: SomeReal](len: int): GpuArray[T] {.noSideEffect.} =
  ## Builds a `GpuArray[T]` with room for `len` elements.
  ## The `data` ref is created with `deallocCuda` as its finalizer, then
  ## the wrapped pointer is set from `cudaMalloc[T]`.
  result.len = len
  new(result.data, deallocCuda)
  result.data[] = cudaMalloc[T](len)
proc cuda[T: SomeReal](s: seq[T]): GpuArray[T] {.noSideEffect.} =
  ## Allocates a `GpuArray[T]` with `s.len` elements and copies the
  ## contents of `s` into it (`cudaMemcpyHostToDevice`).
  ##
  ## An empty `s` yields an array of length 0 and no copy is issued.
  result = newGpuArray[T](s.len)
  if s.len == 0:
    # `s[0]` would raise an index error on an empty seq, and there is
    # nothing to transfer anyway.
    return
  let size = result.len * sizeof(T)
  check cudaMemCpy(result.data[],
                   unsafeAddr s[0],
                   size,
                   cudaMemcpyHostToDevice)
proc cpu[T: SomeReal](g: GpuArray[T]): seq[T] {.noSideEffect.} =
  ## Copies the `g.len` elements behind `g.data` into a new seq
  ## (`cudaMemcpyDeviceToHost`) and returns it.
  ##
  ## A zero-length `g` returns an empty seq without touching `g.data`.
  result = newSeq[T](g.len)
  if g.len == 0:
    # `result[0]` would raise an index error on an empty seq, and a
    # default-initialised GpuArray has a nil `data` ref that must not be
    # dereferenced.
    return
  let size = result.len * sizeof(T)
  check cudaMemCpy(addr result[0],
                   g.data[],
                   size,
                   cudaMemcpyDeviceToHost)
proc main() =
  ## Demo: squares the values 0..63 with the `cuda_square` launcher and
  ## prints the zeroed output buffer, the input and the result.
  const n = 64 # element count; also used as the threads-per-block value

  let a = newSeq[float32](n) # zero-filled, uploaded as the output buffer
  let b = toSeq(0 ..< n).map(x => x.float32)
  echo a
  echo b

  # Neither handle is reassigned, so both are `let`.
  let u = a.cuda
  let v = b.cuda

  # One block of `n` threads: one thread per element.
  cuda_square(1.cint, n.cint, u.data[], v.data[])
  check cudaDeviceSynchronize()

  let z = u.cpu
  echo z

main()
| ## Output: | |
| # @[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | |
| # @[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0] | |
| # @[0.0, 1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0, 100.0, 121.0, 144.0, 169.0, 196.0, 225.0, 256.0, 289.0, 324.0, 361.0, 400.0, 441.0, 484.0, 529.0, 576.0, 625.0, 676.0, 729.0, 784.0, 841.0, 900.0, 961.0, 1024.0, 1089.0, 1156.0, 1225.0, 1296.0, 1369.0, 1444.0, 1521.0, 1600.0, 1681.0, 1764.0, 1849.0, 1936.0, 2025.0, 2116.0, 2209.0, 2304.0, 2401.0, 2500.0, 2601.0, 2704.0, 2809.0, 2916.0, 3025.0, 3136.0, 3249.0, 3364.0, 3481.0, 3600.0, 3721.0, 3844.0, 3969.0] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment