Created
December 15, 2022 15:51
-
-
Save mratsim/8929b7f9c414e39c34a365a7552c252c to your computer and use it in GitHub Desktop.
JIT compile Cuda kernel in Nim
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Constantine | |
# Copyright (c) 2018-2019 Status Research & Development GmbH | |
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy | |
# Licensed and distributed under either of | |
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). | |
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). | |
# at your option. This file may not be copied, modified, or distributed except according to those terms. | |
{.passc: gorge("llvm-config --cflags").} | |
{.passl: gorge("llvm-config --libs").} | |
# ############################################################ | |
# | |
# Bindings to LLVM JIT | |
# | |
# ############################################################ | |
# https://llvm.org/doxygen/group__LLVMC.html | |
# Constantine is a library. It is possible that applications relying on Constantine | |
# also link to libLLVM, for example if they implement a virtual machine (for the EVM, for Snarks/zero-knowledge, ...). | |
# Hence Constantine should always use LLVM context to "namespace" its own codegen and avoid collisions in the global context. | |
# ############################################################ | |
# | |
# LLVM | |
# | |
# ############################################################ | |
# TODO: by using the dynlib pragma (https://nim-lang.org/docs/manual.html#foreign-function-interface-dynlib-pragma-for-import) | |
# we wouldn't need the LLVM dev packages, only the runtime, significantly easing installation and install size. | |
# However there wouldn't be headers, openArray arguments would need a wrapper. | |
# Opaque handles of the LLVM-C API.
# Each handle is a `distinct pointer` so that handles of different kinds
# cannot be mixed up on the Nim side, while the generated C code uses the
# real LLVM typedef pulled from the listed header.
type
  LlvmBool* = distinct int32
  MemoryBufferRef* {.importc: "LLVMMemoryBufferRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ContextRef* {.importc: "LLVMContextRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ModuleRef* {.importc: "LLVMModuleRef", header: "<llvm-c/Core.h>".} = distinct pointer
  # LLVMTargetRef is declared in TargetMachine.h, not in Types.h
  TargetRef* {.importc: "LLVMTargetRef", header: "<llvm-c/TargetMachine.h>".} = distinct pointer
  ExecutionEngineRef* {.importc: "LLVMExecutionEngineRef", header: "<llvm-c/ExecutionEngine.h>".} = distinct pointer
  TypeRef* {.importc: "LLVMTypeRef", header: "<llvm-c/Core.h>".} = distinct pointer
  ValueRef* {.importc: "LLVMValueRef", header: "<llvm-c/Core.h>".} = distinct pointer
  NamedMDNodeRef* {.importc: "LLVMNamedMDNodeRef", header: "<llvm-c/Types.h>".} = distinct pointer
  MetadataRef* {.importc: "LLVMMetadataRef", header: "<llvm-c/Types.h>".} = distinct pointer
{.push header: "<llvm-c/Core.h>".}

# Context
proc createContext*(): ContextRef {.importc: "LLVMContextCreate".}
proc dispose*(ctx: ContextRef) {.importc: "LLVMContextDispose".}

# Messages
proc dispose*(msg: cstring) {.importc: "LLVMDisposeMessage".}
  ## Strings handed out by LLVM stay owned by LLVM:
  ## release them through this proc, never through Nim's allocator.

# Memory buffers
proc dispose*(buf: MemoryBufferRef) {.importc: "LLVMDisposeMemoryBuffer".}
proc getBufferStart*(buf: MemoryBufferRef): ptr byte {.importc: "LLVMGetBufferStart".}
proc getBufferSize*(buf: MemoryBufferRef): csize_t {.importc: "LLVMGetBufferSize".}

{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ############################################################ | |
# | |
# Module | |
# | |
# ############################################################ | |
{.push header: "<llvm-c/Core.h>".}

proc llvmCreateModule(
       name: cstring,
       ctx: ContextRef): ModuleRef {.importc: "LLVMModuleCreateWithNameInContext".}
  ## Raw binding: the C function takes the module name first, the context second.

template createModule*(ctx: ContextRef, name: cstring): ModuleRef =
  ## Context-first wrapper, so that call sites can read `ctx.createModule(name)`.
  llvmCreateModule(name, ctx)

proc dispose*(m: ModuleRef) {.importc: "LLVMDisposeModule".}
  ## Destroys a module.
  ## Note: disposing an Execution Engine also disposes the modules attached to it.

proc toIRString*(m: ModuleRef): cstring {.importc: "LLVMPrintModuleToString".}
  ## Renders the module as textual IR.
  ## The returned string must be released with the LLVM `dispose`, otherwise it leaks.

proc getContext*(m: ModuleRef): ContextRef {.importc: "LLVMGetModuleContext".}

# Metadata
proc getOrInsertNamedMetadata*(
       m: ModuleRef,
       name: openArray[char]): NamedMDNodeRef {.importc: "LLVMGetOrInsertNamedMetadata".}
proc addNamedMetadataOperand*(
       m: ModuleRef,
       name: cstring,
       val: ValueRef) {.importc: "LLVMAddNamedMetadataOperand".}
proc metadataNode*(
       ctx: ContextRef,
       metadataNodes: openArray[MetadataRef]): MetadataRef {.importc: "LLVMMDNodeInContext2".}
proc metadataNode*(
       ctx: ContextRef,
       str: openArray[char]): MetadataRef {.importc: "LLVMMDStringInContext2".}
proc asMetadataRef*(val: ValueRef): MetadataRef {.importc: "LLVMValueAsMetadata".}
proc asValueRef*(ctx: ContextRef, md: MetadataRef): ValueRef {.importc: "LLVMMetadataAsValue".}

{.pop.} # {.push header: "<llvm-c/Core.h>".}
{.push header: "<llvm-c/BitWriter.h>".}

proc writeBitcodeToFile*(
       m: ModuleRef,
       path: cstring) {.importc: "LLVMWriteBitcodeToFile".}
  ## Serializes the module as bitcode into the file at `path`.

proc writeBitcodeToMemoryBuffer*(
       m: ModuleRef): MemoryBufferRef {.importc: "LLVMWriteBitcodeToMemoryBuffer".}
  ## Serializes the module as bitcode into a new memory buffer.
  ## The caller has to `dispose` the returned buffer, otherwise it leaks.

{.pop.} # {.push header: "<llvm-c/BitWriter.h>".}
proc toBitcode*(m: ModuleRef): seq[byte] =
  ## Serializes module `m` to LLVM bitcode and returns a copy of the bytes.
  ##
  ## The intermediate LLVM memory buffer is owned by this proc and is
  ## released on every exit path.
  let mb = m.writeBitcodeToMemoryBuffer()
  defer: mb.dispose()

  # getBufferSize returns a csize_t; convert once, explicitly.
  let len = int(mb.getBufferSize())
  result = newSeq[byte](len)
  # Taking the address of result[0] on an empty seq would raise,
  # so only copy when there is something to copy.
  if len > 0:
    copyMem(result[0].addr, mb.getBufferStart(), len)
type VerifierFailureAction* {.size: sizeof(cint).} = enum
  ## What the verifier does when it finds a problem.
  AbortProcessAction = 0 # verifier will print to stderr and abort()
  PrintMessageAction = 1 # verifier will print to stderr and return 1
  ReturnStatusAction = 2 # verifier will just return 1

{.push header: "<llvm-c/Analysis.h>".}

proc verify*(
       module: ModuleRef,
       failureAction: VerifierFailureAction,
       msg: var cstring): LlvmBool {.importc: "LLVMVerifyModule".}
  ## Verifies a whole module; `msg` receives the string written by LLVM.

proc verify*(
       fn: ValueRef,
       failureAction: VerifierFailureAction): LlvmBool {.importc: "LLVMVerifyFunction".}
  ## Verifies a single function.

{.pop.}
# ############################################################ | |
# | |
# Target | |
# | |
# ############################################################ | |
{.push header: "<llvm-c/Target.h>".}
proc initializeNativeTarget*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeTarget".}
  ## Initializes the target matching the host.
proc initializeNativeAsmPrinter*(): LlvmBool {.discardable, importc: "LLVMInitializeNativeAsmPrinter".}
  ## Initializes the assembly printer of the host target.
{.pop.}

# LLVMGetTargetFromName (and LLVMTargetRef) are declared in TargetMachine.h,
# Target.h does not provide them.
{.push header: "<llvm-c/TargetMachine.h>".}
proc getTargetFromName*(name: cstring): TargetRef {.importc: "LLVMGetTargetFromName".}
  ## Looks up a target by its name.
{.pop.}

{.push header: "<llvm-c/Core.h>".}
proc setTarget*(module: ModuleRef, triple: cstring) {.importc: "LLVMSetTarget".}
  ## Sets the target triple of `module`.
proc setDataLayout*(module: ModuleRef, layout: cstring) {.importc: "LLVMSetDataLayout".}
  ## Sets the data layout string of `module`.
{.pop.}
# ############################################################ | |
# | |
# Execution Engine | |
# | |
# ############################################################ | |
{.push header: "<llvm-c/ExecutionEngine.h>".}

proc linkInMCJIT*() {.importc: "LLVMLinkInMCJIT".}

proc createJITCompilerForModule*(
       engine: var ExecutionEngineRef,
       module: ModuleRef,
       optLevel: uint32,
       err: var cstring): LlvmBool {.importc: "LLVMCreateJITCompilerForModule".}
  ## Builds a JIT execution engine for `module` and stores it in `engine`.
  ## `err` receives the string written by LLVM.

proc remove*(
       engine: ExecutionEngineRef,
       m: ModuleRef,
       outMod: var ModuleRef,
       err: var cstring): LlvmBool {.importc: "LLVMRemoveModule".}
  ## Detaches module `m` from the execution engine.

proc dispose*(engine: ExecutionEngineRef) {.importc: "LLVMDisposeExecutionEngine".}
  ## Destroys an execution engine.
  ## Note: this also destroys the modules still attached to it.

proc getFunctionAddress*(
       engine: ExecutionEngineRef,
       name: cstring): distinct pointer {.importc: "LLVMGetFunctionAddress".}
  ## Address of the compiled function `name`, meant to be cast to a proc type.
  ## NOTE(review): the C prototype returns `uint64_t`, not a pointer;
  ## confirm the C compiler in use accepts the integer-to-pointer conversion.

{.pop.}
# ############################################################ | |
# | |
# Types | |
# | |
# ############################################################ | |
# https://llvm.org/doxygen/group__LLVMCCoreType.html | |
type
  TypeKind* {.size: sizeof(cint).} = enum
    ## Kind tag of an LLVM type, as reported by `LLVMGetTypeKind`.
    ## Values are spelled out; they are the same as the implicit numbering.
    tkVoid = 0,            ## type with no size
    tkHalf = 1,            ## 16 bit floating point type
    tkFloat = 2,           ## 32 bit floating point type
    tkDouble = 3,          ## 64 bit floating point type
    tkX86_FP80 = 4,        ## 80 bit floating point type (X87)
    tkFP128 = 5,           ## 128 bit floating point type (112-bit mantissa)
    tkPPC_FP128 = 6,       ## 128 bit floating point type (two 64-bits)
    tkLabel = 7,           ## Labels
    tkInteger = 8,         ## Arbitrary bit width integers
    tkFunction = 9,        ## Functions
    tkStruct = 10,         ## Structures
    tkArray = 11,          ## Arrays
    tkPointer = 12,        ## Pointers
    tkVector = 13,         ## Fixed width SIMD vector type
    tkMetadata = 14,       ## Metadata
    tkX86_MMX = 15,        ## X86 MMX
    tkToken = 16,          ## Tokens
    tkScalableVector = 17, ## Scalable SIMD vector type
    tkBFloat = 18,         ## 16 bit brain floating point type
    tkX86_AMX = 19         ## X86 AMX
{.push header: "<llvm-c/Core.h>".}

proc getTypeKind*(ty: TypeRef): TypeKind {.importc: "LLVMGetTypeKind".}
proc void_t*(ctx: ContextRef): TypeRef {.importc: "LLVMVoidTypeInContext".}

# Integers
# ------------------------------------------------------------

proc int1_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt1TypeInContext".}
proc int8_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt8TypeInContext".}
proc int16_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt16TypeInContext".}
proc int32_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt32TypeInContext".}
proc int64_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt64TypeInContext".}
proc int128_t*(ctx: ContextRef): TypeRef {.importc: "LLVMInt128TypeInContext".}
proc int_t*(ctx: ContextRef, numBits: uint32): TypeRef {.importc: "LLVMIntTypeInContext".}
  ## Integer type of an arbitrary bit width.

# Composite
# ------------------------------------------------------------

# For the openArray parameters below, the length is implicitly converted
# to uint32 on the C side, which is why the header is required.
proc struct_t*(
       ctx: ContextRef,
       elemTypes: openArray[TypeRef],
       packed: LlvmBool): TypeRef {.importc: "LLVMStructTypeInContext".}
proc array_t*(
       elemType: TypeRef,
       elemCount: uint32): TypeRef {.importc: "LLVMArrayType".}
proc pointerType(
       elementType: TypeRef;
       addressSpace: cuint): TypeRef {.importc: "LLVMPointerType".}

# Functions
# ------------------------------------------------------------

proc function_t*(
       returnType: TypeRef,
       paramTypes: openArray[TypeRef],
       isVarArg: LlvmBool): TypeRef {.importc: "LLVMFunctionType".}
proc addFunction*(
       m: ModuleRef,
       name: cstring,
       ty: TypeRef): ValueRef {.importc: "LLVMAddFunction".}
  ## Declares a function `name` in module `m`.
  ## The returned handle is what instructions get attached to.
proc printTypeToString(ty: TypeRef): cstring {.importc: "LLVMPrintTypeToString".}

# TODO: Function and Parameter attributes:
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#function-attributes
# - https://www.llvm.org/docs/LangRef.html?highlight=attribute#parameter-attributes
#
# We can use attributes to specify additional guarantees of Constantine code, for instance:
# - "pure" function with: nounwind, readonly
# - pointer particularities: readonly, writeonly, noalias, inalloca, byval

proc getReturnType*(functionTy: TypeRef): TypeRef {.importc: "LLVMGetReturnType".}

{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ------------------------------ | |
proc `$`*(ty: TypeRef): string =
  ## Textual form of an LLVM type, copied into a Nim string.
  let llvmOwned = printTypeToString(ty)
  result = $llvmOwned
  # The C string belongs to LLVM; release it once it has been copied.
  dispose(llvmOwned)
proc isVoid*(ty: TypeRef): bool {.inline.} =
  ## True when the kind of `ty` is `tkVoid`.
  result = getTypeKind(ty) == tkVoid
proc pointer_t*(elementTy: TypeRef): TypeRef {.inline.} =
  ## Pointer to `elementTy` in address space 0.
  result = elementTy.pointerType(0)
# ############################################################ | |
# | |
# Values | |
# | |
# ############################################################ | |
{.push header: "<llvm-c/Core.h>".}

proc getTypeOf*(x: ValueRef): TypeRef {.importc: "LLVMTypeOf".}

# Constants
# ------------------------------------------------------------
# https://llvm.org/doxygen/group__LLVMCCoreValueConstant.html

proc constInt*(
       ty: TypeRef,
       n: culonglong,
       signExtend: LlvmBool): ValueRef {.importc: "LLVMConstInt".}
proc constReal*(
       ty: TypeRef,
       n: cdouble): ValueRef {.importc: "LLVMConstReal".}
proc constNull*(ty: TypeRef): ValueRef {.importc: "LLVMConstNull".}
proc constAllOnes*(ty: TypeRef): ValueRef {.importc: "LLVMConstAllOnes".}

# For the openArray parameters below, the length is implicitly converted
# to uint32 on the C side, which is why the header is required.
proc constStruct*(
       vals: openArray[ValueRef],
       packed: LlvmBool): ValueRef {.importc: "LLVMConstStruct".}
proc constArray*(
       ty: TypeRef,
       constantVals: openArray[ValueRef]
     ): ValueRef {.importc: "LLVMConstArray".}

{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ############################################################ | |
# | |
# IR builder | |
# | |
# ############################################################ | |
# https://llvm.org/doxygen/group__LLVMCCoreInstructionBuilder.html | |
type
  BasicBlockRef* {.importc: "LLVMBasicBlockRef", header: "<llvm-c/Core.h>".} = distinct pointer
  BuilderRef* {.importc: "LLVMBuilderRef", header: "<llvm-c/Core.h>".} = distinct pointer
    ## An instruction builder represents a point within a basic block and is
    ## the exclusive means of building instructions using the C interface.

  IntPredicate* {.size: sizeof(cint).} = enum
    ## Comparison used by `icmp`. Numbering starts at 32;
    ## every value is spelled out and equals the implicit numbering.
    IntEQ = 32,  ## equal
    IntNE = 33,  ## not equal
    IntUGT = 34, ## unsigned greater than
    IntUGE = 35, ## unsigned greater or equal
    IntULT = 36, ## unsigned less than
    IntULE = 37, ## unsigned less or equal
    IntSGT = 38, ## signed greater than
    IntSGE = 39, ## signed greater or equal
    IntSLT = 40, ## signed less than
    IntSLE = 41  ## signed less or equal
{.push header: "<llvm-c/Core.h>".}

# Instantiation
# ------------------------------------------------------------

proc appendBasicBlock*(
       ctx: ContextRef,
       fn: ValueRef,
       name: cstring): BasicBlockRef {.importc: "LLVMAppendBasicBlockInContext".}
  ## Appends a basic block at the end of function `fn`.

proc createBuilder*(ctx: ContextRef): BuilderRef {.importc: "LLVMCreateBuilderInContext".}
proc dispose*(builder: BuilderRef) {.importc: "LLVMDisposeBuilder".}

# Functions
# ------------------------------------------------------------

proc getParam*(fn: ValueRef, index: uint32): ValueRef {.importc: "LLVMGetParam".}
proc retVoid*(builder: BuilderRef): ValueRef {.importc: "LLVMBuildRetVoid".}
proc ret*(builder: BuilderRef, returnVal: ValueRef) {.importc: "LLVMBuildRet".}

# Positioning
# ------------------------------------------------------------

proc position*(
       builder: BuilderRef,
       blck: BasicBlockRef,
       instr: ValueRef) {.importc: "LLVMPositionBuilder".}
proc positionBefore*(
       builder: BuilderRef,
       instr: ValueRef) {.importc: "LLVMPositionBuilderBefore".}
proc positionAtEnd*(
       builder: BuilderRef,
       blck: BasicBlockRef) {.importc: "LLVMPositionBuilderAtEnd".}
# Intermediate Representation | |
# ------------------------------------------------------------ | |
# | |
# - NSW: no signed wrap, the result is a poison value if signed overflow occurs.
# - NUW: no unsigned wrap, the result is a poison value if unsigned overflow occurs.
# Binary and unary integer instructions.
# Every proc appends one instruction at the builder's current position and
# returns the resulting value; `name` is the name given to that value in the IR.

# Addition / subtraction
proc add*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAdd".}
proc addNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWAdd".}
proc addNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWAdd".}
proc sub*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSub".}
proc subNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWSub".}
proc subNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWSub".}

# Negation is unary: the C functions take (builder, value, name).
# Declaring two operands here would make every call fail to compile in C.
proc neg*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNeg".}
proc negNSW*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWNeg".}
proc negNUW*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWNeg".}

# Multiplication / division / remainder
proc mul*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildMul".}
proc mulNSW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNSWMul".}
proc mulNUW*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNUWMul".}
proc divU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildUDiv".}
proc divU_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildExactUDiv".}
proc divS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSDiv".}
proc divS_exact*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildExactSDiv".}
proc remU*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildURem".}
proc remS*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSRem".}

# Shifts
proc lshl*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildShl".}
proc lshr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildLShr".}
proc ashr*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAShr".}

# Bitwise logic
proc `and`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildAnd".}
proc `or`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildOr".}
proc `xor`*(builder: BuilderRef, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildXor".}
proc `not`*(builder: BuilderRef, val: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildNot".}
proc select*(builder: BuilderRef, condition, then, otherwise: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildSelect".}
  ## Builds a `select`: yields `then` when `condition` is true, `otherwise` if not.
  ## (Previously bound to LLVMBuildNot, which takes a single operand.)
proc icmp*(builder: BuilderRef, op: IntPredicate, lhs, rhs: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildICmp".}
  ## Builds an integer comparison of `lhs` and `rhs` with predicate `op`.

# Casts.
# The C functions return the converted value, so it is exposed here;
# `discardable` keeps call sites that used these procs as statements compiling.
proc bitcast*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.discardable, importc: "LLVMBuildBitCast".}
  ## Reinterprets `val` as `destTy`. The C symbol is spelled `BitCast`.
proc trunc*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.discardable, importc: "LLVMBuildTrunc".}
  ## Truncates `val` to `destTy`.
proc zext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.discardable, importc: "LLVMBuildZExt".}
  ## Zero-extend
proc sext*(builder: BuilderRef, val: ValueRef, destTy: TypeRef, name: cstring): ValueRef {.discardable, importc: "LLVMBuildSExt".}
  ## Sign-extend
# Memory allocation instructions.
# The C functions all end with a `Name` argument (except LLVMBuildFree);
# it is exposed with an empty default so existing Nim call sites keep working.
proc malloc*(builder: BuilderRef, ty: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildMalloc".}
  ## Heap-allocates one value of type `ty`.
proc mallocArray*(builder: BuilderRef, ty: TypeRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildArrayMalloc".}
  ## Heap-allocates `val` values of type `ty`. The C symbol is `LLVMBuildArrayMalloc`.
proc free*(builder: BuilderRef, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildFree".}
  ## Frees `ptr`. LLVMBuildFree takes only the builder and the pointer value.
proc alloca*(builder: BuilderRef, ty: TypeRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildAlloca".}
  ## Stack-allocates one value of type `ty`.
proc allocaArray*(builder: BuilderRef, ty: TypeRef, val: ValueRef, name: cstring = ""): ValueRef {.importc: "LLVMBuildArrayAlloca".}
  ## Stack-allocates `val` values of type `ty`. The C symbol is `LLVMBuildArrayAlloca`.
# GetElementPtr family, see https://www.llvm.org/docs/GetElementPtr.html
# For the openArray parameters, the length is implicitly converted to uint32
# on the C side, which is why the header is required.

proc getElementPtr2*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       indices: openArray[ValueRef],
       name: cstring
     ): ValueRef {.importc: "LLVMBuildGEP2".}
  ## Address computation from `ptr` through `indices`, without the inbounds keyword.

proc getElementPtr2_InBounds*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       indices: openArray[ValueRef],
       name: cstring
     ): ValueRef {.importc: "LLVMBuildInBoundsGEP2".}
  ## Same as `getElementPtr2`, with the inbounds keyword.
  ## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two’s complement integer computation.
  ## However, since there’s no guarantee of where an object will be allocated in the address space, such values have limited meaning.

proc getElementPtr2_Struct*(
       builder: BuilderRef,
       ty: TypeRef,
       `ptr`: ValueRef,
       idx: uint32,
       name: cstring
     ): ValueRef {.importc: "LLVMBuildStructGEP2".}
  ## Address of field number `idx` of the struct of type `ty` located at `ptr`.
  ## If the GEP lacks the inbounds keyword, the value is the result from evaluating the implied two’s complement integer computation.
  ## However, since there’s no guarantee of where an object will be allocated in the address space, such values have limited meaning.
proc load2*(builder: BuilderRef, ty: TypeRef, `ptr`: ValueRef, name: cstring): ValueRef {.importc: "LLVMBuildLoad2".}
  ## Loads a value of type `ty` from `ptr`.
proc store*(builder: BuilderRef, val, `ptr`: ValueRef): ValueRef {.importc: "LLVMBuildStore".}
  ## Stores `val` at `ptr`.

# Memory intrinsics.
# The C symbols are spelled MemSet / MemCpy / MemMove (capital second word)
# and return the built instruction; `discardable` keeps statement-style
# call sites compiling.
proc memset*(builder: BuilderRef, `ptr`, val, len: ValueRef, align: uint32): ValueRef {.discardable, importc: "LLVMBuildMemSet".}
proc memcpy*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef): ValueRef {.discardable, importc: "LLVMBuildMemCpy".}
proc memmove*(builder: BuilderRef, dst: ValueRef, dstAlign: uint32, src: ValueRef, srcAlign: uint32, size: ValueRef): ValueRef {.discardable, importc: "LLVMBuildMemMove".}
{.pop.} # {.push header: "<llvm-c/Core.h>".}
# ############################################################ | |
# | |
# Sanity Check | |
# | |
# ############################################################ | |
when isMainModule:
  # Smoke test: build `add(i32, i32) -> i32` in IR, JIT it for the host
  # and call it from Nim.
  echo "LLVM JIT compiler sanity check"

  let ctx = createContext()
  var module = createModule(ctx, "addition")
  let i32 = int32_t(ctx)

  # Function signature and declaration
  let addType = function_t(i32, [i32, i32], isVarArg = LlvmBool(false))
  let addBody = addFunction(module, "add", addType)

  # Single basic block holding the body
  let builder = createBuilder(ctx)
  let entry = appendBasicBlock(ctx, addBody, "addBody")
  positionAtEnd(builder, entry)

  block:
    let lhs = getParam(addBody, 0)
    let rhs = getParam(addBody, 1)
    let sum = builder.add(lhs, rhs, "sum")
    builder.ret(sum)

  block:
    var verifyMsg: cstring
    let verifyCode = verify(module, AbortProcessAction, verifyMsg)
    echo "Verification: code ", verifyCode.int, ", message \"", verifyMsg, "\""
    dispose(verifyMsg)

  var engine: ExecutionEngineRef
  block:
    let targetCode = initializeNativeTarget()
    echo "Target init: code ", targetCode.int
    # Why is this necessary? Otherwise we get "LLVM ERROR: Target does not support MC emission!"
    let asmPrinterCode = initializeNativeAsmPrinter()
    echo "ASM printer init: code ", asmPrinterCode.int

    var jitMsg: cstring
    let jitFailed = bool createJITCompilerForModule(engine, module, 0, jitMsg)
    if jitFailed:
      if jitMsg.len > 0:
        echo jitMsg
        echo "exiting ..."
      else:
        echo "JIT compiler: error without details ... exiting"
      quit 1

  let jitAdd = cast[proc(a, b: int32): int32 {.noconv.}](
    getFunctionAddress(engine, "add"))

  echo "jitAdd(1, 2) = ", jitAdd(1, 2)
  doAssert jitAdd(1, 2) == 1 + 2

  block:
    # Cleanup
    # Note: when disposing the Execution Engine, attached modules are also disposed.
    # here we go the extra mile of detaching the module for testing.
    dispose(builder)
    var detachMsg: cstring
    let detachCode = remove(engine, module, module, detachMsg)
    echo "Detaching module from Execution Engine: code ", detachCode.int, ", message \"", detachMsg, "\""
    dispose(module)
    dispose(engine)
    dispose(ctx)

  echo "LLVM JIT - SUCCESS"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Constantine | |
# Copyright (c) 2018-2019 Status Research & Development GmbH | |
# Copyright (c) 2020-Present Mamy André-Ratsimbazafy | |
# Licensed and distributed under either of | |
# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). | |
# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). | |
# at your option. This file may not be copied, modified, or distributed except according to those terms. | |
# ############################################################ | |
# | |
# Bindings to Nvidia GPUs libraries | |
# | |
# ############################################################ | |
import ./llvm | |
type Flag*[E: enum] = distinct cint
  ## Bit set over the values of enum `E`, stored in a C int.

func flag*[E: enum](e: varargs[E]): Flag[E] {.inline.} =
  ## Combines the given enum values with a bitwise OR.
  ## Enum should only have power of 2 fields
  # static:
  #   for val in E:
  #     assert (ord(val) and (ord(val) - 1)) == 0, "Enum values should all be power of 2, found " &
  #       $val & " with value " & $ord(val) & "."
  var bits = 0
  for member in e:
    bits = ord(member) or bits
  Flag[E](bits)
# ############################################################ | |
# | |
# Cuda | |
# | |
# ############################################################ | |
{.passl: "-L/opt/cuda/lib64 -lcuda".} | |
# Cuda offers 2 APIs: | |
# - cuda.h the driver API | |
# - cuda_runtime.h the runtime API | |
# | |
# https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html | |
# | |
# We need to use the lower-level driver API for JIT modules loading and reloading | |
type | |
CUresult* {.size: sizeof(cint).} = enum | |
## The API call returned with no errors. In the case of query calls, this | |
## also means that the operation being queried is complete (see | |
## ::cuEventQuery() and ::cuStreamQuery()). | |
CUDA_SUCCESS = 0 | |
## This indicates that one or more of the parameters passed to the API call | |
## is not within an acceptable range of values. | |
CUDA_ERROR_INVALID_VALUE = 1 | |
## The API call failed because it was unable to allocate enough memory to | |
## perform the requested operation. | |
CUDA_ERROR_OUT_OF_MEMORY = 2 | |
## This indicates that the CUDA driver has not been initialized with | |
## ::cuInit() or that initialization has failed. | |
CUDA_ERROR_NOT_INITIALIZED = 3 | |
## This indicates that the CUDA driver is in the process of shutting down. | |
CUDA_ERROR_DEINITIALIZED = 4 | |
## This indicates profiler is not initialized for this run. This can | |
## happen when the application is running with external profiling tools | |
## like visual profiler. | |
CUDA_ERROR_PROFILER_DISABLED = 5 | |
## to attempt to enable/disable the profiling via ::cuProfilerStart or | |
## ::cuProfilerStop without initialization. | |
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6 | |
## to call cuProfilerStart() when profiling is already enabled. | |
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7 | |
## to call cuProfilerStop() when profiling is already disabled. | |
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8 | |
## This indicates that the CUDA driver that the application has loaded is a | |
## stub library. Applications that run with the stub rather than a real | |
## driver loaded will result in CUDA API returning this error. | |
CUDA_ERROR_STUB_LIBRARY = 34 | |
## This indicates that requested CUDA device is unavailable at the current | |
## time. Devices are often unavailable due to use of | |
## ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED. | |
CUDA_ERROR_DEVICE_UNAVAILABLE = 46 | |
## This indicates that no CUDA-capable devices were detected by the installed | |
## CUDA driver. | |
CUDA_ERROR_NO_DEVICE = 100 | |
## This indicates that the device ordinal supplied by the user does not | |
## correspond to a valid CUDA device or that the action requested is | |
## invalid for the specified device. | |
CUDA_ERROR_INVALID_DEVICE = 101 | |
## This error indicates that the Grid license is not applied. | |
CUDA_ERROR_DEVICE_NOT_LICENSED = 102 | |
## This indicates that the device kernel image is invalid. This can also | |
## indicate an invalid CUDA module. | |
CUDA_ERROR_INVALID_IMAGE = 200 | |
## This most frequently indicates that there is no context bound to the | |
## current thread. This can also be returned if the context passed to an | |
## API call is not a valid handle (such as a context that has had | |
## ::cuCtxDestroy() invoked on it). This can also be returned if a user | |
## mixes different API versions (i.e. 3010 context with 3020 API calls). | |
## See ::cuCtxGetApiVersion() for more details. | |
CUDA_ERROR_INVALID_CONTEXT = 201 | |
## This indicated that the context being supplied as a parameter to the | |
## API call was already the active context. | |
## error to attempt to push the active context via ::cuCtxPushCurrent(). | |
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202 | |
## This indicates that a map or register operation has failed. | |
CUDA_ERROR_MAP_FAILED = 205 | |
## This indicates that an unmap or unregister operation has failed. | |
CUDA_ERROR_UNMAP_FAILED = 206 | |
## This indicates that the specified array is currently mapped and thus | |
## cannot be destroyed. | |
CUDA_ERROR_ARRAY_IS_MAPPED = 207 | |
## This indicates that the resource is already mapped. | |
CUDA_ERROR_ALREADY_MAPPED = 208 | |
## This indicates that there is no kernel image available that is suitable | |
## for the device. This can occur when a user specifies code generation | |
## options for a particular CUDA source file that do not include the | |
## corresponding device configuration. | |
CUDA_ERROR_NO_BINARY_FOR_GPU = 209 | |
## This indicates that a resource has already been acquired. | |
CUDA_ERROR_ALREADY_ACQUIRED = 210 | |
## This indicates that a resource is not mapped. | |
CUDA_ERROR_NOT_MAPPED = 211 | |
## This indicates that a mapped resource is not available for access as an | |
## array. | |
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212 | |
## This indicates that a mapped resource is not available for access as a | |
## pointer. | |
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213 | |
## This indicates that an uncorrectable ECC error was detected during | |
## execution. | |
CUDA_ERROR_ECC_UNCORRECTABLE = 214 | |
## This indicates that the ::CUlimit passed to the API call is not | |
## supported by the active device. | |
CUDA_ERROR_UNSUPPORTED_LIMIT = 215 | |
## This indicates that the ::CUcontext passed to the API call can | |
## only be bound to a single CPU thread at a time but is already | |
## bound to a CPU thread. | |
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216 | |
## This indicates that peer access is not supported across the given | |
## devices. | |
CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217 | |
## This indicates that a PTX JIT compilation failed. | |
CUDA_ERROR_INVALID_PTX = 218 | |
## This indicates an error with OpenGL or DirectX context. | |
CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219 | |
## This indicates that an uncorrectable NVLink error was detected during the | |
## execution. | |
CUDA_ERROR_NVLINK_UNCORRECTABLE = 220 | |
## This indicates that the PTX JIT compiler library was not found. | |
CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221 | |
## This indicates that the provided PTX was compiled with an unsupported toolchain. | |
CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222 | |
## This indicates that the PTX JIT compilation was disabled. | |
CUDA_ERROR_JIT_COMPILATION_DISABLED = 223 | |
## This indicates that the ::CUexecAffinityType passed to the API call is not | |
## supported by the active device. | |
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224 | |
## This indicates that the device kernel source is invalid. This includes | |
## compilation/linker errors encountered in device code or user error. | |
CUDA_ERROR_INVALID_SOURCE = 300 | |
## This indicates that the file specified was not found. | |
CUDA_ERROR_FILE_NOT_FOUND = 301 | |
## This indicates that a link to a shared object failed to resolve. | |
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302 | |
## This indicates that initialization of a shared object failed. | |
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303 | |
## This indicates that an OS call failed. | |
CUDA_ERROR_OPERATING_SYSTEM = 304 | |
## This indicates that a resource handle passed to the API call was not | |
## valid. Resource handles are opaque types like ::CUstream and ::CUevent. | |
CUDA_ERROR_INVALID_HANDLE = 400 | |
## This indicates that a resource required by the API call is not in a | |
## valid state to perform the requested operation. | |
CUDA_ERROR_ILLEGAL_STATE = 401 | |
## This indicates that a named symbol was not found. Examples of symbols | |
## are global/constant variable names, driver function names, texture names, | |
## and surface names. | |
CUDA_ERROR_NOT_FOUND = 500 | |
## This indicates that asynchronous operations issued previously have not | |
## completed yet. This result is not actually an error, but must be indicated | |
## differently than ::CUDA_SUCCESS (which indicates completion). Calls that | |
## may return this value include ::cuEventQuery() and ::cuStreamQuery(). | |
CUDA_ERROR_NOT_READY = 600 | |
## While executing a kernel, the device encountered a | |
## load or store instruction on an invalid memory address. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_ILLEGAL_ADDRESS = 700 | |
## This indicates that a launch did not occur because it did not have | |
## appropriate resources. This error usually indicates that the user has | |
## attempted to pass too many arguments to the device kernel, or the | |
## kernel launch specifies too many threads for the kernel's register | |
## count. Passing arguments of the wrong size (i.e. a 64-bit pointer | |
## when a 32-bit int is expected) is equivalent to passing too many | |
## arguments and can also result in this error. | |
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701 | |
## This indicates that the device kernel took too long to execute. This can | |
## only occur if timeouts are enabled - see the device attribute | |
## ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_LAUNCH_TIMEOUT = 702 | |
## This error indicates a kernel launch that uses an incompatible texturing | |
## mode. | |
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703 | |
## This error indicates that a call to ::cuCtxEnablePeerAccess() is | |
## trying to re-enable peer access to a context which has already | |
## had peer access to it enabled. | |
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704 | |
## This error indicates that ::cuCtxDisablePeerAccess() is | |
## trying to disable peer access which has not been enabled yet | |
## via ::cuCtxEnablePeerAccess(). | |
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705 | |
## This error indicates that the primary context for the specified device | |
## has already been initialized. | |
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708 | |
## This error indicates that the context current to the calling thread | |
## has been destroyed using ::cuCtxDestroy, or is a primary context which | |
## has not yet been initialized. | |
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709 | |
## A device-side assert triggered during kernel execution. The context | |
## cannot be used anymore, and must be destroyed. All existing device | |
## memory allocations from this context are invalid and must be | |
## reconstructed if the program is to continue using CUDA. | |
CUDA_ERROR_ASSERT = 710 | |
## This error indicates that the hardware resources required to enable | |
## peer access have been exhausted for one or more of the devices | |
## passed to ::cuCtxEnablePeerAccess(). | |
CUDA_ERROR_TOO_MANY_PEERS = 711 | |
## This error indicates that the memory range passed to ::cuMemHostRegister() | |
## has already been registered. | |
CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712 | |
## This error indicates that the pointer passed to ::cuMemHostUnregister() | |
## does not correspond to any currently registered memory region. | |
CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713 | |
## While executing a kernel, the device encountered a stack error. | |
## This can be due to stack corruption or exceeding the stack size limit. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_HARDWARE_STACK_ERROR = 714 | |
## While executing a kernel, the device encountered an illegal instruction. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_ILLEGAL_INSTRUCTION = 715 | |
## While executing a kernel, the device encountered a load or store instruction | |
## on a memory address which is not aligned. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_MISALIGNED_ADDRESS = 716 | |
## While executing a kernel, the device encountered an instruction | |
## which can only operate on memory locations in certain address spaces | |
## (global, shared, or local), but was supplied a memory address not | |
## belonging to an allowed address space. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_INVALID_ADDRESS_SPACE = 717 | |
## While executing a kernel, the device program counter wrapped its address space. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_INVALID_PC = 718 | |
## An exception occurred on the device while executing a kernel. Common | |
## causes include dereferencing an invalid device pointer and accessing | |
## out of bounds shared memory. Less common cases can be system specific - more | |
## information about these cases can be found in the system specific user guide. | |
## This leaves the process in an inconsistent state and any further CUDA work | |
## will return the same error. To continue using CUDA, the process must be terminated | |
## and relaunched. | |
CUDA_ERROR_LAUNCH_FAILED = 719 | |
## This error indicates that the number of blocks launched per grid for a kernel that was | |
## launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice | |
## exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor | |
## or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors | |
## as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. | |
CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720 | |
## This error indicates that the attempted operation is not permitted. | |
CUDA_ERROR_NOT_PERMITTED = 800 | |
## This error indicates that the attempted operation is not supported | |
## on the current system or device. | |
CUDA_ERROR_NOT_SUPPORTED = 801 | |
## This error indicates that the system is not yet ready to start any CUDA | |
## work. To continue using CUDA, verify the system configuration is in a | |
## valid state and all required driver daemons are actively running. | |
## More information about this error can be found in the system specific | |
## user guide. | |
CUDA_ERROR_SYSTEM_NOT_READY = 802 | |
## This error indicates that there is a mismatch between the versions of | |
## the display driver and the CUDA driver. Refer to the compatibility documentation | |
## for supported versions. | |
CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803 | |
## This error indicates that the system was upgraded to run with forward compatibility | |
## but the visible hardware detected by CUDA does not support this configuration. | |
## Refer to the compatibility documentation for the supported hardware matrix or ensure | |
## that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES | |
## environment variable. | |
CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804 | |
## This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. | |
CUDA_ERROR_MPS_CONNECTION_FAILED = 805 | |
## This error indicates that the remote procedural call between the MPS server and the MPS client failed. | |
CUDA_ERROR_MPS_RPC_FAILURE = 806 | |
## This error indicates that the MPS server is not ready to accept new MPS client requests. | |
## This error can be returned when the MPS server is in the process of recovering from a fatal failure. | |
CUDA_ERROR_MPS_SERVER_NOT_READY = 807 | |
## This error indicates that the hardware resources required to create MPS client have been exhausted. | |
CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808 | |
## This error indicates that the hardware resources required to support device connections have been exhausted. | |
CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809 | |
## This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched. | |
CUDA_ERROR_MPS_CLIENT_TERMINATED = 810 | |
## This error indicates that the operation is not permitted when | |
## the stream is capturing. | |
CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900 | |
## This error indicates that the current capture sequence on the stream | |
## has been invalidated due to a previous error. | |
CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901 | |
## This error indicates that the operation would have resulted in a merge | |
## of two independent capture sequences. | |
CUDA_ERROR_STREAM_CAPTURE_MERGE = 902 | |
## This error indicates that the capture was not initiated in this stream. | |
CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903 | |
## This error indicates that the capture sequence contains a fork that was | |
## not joined to the primary stream. | |
CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904 | |
## This error indicates that a dependency would have been created which | |
## crosses the capture sequence boundary. Only implicit in-stream ordering | |
## dependencies are allowed to cross the boundary. | |
CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905 | |
## This error indicates a disallowed implicit dependency on a current capture | |
## sequence from cudaStreamLegacy. | |
CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906 | |
## This error indicates that the operation is not permitted on an event which | |
## was last recorded in a capturing stream. | |
CUDA_ERROR_CAPTURED_EVENT = 907 | |
## A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED | |
## argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a | |
## different thread. | |
CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908 | |
## This error indicates that the timeout specified for the wait operation has lapsed. | |
CUDA_ERROR_TIMEOUT = 909 | |
## This error indicates that the graph update was not performed because it included | |
## changes which violated constraints specific to instantiated graph update. | |
CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910 | |
## This indicates that an async error has occurred in a device outside of CUDA. | |
## If CUDA was waiting for an external device's signal before consuming shared data, | |
## the external device signaled an error indicating that the data is not valid for | |
## consumption. This leaves the process in an inconsistent state and any further CUDA | |
## work will return the same error. To continue using CUDA, the process must be | |
## terminated and relaunched. | |
CUDA_ERROR_EXTERNAL_DEVICE = 911 | |
## Indicates a kernel launch error due to cluster misconfiguration. | |
CUDA_ERROR_INVALID_CLUSTER_SIZE = 912 | |
## This indicates that an unknown internal error has occurred. | |
CUDA_ERROR_UNKNOWN = 999 | |
template check*(status: CUresult) =
  ## Verify the result of a CUDA driver call.
  ## When the result is not `CUDA_SUCCESS`, print the checked expression,
  ## the instantiation site and the error code, then terminate the
  ## program with exit code 1.
  let errCode = status # bind once so the wrapped expression is evaluated a single time
  if errCode != CUDA_SUCCESS:
    echo astToStr(status), " ", instantiationInfo(), " exited with error: ", errCode
    echo $errCode
    quit 1
type
  CUdevice* = distinct int32 ## Compute device handle

  CUdevice_attribute* {.size: sizeof(cint).} = enum ## Device properties that can be queried with `cuDeviceGetAttribute`
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1 ## Maximum number of threads per block
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2 ## Maximum block dimension X
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3 ## Maximum block dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4 ## Maximum block dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5 ## Maximum grid dimension X
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6 ## Maximum grid dimension Y
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7 ## Maximum grid dimension Z
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8 ## Maximum shared memory available per block, in bytes
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9 ## Memory available on device for __constant__ variables in a CUDA C kernel, in bytes
    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10 ## Warp size in threads
    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11 ## Maximum pitch in bytes allowed by memory copies
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12 ## Maximum number of 32-bit registers available per block
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13 ## Typical clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14 ## Alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16 ## Number of multiprocessors on device
    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17 ## Specifies whether there is a run time limit on kernels
    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18 ## Device is integrated with host memory
    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19 ## Device can map host memory into CUDA address space
    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20 ## Compute mode (see ::CUcomputemode for details)
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21 ## Maximum 1D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22 ## Maximum 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23 ## Maximum 2D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24 ## Maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25 ## Maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26 ## Maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27 ## Maximum 2D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28 ## Maximum 2D layered texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29 ## Maximum layers in a 2D layered texture
    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30 ## Alignment requirement for surfaces
    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31 ## Device can possibly execute multiple kernels concurrently
    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32 ## Device has ECC support enabled
    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33 ## PCI bus ID of the device
    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34 ## PCI device ID of the device
    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35 ## Device is using TCC driver model
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36 ## Peak memory clock frequency in kilohertz
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37 ## Global memory bus width in bits
    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38 ## Size of L2 cache in bytes
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39 ## Maximum resident threads per multiprocessor
    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40 ## Number of asynchronous engines
    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41 ## Device shares a unified address space with the host
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42 ## Maximum 1D layered texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43 ## Maximum layers in a 1D layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45 ## Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46 ## Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47 ## Alternate maximum 3D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48 ## Alternate maximum 3D texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49 ## Alternate maximum 3D texture depth
    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50 ## PCI domain ID of the device
    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51 ## Pitch alignment requirement for textures
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52 ## Maximum cubemap texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53 ## Maximum cubemap layered texture width/height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54 ## Maximum layers in a cubemap layered texture
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55 ## Maximum 1D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56 ## Maximum 2D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57 ## Maximum 2D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58 ## Maximum 3D surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59 ## Maximum 3D surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60 ## Maximum 3D surface depth
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61 ## Maximum 1D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62 ## Maximum layers in a 1D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63 ## Maximum 2D layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64 ## Maximum 2D layered surface height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65 ## Maximum layers in a 2D layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66 ## Maximum cubemap surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67 ## Maximum cubemap layered surface width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68 ## Maximum layers in a cubemap layered surface
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70 ## Maximum 2D linear texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71 ## Maximum 2D linear texture height
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72 ## Maximum 2D linear texture pitch in bytes
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73 ## Maximum mipmapped 2D texture width
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74 ## Maximum mipmapped 2D texture height
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75 ## Major compute capability version number
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76 ## Minor compute capability version number
    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77 ## Maximum mipmapped 1D texture width
    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78 ## Device supports stream priorities
    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79 ## Device supports caching globals in L1
    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80 ## Device supports caching locals in L1
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81 ## Maximum shared memory available per multiprocessor, in bytes
    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82 ## Maximum number of 32-bit registers available per multiprocessor
    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83 ## Device can allocate managed memory on this system
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84 ## Device is on a multi-GPU board
    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85 ## Unique id for a group of devices on the same multi-GPU board
    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86 ## Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)
    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87 ## Ratio of single precision performance (in floating-point operations per second) to double precision performance
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88 ## Device supports coherently accessing pageable memory without calling cudaHostRegister on it
    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89 ## Device can coherently access managed memory concurrently with the CPU
    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90 ## Device supports compute preemption
    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91 ## Device can access host registered memory at the same virtual address as the CPU
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92 ## ::cuStreamBatchMemOp and related APIs are supported
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93 ## 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94 ## ::CU_STREAM_WAIT_VALUE_NOR is supported
    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95 ## Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel
    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97 ## Maximum optin shared memory per block
    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98 ## The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device (see CUDA_MEMOP for additional details)
    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99 ## Device supports host memory registration via ::cudaHostRegister
    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100 ## Device accesses pageable memory via the host's page tables
    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101 ## The host can directly access managed memory on the device without migration
    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102 ## Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103 ## Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104 ## Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105 ## Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106 ## Maximum number of blocks per multiprocessor
    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107 ## Device supports compression of memory
    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108 ## Maximum L2 persisting lines capacity setting in bytes
    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109 ## Maximum value of CUaccessPolicyWindow::num_bytes
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110 ## Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate
    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111 ## Shared memory reserved by CUDA driver per block, in bytes
    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112 ## Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113 ## Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114 ## External timeline semaphore interop is supported on the device
    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115 ## Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116 ## Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117 ## The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118 ## GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119 ## Handle types supported with mempool based IPC
    CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120 ## Indicates device supports cluster launch
    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121 ## Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays
    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122 ## 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs
    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123 ## ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs
    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124 ## Device supports buffer sharing with dma_buf mechanism
    CU_DEVICE_ATTRIBUTE_MAX ## Sentinel: implicitly one more than the last attribute above

  # Opaque driver handles; each is a distinct type so they cannot be mixed up.
  CUcontext* = distinct pointer
  CUmodule* = distinct pointer
  CUfunction* = distinct pointer
  CUstream* = distinct pointer
  CUdeviceptr* = distinct pointer
type
  CUmemAttach_flags* = enum ## Attachment flags passed to `cuMemAllocManaged`
    CU_MEM_ATTACH_GLOBAL = 0x1 ## Memory can be accessed by any stream on any device
    CU_MEM_ATTACH_HOST = 0x2 ## Memory cannot be accessed by any stream on any device
    CU_MEM_ATTACH_SINGLE = 0x4 ## Memory can only be accessed by a single stream on the associated device

# ------------------------------------------------------------
# Driver entry points whose exported symbol is the plain name.
# Symbols are resolved by name at runtime from libcuda.so.
{.push cdecl, importc, dynlib: "libcuda.so".}

# Driver initialisation and device enumeration
proc cuInit*(flags: uint32): CUresult
proc cuDeviceGetCount*(count: var int32): CUresult
proc cuDeviceGet*(device: var CUdevice, ordinal: int32): CUresult
proc cuDeviceGetName*(name: ptr char, len: int32, dev: CUdevice): CUresult
proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevice): CUresult

# NOTE(review): cuda.h declares `cuCtxSynchronize(void)`; the `ctx`
# parameter is kept so existing call sites keep compiling - confirm
# that the extra argument is harmless for the target ABI.
proc cuCtxSynchronize*(ctx: CUcontext): CUresult

# Module management. The raw `ptr char` loader is private; the
# `openArray[char]` overload defined after this section is the public one.
proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult
proc cuModuleUnload*(module: CUmodule): CUresult
proc cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult

# Kernel launch
proc cuLaunchKernel*(
       kernel: CUfunction,
       gridDimX, gridDimY, gridDimZ: uint32,
       blockDimX, blockDimY, blockDimZ: uint32,
       sharedMemBytes: uint32,
       stream: CUstream,
       kernelParams: ptr pointer,
       extra: ptr pointer
     ): CUresult

# Managed memory
proc cuMemAllocManaged*(devptr: var CUdeviceptr, size: csize_t, flags: Flag[CUmemAttach_flags]): CUresult

{.pop.} # {.push cdecl, importc, dynlib: "libcuda.so".}

# ------------------------------------------------------------
# Driver entry points that cuda.h redirects to a `_v2` symbol through
# `#define` (e.g. `#define cuMemAlloc cuMemAlloc_v2`).
# The signatures below (csize_t sizes, pointer-sized CUdeviceptr) are the
# `_v2` ones. `dynlib` looks symbols up by name and never sees those
# macros, so the suffix is spelled out here; binding the unsuffixed names
# would resolve the legacy entry points, whose ABI differs.
# `$1` expands to the Nim proc name.
{.push cdecl, importc: "$1_v2", dynlib: "libcuda.so".}

# Context lifetime
proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult
proc cuCtxDestroy*(ctx: CUcontext): CUresult

# Device memory allocation and host <-> device copies
proc cuMemAlloc*(devptr: var CUdeviceptr, size: csize_t): CUresult
proc cuMemFree*(devptr: CUdeviceptr): CUresult
proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult

{.pop.} # {.push cdecl, importc: "$1_v2", dynlib: "libcuda.so".}
func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.} =
  ## Load a module image stored in a character buffer.
  ## The address of the first character is handed to the raw `ptr char`
  ## binding, so `sourceCode` must not be empty: index 0 is read
  ## unconditionally.
  let image = unsafeAddr sourceCode[0]
  cuModuleLoadData(module, image)
func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.} =
  ## Look up a kernel in `module` by a name stored in a character buffer.
  ## The address of the first character is handed to the raw `ptr char`
  ## binding, so `fnName` must not be empty: index 0 is read
  ## unconditionally.
  let namePtr = unsafeAddr fnName[0]
  cuModuleGetFunction(kernel, module, namePtr)
proc cudaDeviceInit(): CUdevice =
  ## Initialise the CUDA driver API and return the handle of device 0.
  ##
  ## Prints the device name and its compute capability.
  ## The program exits with code 1 when a driver call fails (via `check`),
  ## when no CUDA device is reported, or when the major compute
  ## capability of device 0 is below 6.
  check cuInit(0)

  var devCount: int32
  check cuDeviceGetCount(devCount)
  if devCount == 0:
    echo "cudaDeviceInit error: no devices supporting CUDA"
    quit 1

  var cuDevice: CUdevice
  check cuDeviceGet(cuDevice, 0)

  var name = newString(128)
  check cuDeviceGetName(name[0].addr, name.len.int32, cuDevice)
  # The buffer is zero-filled and the driver writes a C string into it.
  # Trim to the C string length so `echo` does not print the NUL padding.
  name.setLen(name.cstring.len)
  echo "Using CUDA Device [0]: ", name

  var major, minor: int32
  check cuDeviceGetAttribute(major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)
  check cuDeviceGetAttribute(minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)
  echo "Compute Capability: SM ", major, ".", minor
  if major < 6:
    echo "Error: Device 0 is not sm_60 (Pascal generation, GTX 1080) or later"
    quit 1

  return cuDevice
# ############################################################ | |
# | |
# NVVM | |
# | |
# ############################################################ | |
{.passl: "-L/opt/cuda/nvvm/lib64 -lnvvm".} | |
type
  NvvmResult* {.size: sizeof(cint).} = enum ## Status code returned by the libNVVM entry points
    NVVM_SUCCESS = 0
    NVVM_ERROR_OUT_OF_MEMORY = 1
    NVVM_ERROR_PROGRAM_CREATION_FAILURE = 2
    NVVM_ERROR_IR_VERSION_MISMATCH = 3
    NVVM_ERROR_INVALID_INPUT = 4
    NVVM_ERROR_INVALID_PROGRAM = 5
    NVVM_ERROR_INVALID_IR = 6
    NVVM_ERROR_INVALID_OPTION = 7
    NVVM_ERROR_NO_MODULE_IN_PROGRAM = 8
    NVVM_ERROR_COMPILATION = 9

  # Exported like the CUDA handles: every exported `nvvm*` proc takes or
  # returns this type, so importing modules must be able to name it.
  NvvmProgram* = distinct pointer ## Opaque handle to an NVVM program
# libNVVM entry points, resolved by name at runtime from libnvvm.so.
{.push cdecl, importc, dynlib: "libnvvm.so".}

# Diagnostics and version queries
proc nvvmGetErrorString*(r: NvvmResult): cstring
proc nvvmVersion*(major: var int32, minor: var int32): NvvmResult
proc nvvmIRVersion*(majorIR: var int32, minorIR: var int32, majorDbg: var int32, minorDbg: var int32): NvvmResult

# Program lifetime
proc nvvmCreateProgram*(prog: var NvvmProgram): NvvmResult
proc nvvmDestroyProgram*(prog: var NvvmProgram): NvvmResult

# Adding modules to a program.
# NOTE(review): this relies on Nim passing an `openArray` to an imported
# proc as a pointer followed by a length, matching the C
# `buffer, size` argument pair - confirm for the dynlib code path.
proc nvvmAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult
proc nvvmLazyAddModuleToProgram*(prog: NvvmProgram, buffer: openArray[byte], name: cstring): NvvmResult

# Compilation and verification
proc nvvmCompileProgram*(prog: NvvmProgram, numOptions: int32, options: cstringArray): NvvmResult
proc nvvmVerifyProgram*(prog: NvvmProgram, numOptions: int32, options: cstringArray): NvvmResult

# Retrieving the compiled result and the program log
proc nvvmGetCompiledResultSize*(prog: NvvmProgram, bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetCompiledResult*(prog: NvvmProgram, buffer: ptr char): NvvmResult
proc nvvmGetProgramLogSize*(prog: NvvmProgram, bufferSizeRet: var csize_t): NvvmResult
proc nvvmGetProgramLog*(prog: NvvmProgram, buffer: ptr char): NvvmResult

{.pop.} # {.push cdecl, importc, dynlib: "libnvvm.so".}
# ############################################################ | |
# | |
# NVVM IR | |
# | |
# ############################################################ | |
proc tagCudaKernel*(module: ModuleRef, function: ValueRef) =
  ## Mark `function` as a Cuda kernel, i.e. an entry point callable from
  ## the host, by appending a `(function, "kernel", i32 1)` node to the
  ## module's "nvvm.annotations" named metadata.

  # Upstream bug, getReturnType returns tkFunction for void functions.
  # doAssert function.getTypeOf().getReturnType().isVoid(), block:
  #   "Kernels must not return values but function returns " & $function.getTypeOf().getReturnType().getTypeKind()

  let ctx = module.getContext()

  # Build the three operands of the annotation node, in order.
  let target = function.asMetadataRef()
  let kind = ctx.metadataNode("kernel")
  let enabled = constInt(ctx.int32_t(), 1, LlvmBool(false)).asMetadataRef()

  let annotation = ctx.metadataNode([target, kind, enabled])
  module.addNamedMetadataOperand("nvvm.annotations", ctx.asValueRef(annotation))
# ############################################################ | |
# | |
# Sanity Check | |
# | |
# ############################################################ | |
when isMainModule:
  # End-to-end sanity check, first half: build two tiny i128 kernels with
  # LLVM, hand the bitcode to NVVM, compile it to PTX and load the PTX into
  # a CUDA module.

  template check(status: NvvmResult) =
    ## Abort the program with a diagnostic when an NVVM call does not
    ## return NVVM_SUCCESS.
    let code = status # Assign so execution is done once only.
    if code != NVVM_SUCCESS:
      echo astToStr(status), " ", instantiationInfo(), " exited with error: ", code
      echo code.nvvmGetErrorString()
      quit 1

  proc trimTrailingNul(s: var string) =
    ## Drop a single trailing NUL, if any. libNVVM documents its result and
    ## log sizes as including the C terminator, which a Nim string should
    ## not carry as part of its text.
    if s.len > 0 and s[^1] == '\0':
      s.setLen(s.len - 1)

  proc echoProgramLog(prog: NvvmProgram) =
    ## Print the log currently attached to `prog`, framed the same way for
    ## the verification and the compilation steps.
    var logSize: csize_t
    check nvvmGetProgramLogSize(prog, logSize)
    var log = newString(int(logSize))
    if log.len > 0: # `log[0]` would raise on an empty buffer
      check nvvmGetProgramLog(prog, log[0].addr)
    log.trimTrailingNul()
    echo "log:"
    echo log
    echo "----------------"

  echo "Nvidia JIT compiler sanity check"

  #######################################
  # Metadata

  const triple = "nvptx64-nvidia-cuda"

  var irVersion: tuple[major, minor, majorDbg, minorDbg: int32]
  block:
    var version: tuple[major, minor: int32]
    check nvvmVersion(version.major, version.minor)
    echo "nvvm v", version.major, ".", version.minor
    check nvvmIRVersion(irVersion.major, irVersion.minor, irVersion.majorDbg, irVersion.minorDbg)
    echo "requires LLVM IR v", irVersion.major, ".", irVersion.minor

  #######################################
  # LLVM IR codegen

  # Datalayout for NVVM IR 1.8 (CUDA 11.6)
  const datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"

  let ctx = createContext()
  var module = ctx.createModule("test_nnvm")
  module.setTarget(triple)
  module.setDataLayout(datalayout)

  let i128 = ctx.int128_t()
  let void_t = ctx.void_t()

  let builder = ctx.createBuilder()

  block:
    # addKernel(r: ptr i128, a: i128, b: i128): stores a + b into r
    let addType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
    let addKernel = module.addFunction("addKernel", addType)
    let blck = ctx.append_basic_block(addKernel, "addBody")
    builder.positionAtEnd(blck)
    let r = addKernel.getParam(0)
    let a = addKernel.getParam(1)
    let b = addKernel.getParam(2)
    let sum = builder.add(a, b, "sum")
    discard builder.store(sum, r)
    discard builder.retVoid()
    module.tagCudaKernel(addKernel)

  block:
    # mulKernel(r: ptr i128, a: i128, b: i128): stores a * b into r
    let mulType = function_t(void_t, [i128.pointer_t(), i128, i128], isVarArg = LlvmBool(false))
    let mulKernel = module.addFunction("mulKernel", mulType)
    let blck = ctx.append_basic_block(mulKernel, "mulBody")
    builder.positionAtEnd(blck)
    let r = mulKernel.getParam(0)
    let a = mulKernel.getParam(1)
    let b = mulKernel.getParam(2)
    let prod = builder.mul(a, b, "prod")
    discard builder.store(prod, r)
    discard builder.retVoid()
    module.tagCudaKernel(mulKernel)

  block:
    var errMsg: cstring
    let errCode = module.verify(AbortProcessAction, errMsg)
    echo "LLVM verification: code ", int(errCode), ", message \"", errMsg, "\""
    errMsg.dispose()

  block:
    echo "================="
    echo "LLVM IR output"
    let ir = module.toIRString()
    echo ir
    ir.dispose()
    echo "================="

  #######################################
  # LLVM -> NVVM handover

  var prog{.noInit.}: NvvmProgram
  check nvvmCreateProgram(prog)

  # module.writeBitcodeToFile("arith.bc")
  let bitcode = module.toBitcode()
  check nvvmAddModuleToProgram(prog, bitcode, "arith")

  # Cleanup LLVM
  builder.dispose()
  module.dispose()
  ctx.dispose()

  #######################################
  # GPU codegen

  # The log is printed before the status is checked: `check` quits on
  # failure, and the log is what explains the failure.
  let verifyStatus = nvvmVerifyProgram(prog, 0, nil)
  prog.echoProgramLog()
  check verifyStatus

  let compileStatus = nvvmCompileProgram(prog, 0, nil)
  prog.echoProgramLog()
  check compileStatus

  var ptxSize: csize_t
  check nvvmGetCompiledResultSize(prog, ptxSize)
  var ptx = newString(int(ptxSize))
  if ptx.len > 0: # `ptx[0]` would raise on an empty buffer
    check nvvmGetCompiledResult(prog, ptx[0].addr)
  ptx.trimTrailingNul()

  check nvvmDestroyProgram(prog)

  echo "================="
  echo "PTX output"
  echo ptx
  echo "================="

  #######################################
  # Load the PTX into a CUDA module and look up both kernels

  let cudaDevice = cudaDeviceInit()
  var cuCtx: CUcontext
  var cuMod: CUmodule
  var addKernel, mulKernel: CUfunction

  check cuCtxCreate(cuCtx, 0, cudaDevice)
  check cuModuleLoadData(cuMod, ptx)
  check cuModuleGetFunction(addKernel, cuMod, "addKernel")
  check cuModuleGetFunction(mulKernel, cuMod, "mulKernel")
####################################### | |
# Kernel launch | |
func toHex*(a: uint64): string =
  ## Fixed-width lowercase hexadecimal rendering of `a`:
  ## always 16 digits, zero-padded, without a "0x" prefix.
  const hexChars = "0123456789abcdef"
  const L = 2*sizeof(uint64)
  result = newString(L)
  var rest = a
  # Fill from the least significant nibble (rightmost digit) down to
  # index 0 so that every digit of the 64-bit value is written.
  for j in countdown(result.len-1, 0):
    result[j] = hexChars[int(rest and 0xF)]
    rest = rest shr 4

func toString*(a: openArray[uint64]): string =
  ## Render a multi-limb integer as "0x" followed by its limbs in
  ## hexadecimal, from the last limb (`a[^1]`) down to `a[0]`.
  ## An empty input yields just "0x".
  result = "0x"
  # Iterate over the limbs of `a`, whatever their count.
  for i in countdown(a.len-1, 0):
    result.add toHex(a[i])
when isMainModule:
  # Second half of the sanity check: run both kernels once and print the
  # results. Kept under the main-module guard so that importing this file
  # never launches kernels. Uses addKernel, mulKernel, cuMod and cuCtx
  # declared earlier in the sanity check.

  # Host-side operands as two 64-bit limbs each; index 0 is written last in
  # each pair below and is printed last by toString.
  # `r` is zero-initialised because it is echoed before any kernel writes it.
  var r, a, b: array[2, uint64]
  a[1] = 0x00000000000001FF'u64; a[0] = 0xFFFFFFFFFFFFFFFF'u64
  b[1] = 0x0000000000000000'u64; b[0] = 0x0010000000000000'u64

  echo "r: ", r.toString()
  echo "a: ", a.toString()
  echo "b: ", b.toString()

  # Device buffer receiving the kernel result.
  var rGPU: CUdeviceptr
  check cuMemAlloc(rGPU, csize_t sizeof(r))

  # One pointer per kernel parameter: the device pointer for `r`, then the
  # host values of `a` and `b`.
  let params = [pointer(rGPU.addr), pointer(a.addr), pointer(b.addr)]

  check cuLaunchKernel(
    addKernel,
    1, 1, 1,
    1, 1, 1,
    0, CUstream(nil),
    params[0].unsafeAddr, nil)

  check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
  echo "a+b: ", r.toString()

  check cuLaunchKernel(
    mulKernel,
    1, 1, 1,
    1, 1, 1,
    0, CUstream(nil),
    params[0].unsafeAddr, nil)

  check cuMemcpyDtoH(r.addr, rGPU, csize_t sizeof(r))
  echo "a*b: ", r.toString()

  #######################################
  # Cleanup

  # Each handle is reset to nil right after being released.
  check cuMemFree(rGPU)
  rGPU = CUdeviceptr(nil)

  check cuModuleUnload(cuMod)
  cuMod = CUmodule(nil)

  check cuCtxDestroy(cuCtx)
  cuCtx = CUcontext(nil)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment