@gmarkall
gmarkall / extending_cuda.py
Created June 16, 2020 15:15
Extending the CUDA target to support an Interval type - based on https://numba.pydata.org/numba-doc/latest/extending/interval-example.html with some modifications
# From Interval example docs - typing
from numba import types
from numba.core.extending import typeof_impl, type_callable
# Data model
from numba.core.extending import models, register_model
# Lowering
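The preview cuts off before the lowering code. For context, the plain Python class that this typing machinery teaches Numba about — as given in the linked docs example — is just:

```python
class Interval(object):
    """A half-open interval on the real number line."""

    def __init__(self, lo, hi):
        self.lo = lo
        self.hi = hi

    def __repr__(self):
        return 'Interval(%f, %f)' % (self.lo, self.hi)

    @property
    def width(self):
        return self.hi - self.lo
```

The `typeof_impl` and `type_callable` registrations then let Numba-compiled functions accept and construct instances of this class.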
@gmarkall
gmarkall / output.txt
Created August 7, 2020 11:16
Numba CUDA kernel compile and launch traces
$ NUMBA_CUDA_LOG_LEVEL=DEBUG python repro.py
== CUDA [182] DEBUG -- call runtime api: cudaRuntimeGetVersion
Define a kernel
Copy to device
== CUDA [200] INFO -- init
== CUDA [200] DEBUG -- call driver api: cuInit
== CUDA [200] DEBUG -- call driver api: cuCtxGetCurrent
== CUDA [200] DEBUG -- call driver api: cuCtxGetCurrent
== CUDA [200] DEBUG -- call driver api: cuDeviceGetCount
== CUDA [200] DEBUG -- call driver api: cuDeviceGet
@gmarkall
gmarkall / compile_to_ptx.py
Created August 7, 2020 11:23
Numba runtime/driver calls when compiling to PTX
from numba import cuda, types
import time
def add(r, x, y):
    i = cuda.grid(1)
    if i < len(r):
        r[i] = x[i] + y[i]
diff --git a/numba/cuda/cudadrv/nvvm.py b/numba/cuda/cudadrv/nvvm.py
index 24569f99b..505e6797f 100644
--- a/numba/cuda/cudadrv/nvvm.py
+++ b/numba/cuda/cudadrv/nvvm.py
@@ -272,29 +272,39 @@ data_layout = {
default_data_layout = data_layout[tuple.__itemsize__ * 8]
+_supported_cc = None
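The `_supported_cc = None` line introduces a lazy module-level cache, so the set of supported compute capabilities is only computed once. A pure-Python sketch of that pattern (names and the default `query` are illustrative stand-ins, not Numba's actual NVVM version check):

```python
_supported_cc = None


def get_supported_ccs(query=lambda: [(3, 5), (5, 0), (6, 0), (7, 0)]):
    """Return the cached tuple of supported compute capabilities,
    computing it on the first call via `query` (a placeholder for
    the real driver/NVVM version probe)."""
    global _supported_cc
    if _supported_cc is None:
        # Only hit the (potentially expensive) probe once per process.
        _supported_cc = tuple(query())
    return _supported_cc
```

Subsequent calls return the cached value without re-running the probe.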
@gmarkall
gmarkall / mcjit.cpp
Created February 10, 2021 13:03 — forked from gligneul/mcjit.cpp
LLVM MCJIT Code Samples (Working!)
/*
* To compile, execute on terminal:
* g++ -o mcjit mcjit.cpp `llvm-config --cxxflags --ldflags --libs all --system-libs`
*/
#include <iostream>
#include <memory>
#include <llvm/ADT/StringRef.h>
#include <llvm/ExecutionEngine/ExecutionEngine.h>
@gmarkall
gmarkall / cuda_pipeline.diff
Created February 18, 2021 15:42
Adding a CUDA pipeline to Numba
diff --git a/numba/core/compiler.py b/numba/core/compiler.py
index a0c03fdb3..e47270a2f 100644
--- a/numba/core/compiler.py
+++ b/numba/core/compiler.py
@@ -28,7 +28,8 @@ from numba.core.typed_passes import (NopythonTypeInference, AnnotateTypes,
NopythonRewrites, PreParforPass,
ParforPass, DumpParforDiagnostics,
IRLegalization, NoPythonBackend,
- InlineOverloads, PreLowerStripPhis)
+ InlineOverloads, PreLowerStripPhis,
@gmarkall
gmarkall / cupy_pycuda_arrays.py
Created February 19, 2021 11:59
Calling PyCUDA kernels on CuPy arrays using the CUDA array interface
import pycuda.driver as cuda
import pycuda.autoinit # noqa
from collections import namedtuple
from pycuda.compiler import SourceModule
import cupy as cp
# PyCUDA will try to get a pointer to data from an object it doesn't recognise
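The bridge between the two libraries is the CUDA Array Interface: any object exposing a `__cuda_array_interface__` dict can hand its device pointer, shape, and dtype to a consumer such as Numba or CuPy. A minimal pure-Python sketch of such a wrapper — the pointer below is a placeholder value, not a real device allocation:

```python
class DeviceArrayView:
    """Minimal object exposing the CUDA Array Interface (v2).

    It owns no memory itself; it simply republishes a device pointer,
    shape, and type string so other libraries can wrap the data.
    """

    def __init__(self, ptr, shape, typestr):
        self._ptr = ptr
        self._shape = tuple(shape)
        self._typestr = typestr

    @property
    def __cuda_array_interface__(self):
        return {
            'shape': self._shape,
            'typestr': self._typestr,    # e.g. '<f4' for little-endian float32
            'data': (self._ptr, False),  # (device pointer, read-only flag)
            'version': 2,
        }
```

A consumer that recognises the interface (e.g. `cupy.asarray`) reads this dict instead of copying through the host.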
@gmarkall
gmarkall / fwht.py
Created April 8, 2021 09:27
Fast Walsh Hadamard Transform code from Wikipedia accelerated with Numba
import numpy as np
from numba import njit
from time import perf_counter
# From https://en.wikipedia.org/wiki/Fast_Walsh%E2%80%93Hadamard_transform
def fwht(a) -> None:
    """In-place Fast Walsh–Hadamard Transform of array a."""
    h = 1
    while h < len(a):
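The preview cuts off mid-loop; the full Wikipedia algorithm (unnormalised, for power-of-two lengths) continues as below — shown here as a pure-Python sketch without the `@njit` decoration:

```python
def fwht(a) -> None:
    """In-place Fast Walsh–Hadamard Transform of array a.

    len(a) must be a power of two.  Unnormalised variant.
    """
    h = 1
    while h < len(a):
        # Butterfly pass: combine each pair of elements h apart.
        for i in range(0, len(a), h * 2):
            for j in range(i, i + h):
                x = a[j]
                y = a[j + h]
                a[j] = x + y
                a[j + h] = x - y
        h *= 2
```

For example, `fwht` applied to `[1, 0, 1, 0, 0, 1, 1, 0]` yields `[4, 2, 0, -2, 0, 2, 0, 2]`, the worked example on the Wikipedia page.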
@gmarkall
gmarkall / output.txt
Last active June 9, 2023 16:45
Numba CUDA Warp-aggregated atomics example. See PR #6911: https://github.com/numba/numba/pull/6911
$ python wagg.py
Running with 16777216 elements, of which approximately 25.0% are zero
There are 12584753 nonzeroes in:
[0.417022 0.72032449 0. ... 0.20570723 0.36716537 0.0979951 ]
The kernel found 12584753 elements, resulting in the array:
[0.14349547 0.43006714 0.48695992 ... 0. 0. 0. ]
Traceback (most recent call last):
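The core of warp aggregation is bit arithmetic on the warp's active mask: the lowest active lane becomes the leader, performs a single atomic increment for the whole warp, and every lane derives its own output slot from its rank among the active lanes. A pure-Python model of that arithmetic — the CUDA version would use intrinsics like an active-mask/ballot query and `popc`; the function name here is illustrative:

```python
def warp_aggregate(mask, lane):
    """Given a 32-bit mask of lanes whose predicate is true and this
    thread's lane id, return (is_leader, rank), where rank is the
    number of active lanes below this one."""
    # Index of the lowest set bit: the leader lane.
    leader = (mask & -mask).bit_length() - 1
    # Population count of active lanes strictly below this lane.
    rank = bin(mask & ((1 << lane) - 1)).count('1')
    return lane == leader, rank
```

With mask `0b1011` (lanes 0, 1 and 3 active), lane 0 is the leader with rank 0, lane 1 has rank 1, and lane 3 has rank 2 — so the leader reserves three slots with one atomic, and each lane writes to `base + rank`.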