In [1]:
# Print Numba's system report (`numba -s`) to check whether Numba can detect the GPU.
!numba -s

System info:
--------------------------------------------------------------------------------
__Time Stamp__
Report started (local time)                   : 2025-02-28 13:27:42.408331
UTC start time                                : 2025-02-28 20:27:42.408346
Running time (s)                              : 15.112425

__Hardware Information__
Machine                                       : x86_64
CPU Name                                      : znver3
CPU Count                                     : 128
Number of accessible CPUs                     : 1
List of accessible CPUs cores                 : 0
CFS Restrictions (CPUs worth of runtime)      : None

CPU Features                                  : 64bit adx aes avx avx2 avx512bf16
                                                avx512bitalg avx512bw avx512cd
                                                avx512dq avx512f avx512ifma
                                                avx512vbmi avx512vbmi2 avx512vl
                        

In [2]:
import math
import numba.cuda as cuda

In [3]:
@cuda.jit
def vecAdd_cuda(z, x, y):
    """CUDA kernel: store ``x[i] + exp(y[i])`` in ``z[i]`` for every index of ``x``.

    Despite the name, this is not a plain vector add: the second operand is
    exponentiated before the sum.

    Parameters
    ----------
    z : 1D array
        Output; indexed over the same range as ``x``.
    x, y : 1D arrays
        Inputs; ``y`` is read at the same indices as ``x``.

    The kernel uses a grid-stride loop, so the result is correct for any 1D
    launch configuration, including one with fewer threads than elements.
    When the launch provides at least ``x.shape[0]`` threads, each thread
    handles at most one element, exactly as in a one-element-per-thread kernel.
    """
    # Global 1D index of this thread, i.e.
    # cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    start = cuda.grid(1)
    # Total number of threads in the launch: cuda.blockDim.x * cuda.gridDim.x
    stride = cuda.gridsize(1)
    N = x.shape[0]

    # Threads whose start index is already past the end get an empty range.
    for i in range(start, N, stride):
        z[i] = x[i] + math.exp(y[i])

In [4]:
import numpy as np

# Fixed seed so that every "Restart Kernel -> Run All" generates the same inputs.
SEED = 0
rng = np.random.default_rng(SEED)

# Number of elements in each input vector.
N = 10240000
# Two independent vectors of N float64 samples drawn uniformly from [0, 1).
x = rng.random(N)
y = rng.random(N)

In [11]:
%%time
z_ref = x + np.exp(y)

CPU times: user 13.9 ms, sys: 12.5 ms, total: 26.3 ms
Wall time: 27.7 ms


In [6]:
# Upload both input vectors to the GPU (x first, then y).
d_x, d_y = (cuda.to_device(host_arr) for host_arr in (x, y))
# Device-side output buffer with the same shape and dtype as x.
d_z = cuda.device_array_like(x)

In [7]:
# Threads per block.
block_size = 128
# Ceiling division of N by block_size: enough blocks for one thread per element.
num_blocks = (N + block_size - 1) // block_size

In [12]:
%%time
vecAdd_cuda[num_blocks, block_size](d_z, d_x, d_y)

CPU times: user 189 μs, sys: 0 ns, total: 189 μs
Wall time: 192 μs


In [9]:
# Copy the kernel's output from the device buffer d_z back to the host.
res = d_z.copy_to_host()

In [10]:
# Compare the GPU result against the NumPy reference. The tolerances are the
# defaults of np.allclose; unlike a bare assert, a failure here reports how
# many elements mismatch and the largest error, and is not stripped under -O.
np.testing.assert_allclose(res, z_ref, rtol=1e-5, atol=1e-8)