CUDA API Error On Python With Numba
Solution 1:
I think there may be 2 problems.
I'm not sure your use of
LOOKUP_TABLE = cuda.to_device(
outside of main is valid. I guess you are trying to create a device array, but I think you should be using numba.cuda.device_array()
for that. You don't seem to be transferring the array
y
to the device properly for use.
When I make those two changes, the code seems to run without CUDA runtime error for me:
# cat t1.py
import numpy as np
import numba as nb
from timeit import default_timer as timer
# from matplotlib import pyplot as pt
import math
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32
"""
Look up table for factorial
"""
"""
arr_sum - sum element in array
"""
@cuda.jit(device=True)
def arr_sum(arr):
    """Return the sum of every element of ``arr``.

    Device helper. The loop is not indexed by thread id, so each calling
    thread walks the whole array on its own and gets its own total.
    """
    total = 0
    for idx in range(arr.size):
        total += arr[idx]
    return total
"""
dot - dot product of arr1 and arr2
"""
@cuda.jit(device=True)
def dot(arr1, arr2):
    """Return the dot product of ``arr1`` and ``arr2``.

    Device helper. Iterates over ``arr1.size`` elements, so ``arr2`` must be
    at least as long as ``arr1``. Each calling thread performs the full
    accumulation itself.
    """
    acc = 0
    for idx in range(arr1.size):
        acc += arr1[idx] * arr2[idx]
    return acc
"""
arr_div - divide element in array
"""
@cuda.jit(device=True)
def arr_div(arr, div):
    """Divide this thread's element of ``arr`` by ``div`` in place.

    Device helper. The element is selected by ``cuda.threadIdx.x``; no other
    element is touched by the calling thread.
    """
    idx = cuda.threadIdx.x
    arr[idx] /= div
"""
SIR module (sample_draw) - module drawing sample for time t (rampling model)
"""
@cuda.jit(device=True)
def sirModule_sample_draw(rng_states, inp, beta, omega, out):
    """Draw this thread's sample by rejection until it is below 1.

    Each candidate is ``inp[tid] + beta + omega * N(0, 1)``, where the normal
    variate comes from ``xoroshiro128p_normal_float32`` using the RNG state
    at index ``tid`` (``cuda.threadIdx.x``). The first candidate that is
    strictly less than 1 is written to ``out[tid]``.
    """
    tid = cuda.threadIdx.x
    # The deterministic part of the candidate does not change between draws.
    shifted = inp[tid] + beta
    while True:
        candidate = shifted + omega * xoroshiro128p_normal_float32(rng_states, tid)
        if candidate < 1:
            out[tid] = candidate
            return
"""
SIR module (weight calculation) - weight calculation method
"""
@cuda.jit(device=True)
def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma, lt):
    """Compute this thread's updated weight and store it in ``out``.

    The element handled is selected by ``cuda.threadIdx.x``.

    Parameters:
        current, previous: per-particle state arrays for this step and the
            one before it.
        weight: per-particle weights from the previous step.
        out: per-particle array receiving ``weight * p2 * p1_div_p3``.
        y: integer observation; used both as an exponent and as an index
            into ``lt``, so ``lt`` must have at least ``y + 1`` entries.
        beta, omega, gamma: scalar model parameters.
        lt: lookup table indexed by ``y`` (presumably factorials -- confirm
            against the caller).
    """
    tid = cuda.threadIdx.x
    # Standardised distance between the current state and the shifted
    # previous state.
    z = (current[tid] - (previous[tid] + beta)) / omega
    # NOTE(review): a standard normal CDF would be 0.5 * (1 + erf(z / sqrt(2)));
    # the sqrt(2) scaling is absent here. Left unchanged -- confirm intent.
    p1_div_p3 = 0.5 * (1.0 + math.erf(z))
    mu = math.log(1 + math.exp(gamma * current[tid]))
    # NOTE(review): if this is meant to be a Poisson pmf it would use
    # exp(-mu), not exp(mu). Left unchanged -- confirm intent.
    p2 = math.exp(mu) * mu**y / lt[y]
    out[tid] = weight[tid] * p2 * p1_div_p3
"""
SIR module (phi distribution calculator)
"""
@cuda.jit(device=True)
def sirModule_tau(current, beta, omega, phi, phi_sub):
    """Fill this thread's entries of ``phi`` and ``phi_sub``.

    With ``z = (1 - (current[tid] + beta)) / omega``, writes
    ``0.5 * (1 + erf(z))`` to ``phi[tid]`` and one minus the stored value to
    ``phi_sub[tid]``. ``tid`` is ``cuda.threadIdx.x``.
    """
    tid = cuda.threadIdx.x
    z = (1 - (current[tid] + beta)) / omega
    phi[tid] = 0.5 * (1.0 + math.erf(z))
    # Read back from phi so the complement is based on the stored value.
    phi_sub[tid] = 1 - phi[tid]
@cuda.jit
def SIR(rng_states, y, particles, weight, beta, omega, gamma,
        greater, equal, phi, phi_sub, lt):
    """Run the SIR update over every time step of one event.

    Launch layout: one block per event (``cuda.blockIdx.x`` indexes the first
    axis of ``y``, ``particles``, ``weight``, ``greater`` and ``equal``) and
    one thread per particle (``cuda.threadIdx.x`` indexes the last axis of
    ``particles`` and ``weight``).

    For each time step ``i >= 1`` the block draws new particles, computes and
    normalises their weights, fills ``phi`` / ``phi_sub`` and updates
    ``greater[event][i]`` and ``equal[event][i]``.

    Block-wide barriers separate the phases in which a thread reads elements
    written by other threads of the same block; without them the weight sum
    and the dot products race with those writes.

    NOTE(review): ``phi``, ``phi_sub`` and ``rng_states`` are indexed by
    thread id only, so all blocks share the same entries. ``syncthreads``
    does not order accesses across blocks; giving each block its own slice
    would need a change to the buffers the caller allocates.
    """
    event = cuda.blockIdx.x  # block id in a 1D grid = event index
    # observations for this event, one per time step
    y_current = y[event]
    # number of time steps
    tn = y_current.size
    for i in range(1, tn):
        # draw samples for step i from the particles of step i-1
        sirModule_sample_draw(rng_states, particles[event][i-1], beta,
                              omega, particles[event][i])
        # unnormalised weight of this thread's particle
        sirModule_weight(particles[event][i], particles[event][i-1],
                         weight[event][i-1], weight[event][i],
                         y_current[i], beta, omega, gamma, lt)
        # every thread sums the whole weight row, so all writes must land first
        cuda.syncthreads()
        weight_sum = arr_sum(weight[event][i])
        # no thread may rescale its weight while another is still summing
        cuda.syncthreads()
        # normalise weight
        arr_div(weight[event][i], weight_sum)
        # calculate tau
        sirModule_tau(particles[event][i], beta, omega, phi, phi_sub)
        # the dot products below read every thread's phi / phi_sub entry
        cuda.syncthreads()
        # update greater and equal (all threads store the same value)
        greater[event][i] = greater[event][i-1]*dot(weight[event][i-1], phi)
        equal[event][i] = greater[event][i-1]*dot(weight[event][i-1], phi_sub)
def main():
    """Allocate device buffers, launch the ``SIR`` kernel and report its time.

    The launch uses ``event_number`` blocks of ``pn`` threads. Kernel launches
    return before the kernel finishes, so the device is synchronised before
    the timer is read; otherwise only the launch overhead would be measured.
    The reported time also includes JIT compilation on this first call.
    """
    beta = 1
    omega = 1
    gamma = 2
    pn = 100
    event_number = 50
    timestep = 100
    # Factorials 0! .. 20!, used as a lookup table inside the kernel.
    LOOKUP_TABLE = cuda.to_device(np.array([
        1, 1, 2, 6, 24, 120, 720, 5040, 40320,
        362880, 3628800, 39916800, 479001600,
        6227020800, 87178291200, 1307674368000,
        20922789888000, 355687428096000, 6402373705728000,
        121645100408832000, 2432902008176640000], dtype='int64'))
    hy = np.ones((event_number, timestep), dtype=np.uint32)
    print(hy.size)
    print(hy)
    y = cuda.to_device(hy)
    particles = cuda.to_device(
        np.zeros((event_number, timestep, pn), dtype=np.float32))
    weight = cuda.to_device(
        np.ones((event_number, timestep, pn), dtype=np.float32))
    greater = cuda.to_device(
        np.ones((event_number, timestep), dtype=np.float32))
    equal = cuda.to_device(
        np.ones((event_number, timestep), dtype=np.float32))
    # One entry per particle; pn is the length of the last axis of particles.
    phi = cuda.to_device(np.zeros(pn, dtype=np.float32))
    phi_sub = cuda.to_device(np.zeros(pn, dtype=np.float32))
    rng_states = create_xoroshiro128p_states(pn, seed=1)
    start = timer()
    SIR[event_number, pn](rng_states, y, particles, weight, beta, omega,
                          gamma, greater, equal, phi, phi_sub, LOOKUP_TABLE)
    # Wait for the kernel to finish before stopping the clock.
    cuda.synchronize()
    vectoradd_time = timer() - start
    print("sirModule1 took %f seconds" % vectoradd_time)


if __name__ == '__main__':
    main()
# cuda-memcheck python t1.py
========= CUDA-MEMCHECK
5000
[[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]
...,
[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]]
sirModule1 took 0.840958 seconds
========= ERROR SUMMARY: 0 errors
#
Solution 2:
Solved! I am working on Ubuntu 16.04. When I installed Numba for the first time, the numba.cuda functions worked fine. However, I later encountered errors like these:
raise CudaAPIError(retcode, msg)
CudaAPIError: Call to cuMemcpyHtoD results in CUDA_ERROR_LAUNCH_FAILED
These errors are encountered when you put your system on 'suspend'. In order to avoid such errors, restart your system or don't suspend.
Post a Comment for "CUDA API Error On Python With Numba"