Python CUDA GPU的Numba代码编译失败,错误为:异常:访问冲突读取
我在为 CUDA GPU 编写 Python/Numba 代码时遇到了一个问题,我完全不知道自己做错了什么,也不知道是哪个组件出了故障。在 95% 的情况下,程序无法编译:每当我重新启动 Python 内核并尝试运行我编写的 CUDA 内核时,都会收到下面的错误(标签:python、gpu、numba)。
OSError Traceback (most recent call last)
in
6 data_isgomoku = np.load("data/isgomoku_moreThan5Good.npy")
7
----> 8 kernel_test_02[blocks, threads_per_block](envs_in_gpu, envs_out_gpu, random_states, data_isgomoku, proximities)
9 print ("Run OKAY")
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\compiler.py in __call__(self, *args)
757 '''
758 #print ("_call *args=",*args)
--> 759 kernel = self.specialize(*args)
760 cfg = kernel[self.griddim, self.blockdim, self.stream, self.sharedmem]
761 cfg(*args)
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\compiler.py in specialize(self, *args)
768 argtypes = tuple(
769 [self.typingctx.resolve_argument_type(a) for a in args])
--> 770 kernel = self.compile(argtypes)
771 return kernel
772
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\compiler.py in compile(self, sig)
787 self.definitions[(cc, argtypes)] = kernel
788 if self.bind:
--> 789 kernel.bind()
790 return kernel
791
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\compiler.py in bind(self)
528 Force binding to current CUDA context
529 """
--> 530 self._func.get()
531
532 @property
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\compiler.py in get(self)
405 cufunc = self.cache.get(device.id)
406 if cufunc is None:
--> 407 ptx = self.ptx.get()
408
409 # Link
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\compiler.py in get(self)
376 arch = nvvm.get_arch_option(*cc)
377 ptx = nvvm.llvm_to_ptx(self.llvmir, opt=3, arch=arch,
--> 378 **self._extra_options)
379 self.cache[cc] = ptx
380 if config.DUMP_ASSEMBLY:
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\cudadrv\nvvm.py in llvm_to_ptx(llvmir, **opts)
498 cu.add_module(libdevice.get())
499
--> 500 ptx = cu.compile(**opts)
501 # XXX remove debug_pubnames seems to be necessary sometimes
502 return patch_ptx_debug_pubnames(ptx)
~.conda\envs\tensorflow\lib\site-packages\numba\cuda\cudadrv\nvvm.py in compile(self, **options)
234 c_opts = (c_char_p * len(opts))(*[c_char_p(x.encode('utf8'))
235 for x in opts])
--> 236 err = self.driver.nvvmCompileProgram(self._handle, len(opts), c_opts)
237 self._try_error(err, 'Failed to compile\n')
238
OSError: exception: access violation reading 0x000001C3B042FFF8
据我判断,问题显然出在发送给 NVVM 编译器的代码上,因为正是这段代码导致编译器崩溃。我已经用 CUDA Toolkit 9.2、10.0、10.1、10.2 以及多个版本的 Numba 试过,但问题始终存在。
是否有人已经安装了Numba、NVIDIA CUDA Toolkit、Jupyter Notebook?请查看提供的代码和文件,看问题是否可以重现
有人知道我做错了什么吗
要在 Windows 上重现此问题,请重复以下步骤几次:(a)重新启动 Python 内核;(b)尝试运行下面的代码。代码:
%load_ext autoreload
%autoreload 2
import numpy as np
from IPython.display import clear_output
import time
import os
import sys
import pickle
import random
import numba
from numba import jit,cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
# constants
# MOVE_X and MOVE_O are the values passed as `color` to the device functions;
# NOTE(review): MOVE_E / MOVE_I look like "empty" / "invalid" — confirm.
MOVE_E = 0
MOVE_X = 1
MOVE_O = 2
MOVE_I = 3

# Sizes shared by the record layout below.  gb_calcPosFromXY maps (X, Y) with
# 0 <= X, Y < 19 to (5 + X) + 29 * (5 + Y), i.e. a 19 x 19 area inside a
# 29 x 29 grid.
_GRID_CELLS = 29 * 29
_AREA_CELLS = 19 * 19

# board representation: one packed record of int32 fields per board
gb_type = np.dtype([
    ('boardO', np.int32, (_GRID_CELLS,)),
    ('boardX', np.int32, (_GRID_CELLS,)),
    ('boardCommon', np.int32, (_GRID_CELLS,)),
    ('moveProximities', np.int32, (_GRID_CELLS,)),
    ('noOfMoves', np.int32),
    ('undo_moveProximitites', np.int32, (_AREA_CELLS, 5 * 5)),
    # per row: 0 - pos, 1 - lastMoveColor, posTypesX[movepos], posTypesO[movepos]
    ('undo_moves', np.int32, (_AREA_CELLS, 4)),
    ('doSaveUndoInformation', np.int32),
    ('lastMove', np.int32),
    ('lastMoveColor', np.int32),
    ('colorToMove', np.int32),
    ('gameEnded', np.int32),
    ('hashkey1', np.int32),
    ('hashkey2', np.int32),
    # bits: 0:gomoku, 1:D1, 2:D2, 3:A1, 4:A2, 5: W2, 6: A1xA2, 7: A2xA2, 8:A2 later A1
    ('posTypesX', np.int32, (_GRID_CELLS,)),
    ('posTypesO', np.int32, (_GRID_CELLS,)),
    ('undo_movePosTypesX', np.int32, (_AREA_CELLS, 8 * 5)),
    ('undo_movePosTypesO', np.int32, (_AREA_CELLS, 8 * 5)),
])
@cuda.jit(device=True)
def cuda_init(env, startFromCenter, startFromCentralHalf, proximities, random_states):
    """Reset one board record in place and seed its move-proximity map.

    Parameters
    ----------
    env : record of ``gb_type``
        Board record whose fields are overwritten.
    startFromCenter : bool
        When true, the neighbourhood of (9, 9) is marked in
        ``env.moveProximities``.
    startFromCentralHalf : bool
        Only consulted when ``startFromCenter`` is false: a random cell with
        X and Y in 4..14 is chosen and its neighbourhood is marked instead.
    proximities : 2-D int array
        Neighbour table forwarded to ``gb_updateMoveProximities``.
    random_states : xoroshiro128p state array
        Indexed with this thread's ``cuda.grid(1)`` position.
    """
    thread_id = cuda.grid(1)

    cuda_fillArrayWithZero(env.boardO)
    cuda_fillArrayWithZero(env.boardX)
    cuda_fillArrayWithZero(env.moveProximities)
    cuda_fillArrayWithZero(env.boardCommon)
    env.noOfMoves = 0

    # The 2-D undo tables are cleared element by element;
    # cuda_fillArrayWithZero only walks the first axis.
    # NOTE(review): undo_movePosTypesX / undo_movePosTypesO are not reset
    # here — confirm that is intended.
    for i in range(19 * 19):
        for j in range(4):
            env.undo_moves[i, j] = 0
        for j in range(5 * 5):
            env.undo_moveProximitites[i, j] = 0

    cuda_fillArrayWithZero(env.posTypesX)
    cuda_fillArrayWithZero(env.posTypesO)

    if startFromCenter:
        center = gb_calcPosFromXY(9, 9)
        gb_updateMoveProximities(env, center, proximities)
        env.moveProximities[center] = 1
    elif startFromCentralHalf:
        # xoroshiro128p_uniform_float32 yields values in [0, 1), so
        # int(11 * u) is 0..10 and each coordinate lands in 4..14 — the range
        # of the host-side reference random.randint(4, 18 - 4).  The previous
        # factor of 14 produced 4..17.
        X = 4 + int(11 * xoroshiro128p_uniform_float32(random_states, thread_id))
        Y = 4 + int(11 * xoroshiro128p_uniform_float32(random_states, thread_id))
        start = gb_calcPosFromXY(X, Y)
        gb_updateMoveProximities(env, start, proximities)
        env.moveProximities[start] = 1

    env.lastMove = -1
    env.lastMoveColor = 2
    env.colorToMove = 1
    env.doSaveUndoInformation = True
    env.gameEnded = False
    env.hashkey1 = 267425067
    env.hashkey2 = 576253428
    return
@cuda.jit(device=True)
def cuda_fillArrayWithZero(myarray):
    """Store 0 into every index along the first axis of ``myarray``."""
    length = myarray.shape[0]
    idx = 0
    while idx < length:
        myarray[idx] = 0
        idx += 1
@cuda.jit("i4(i4,i4)",device=True)
def gb_calcPosFromXY(X, Y):
    """Convert (X, Y) coordinates to a flat board position.

    Both coordinates are offset by 5 and the row stride is 29.  A coordinate
    outside 0..18 is only reported with a printed message; the position is
    still computed and returned.
    """
    if X < 0 or X >= 19:
        print ("Error calcPosFromXY, wrong X=" ,X)
    if Y < 0 or Y >= 19:
        print("Error calcPosFromXY, wrong Y=" ,Y)
    column = X + 5
    row = Y + 5
    return row * 29 + column
@cuda.jit( "UniTuple(i8, 2)(i4)",device=True)
def gb_calcXYFromPos(pos):
    """Convert a flat board position back to (X, Y) coordinates.

    Inverse of ``gb_calcPosFromXY``: the row stride is 29 and both
    coordinates carry an offset of 5.  Out-of-range input or output is only
    reported with a printed message; the computed pair is still returned.

    The explicit signature fixes ``pos`` to a 32-bit integer, so it can never
    be ``None``; the former ``pos is None`` branch was unreachable and has
    been removed.
    """
    if not (pos >= 0 and pos < 29 * 29):
        print ( "Error calcXYFromPos, wrong pos number=" , pos)
    X = pos % 29 - 5
    # pos - 5 - X equals pos - pos % 29, an exact multiple of 29, so integer
    # floor division gives the same row as the former float division without
    # a float64 round trip on the device.
    Y = pos // 29 - 5
    if not (X >= 0 and X < 19):
        print("Error calcXYFromPos, wrong X=", X)
    if not (Y >= 0 and Y < 19):
        print("Error calcXYFromPos, wrong Y=", Y)
    return X, Y
@cuda.jit(device=True)
def gb_updateMoveProximities(env, pos, proximities):
    """Mark the cells listed for ``pos`` in ``env.moveProximities``.

    ``proximities[pos]`` is read as a list of positive positions that ends at
    the first entry <= 0, or at the end of the row.

    * ``pos != -1``: every listed cell whose ``env.boardCommon`` entry is 0
      is set to 1.
    * ``pos == -1``: the list of the cell at (9, 9) is used instead, every
      listed cell is set to 1 without looking at ``boardCommon``, and the
      (9, 9) cell itself is set to 1 as well.
    """
    # Bound the scan by the row length: a row with no terminating entry would
    # otherwise be read past its end, and device code has no bounds checking.
    row_len = proximities.shape[1]
    k = 0
    if pos != -1:
        while k < row_len and proximities[pos, k] > 0:
            kPos = proximities[pos, k]
            if env.boardCommon[kPos] == 0:
                env.moveProximities[kPos] = 1
            k += 1
    else:
        center = gb_calcPosFromXY(9, 9)
        while k < row_len and proximities[center, k] > 0:
            kPos = proximities[center, k]
            env.moveProximities[kPos] = 1
            k += 1
        env.moveProximities[center] = 1
# CUDA kernel test 1 - init board
# define 100 boards
N=100
# Host-side array holding N zero-initialised gb_type records.
envs = np.zeros(N, dtype=gb_type)
threads_per_block = 64
# Calculate the number of thread blocks in the grid
# (ceiling division, so blocks * threads_per_block >= number of boards).
blocks = (envs.shape[0] + (threads_per_block - 1)) // threads_per_block
print ("threads_per_block = ", threads_per_block, "blockspergrid=", blocks)
# One RNG state per launched thread (threads_per_block * blocks), not per board.
random_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=1)
#kernel function
@cuda.jit()
def kernel_test_01(io_array, out_array, proximities, random_states):
    """Test kernel: initialise one board per thread and copy it out.

    Each thread whose 1-D grid position is inside ``io_array`` runs
    ``cuda_init`` on its record (start-from-center on, central-half off) and
    then stores that record at the same index of ``out_array``.
    """
    idx = cuda.grid(1)
    if idx >= io_array.size:
        # Surplus thread in the last block: nothing to initialise.
        return
    cuda_init(io_array[idx], True, False, proximities, random_states)
    out_array[idx] = io_array[idx]
# test kernel 1
# Fresh zeroed output records, one per input board.
envs_out = np.zeros((envs.shape[0]), dtype = gb_type)
# Copy both record arrays to the device before the launch.
envs_in_gpu = numba.cuda.to_device(envs)
envs_out_gpu = numba.cuda.to_device(envs_out)
#proximities = np.load("data/proximities_2.npy")
# All-zero stand-in for the table loaded above, so the test needs no data file.
proximities = np.zeros((841, 25), dtype=np.int32)
# `proximities` is handed over as a host NumPy array, unlike the two record
# arrays, which were transferred explicitly.
kernel_test_01[blocks, threads_per_block](envs_in_gpu, envs_out_gpu, proximities, random_states)
# Nothing is copied back from envs_out_gpu in this section.
print ("Run OKAY")
# CUDA kernel test 2 - call some more device functions
@cuda.jit(device=True)
def gb_calc_gindex(i1,i2,i3,i4,i5,i6,i7,i8,i9,i10):
    """Pack ten values into a single integer, ``i1`` most significant.

    The running value is shifted left by one bit before each further argument
    is added, so for 0/1 inputs the result is the 10-bit number
    ``i1 i2 ... i10``.
    """
    acc = (i1 << 1) + i2
    acc = (acc << 1) + i3
    acc = (acc << 1) + i4
    acc = (acc << 1) + i5
    acc = (acc << 1) + i6
    acc = (acc << 1) + i7
    acc = (acc << 1) + i8
    acc = (acc << 1) + i9
    acc = (acc << 1) + i10
    return int(acc)
@cuda.jit(device=True)
def isPosValid(pos):
    """Return True when ``pos`` decodes to coordinates inside 0..18 x 0..18.

    Uses the same layout as ``gb_calcPosFromXY``: row stride 29 and an offset
    of 5 on both axes.
    """
    i = pos % 29 - 5
    # pos - 5 - i equals pos - pos % 29, an exact multiple of 29, so integer
    # floor division yields the same row as the former float division.
    j = pos // 29 - 5
    return i >= 0 and i < 19 and j >= 0 and j < 19
@cuda.jit(device=True)
def _gb_line_is_gomoku(board, POS, step, data_isgomoku, direction):
    """Look up the ten cells around ``POS`` along one line in ``data_isgomoku``.

    The five cells before and the five cells after ``POS`` (spaced ``step``
    apart) are packed with ``gb_calc_gindex`` and used as a table index.
    ``direction`` is only echoed in the error message.  Returns False, without
    touching the table, when the index falls outside it.
    """
    index = gb_calc_gindex(
        board[POS - 5 * step], board[POS - 4 * step], board[POS - 3 * step],
        board[POS - 2 * step], board[POS - step],
        board[POS + step], board[POS + 2 * step], board[POS + 3 * step],
        board[POS + 4 * step], board[POS + 5 * step])
    # Valid indices are 0 .. shape[0] - 1; the former `>` test let
    # index == shape[0] through and the table was read regardless.
    if index < 0 or index >= data_isgomoku.shape[0]:
        print("Error gb_check_if_gomokuPos, wrong index=", index, direction)
        return False
    if data_isgomoku[index]:
        return True
    return False


@cuda.jit(device=True)
def gb_check_if_gomokuPos(env , POS, color, dontCheckPos , data_isgomoku):
    """Check the four lines through ``POS`` against the ``data_isgomoku`` table.

    Parameters
    ----------
    env : record of ``gb_type``
        Board record; ``boardO`` is read when ``color == MOVE_O``, otherwise
        ``boardX``.
    POS : int
        Flat board position to test.
    color : int
        Selects which board is inspected.
    dontCheckPos : bool
        When false, a cell that is non-zero on either board yields False.
    data_isgomoku : 1-D array
        Lookup table indexed by the packed 10-cell pattern.

    Returns
    -------
    bool
        True as soon as one line with a direct neighbour equal to 1 has a
        truthy table entry, otherwise False.  An invalid ``POS`` is reported
        and yields False.
    """
    if not isPosValid(POS):
        X,Y = gb_calcXYFromPos(POS)
        print("Error gb_check_if_gomokuPos, wrong X or Y=", X, Y)
        # The scans below read up to 150 cells either side of POS, which is
        # only inside the 29*29 board for a valid position, so stop here.
        return False

    # if occupied
    if not dontCheckPos:
        if env.boardX[POS] != 0 or env.boardO[POS] != 0:
            return False

    board = env.boardX
    if color == MOVE_O:
        board = env.boardO

    # A line is looked up only when a direct neighbour on it is set.
    # Steps: 1 = same row, 29 = same column, 30 and 28 = the two diagonals.
    if board[POS-1] == 1 or board[POS+1] == 1:
        if _gb_line_is_gomoku(board, POS, 1, data_isgomoku, 1):
            return True
    if board[POS-29] == 1 or board[POS+29] == 1:
        if _gb_line_is_gomoku(board, POS, 29, data_isgomoku, 2):
            return True
    if board[POS-30] == 1 or board[POS+30] == 1:
        if _gb_line_is_gomoku(board, POS, 30, data_isgomoku, 3):
            return True
    if board[POS-28] == 1 or board[POS+28] == 1:
        if _gb_line_is_gomoku(board, POS, 28, data_isgomoku, 4):
            return True
    return False
@cuda.jit()
def kernel_test_02(io_array, out_array, random_states, data_isgomoku,proximities):
    """Test-only kernel that exercises several device functions per board.

    For each in-range thread it initialises the board, marks the
    neighbourhood of position (12, 12), runs the gomoku check for MOVE_X at
    that position (the result is not used), and copies the record to
    ``out_array``.
    """
    idx = cuda.grid(1)
    if idx >= io_array.size:
        # Surplus thread in the last block: nothing to do.
        return
    pos = gb_calcPosFromXY(12, 12)
    env = io_array[idx]
    cuda_init(io_array[idx], True, False, proximities, random_states)
    gb_updateMoveProximities(env, pos, proximities)
    found = gb_check_if_gomokuPos(env, pos, MOVE_X, False, data_isgomoku)
    out_array[idx] = io_array[idx]
# test kernel 2
# Fresh zeroed output records and new device copies for the second launch.
envs_out = np.zeros((envs.shape[0]), dtype = gb_type)
envs_in_gpu = numba.cuda.to_device(envs)
envs_out_gpu = numba.cuda.to_device(envs_out)
#proximities = np.load("data/proximities_2.npy")
# All-zero stand-ins for the two data files, so the test needs no files.
proximities = np.zeros((841, 25), dtype=np.int32)
#data_isgomoku = np.load("data/isgomoku_moreThan5Good.npy")
# 1024 entries: one per value that gb_calc_gindex can pack from ten 0/1 cells.
data_isgomoku = np.zeros((1024), dtype=np.int32)
# Both lookup tables are handed over as host NumPy arrays.
kernel_test_02[blocks, threads_per_block](envs_in_gpu, envs_out_gpu, random_states, data_isgomoku, proximities)
print ("Run OKAY")
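%load_ext autoreload
%autoreload 2
import numpy as np
from IPython.display import clear_output
import time
import os
import sys
import pickle
import random
import numba
from numba import jit,cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_uniform_float32
# constants
MOVE_E = 0
MOVE_X = 1
MOVE_O = 2
MOVE_I = 3
# board representation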
gb_type = np.dtype([
    ('boardO',(np.int32,29*29)),
    ('boardX',(np.int32,29*29)),
    ('boardCommon',(np.int32,29*29)),
    ('moveProximities',(np.int32,29*29)),
    ('noOfMoves', np.int32),
    ('undo_moveProximitites', np.int32, (19 * 19, 5*5)),
    ('undo_moves', np.int32, (19 * 19, 4)), # 0 - pos, 1- lastMoveColor, posTypesX[movepos], posTypesO[movepos]
    ('doSaveUndoInformation', np.int32),
    ('lastMove', np.int32),
    ('lastMoveColor', np.int32),
    ('colorToMove', np.int32),
    ('gameEnded', np.int32),
    ('hashkey1', np.int32),
    ('hashkey2', np.int32),
    ('posTypesX',(np.int32,29*29)), # bitek: 0:gomoku, 1:D1, 2:D2, 3:A1, 4:A2, 5: W2, 6: A1xA2, 7: A2xA2, 8:A2 later A1
    ('posTypesO',(np.int32,29*29)),
    ('undo_movePosTypesX', np.int32, (19 * 19, 8*5)),
    ('undo_movePosTypesO', np.int32, (19 * 19, 8*5))
])
@cuda.jit(device=True)
def cuda_init(env, startFromCenter , startFromCentralHalf, proximities, random_states):
    "init the board's data and calculate proximities"
    thread_id = cuda.grid(1)
    cuda_fillArrayWithZero(env.boardO)
    cuda_fillArrayWithZero(env.boardX)
    cuda_fillArrayWithZero(env.moveProximities)
    cuda_fillArrayWithZero(env.boardCommon)
    env.noOfMoves = 0
    #env.undo_boardO[:,:] = 0
    #env.undo_boardX[:,:] = 0
    for i in range(19 * 19):
        for j in range(4):
            env.undo_moves[i,j] = 0
        for j in range(5*5):
            env.undo_moveProximitites[i,j] = 0
    cuda_fillArrayWithZero(env.posTypesX)
    cuda_fillArrayWithZero(env.posTypesO)
    if startFromCenter:
        gb_updateMoveProximities(env, gb_calcPosFromXY(9,9), proximities)
        env.moveProximities[gb_calcPosFromXY(9,9)] = 1
    elif startFromCentralHalf:
        X = 4+int(14 * xoroshiro128p_uniform_float32(random_states, thread_id))
        Y = 4 + int(14 * xoroshiro128p_uniform_float32(random_states, thread_id))
        #X = random.randint(4, 18 - 4)
        #Y = random.randint(4, 18 - 4)
        gb_updateMoveProximities(env, gb_calcPosFromXY(X, Y), proximities)
        env.moveProximities[gb_calcPosFromXY(X, Y)] = 1
    env.lastMove = -1
    env.lastMoveColor = 2
    env.colorToMove = 1
    env.doSaveUndoInformation = True
    env.gameEnded = False
    env.hashkey1 = 267425067
    env.hashkey2 = 576253428
    return
@cuda.jit(device=True)
def cuda_fillArrayWithZero(myarray):
    "fill array with zero"
    for i in range(myarray.shape[0]):
        myarray[i] = 0
    return
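@cuda.jit("i4(i4,i4)",device=True)
def gb_calcPosFromXY( X, Y):
    "convert coordinates"
    if not(X>=0 and X<19): print ("Error calcPosFromXY, wrong X=" ,X)
    if not(Y >= 0 and Y < 19): print("Error calcPosFromXY, wrong Y=" ,Y)
    return (5+X) + 29*(5+Y)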
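@cuda.jit( "UniTuple(i8, 2)(i4)",device=True)
def gb_calcXYFromPos( pos):
    "convert coordinates"
    if pos is None:
        return -1, -1
    if not(pos>=0 and pos < 29*29): print ( "Error calcXYFromPos, wrong pos number=" , pos)
    X = pos % 29 - 5
    Y = int (((pos - 5 - X) / 29)) - 5
    if not(X >= 0 and X < 19): print("Error calcXYFromPos, wrong X=", X)
    if not(Y >= 0 and Y < 19): print("Error calcXYFromPos, wrong Y=", Y)
    return X,Y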
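@cuda.jit(device=True)
def gb_updateMoveProximities(env, pos, proximities):
    "set some positions to 1 using the coordinates in the pre-calculated proximities array"
    if pos != -1:
        k=0
        while proximities[pos, k]>0:
            kPos = proximities[pos, k]
            #print (kPos, gb_calcXYFromPos(kPos))
            if env.boardCommon[kPos] == 0:
                env.moveProximities[kPos] = 1
            k += 1
    else:
        pos = gb_calcPosFromXY(9,9)
        k = 0
        while proximities[pos, k] > 0:
            kPos = proximities[pos, k]
            env.moveProximities[kPos] = 1
            k += 1
        env.moveProximities[pos] = 1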
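# CUDA kernel test 1 - init board
# define 100 boards
N=100
envs = np.zeros(N, dtype=gb_type)
threads_per_block = 64
# Calculate the number of thread blocks in the grid
blocks = (envs.shape[0] + (threads_per_block - 1)) // threads_per_block
print ("threads_per_block = ", threads_per_block, "blockspergrid=", blocks)
random_states = create_xoroshiro128p_states(threads_per_block * blocks, seed=1)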
#核函数
@cuda.jit()
def kernel_test_01(io_数组、out_数组、近邻、随机_状态):
线程id=cuda.grid(1)
如果线程id