Python PyOpenCL-kron内核
主要代码:Python PyOpenCL-kron内核,python,c,kernel,pyopencl,Python,C,Kernel,Pyopencl,主要代码: from definitions import * import pyopencl as cl import numpy from time import time N = 16 num_iter = 10 a = numpy.random.rand(N,N).astype(numpy.float32) b = numpy.random.rand(N,N).astype(numpy.float32) print('Starting native iterations...')
from definitions import *
import pyopencl as cl
import numpy
from time import time
N = 16
num_iter = 10
a = numpy.random.rand(N,N).astype(numpy.float32)
b = numpy.random.rand(N,N).astype(numpy.float32)
print('Starting native iterations...')
startTime = time()
for i in range(num_iter):
c = numpy.kron(a, b)
endTime = time()
print(endTime - startTime)
print()
context = cl.create_some_context()
queue = cl.CommandQueue(context)
h_C = numpy.empty(c.shape).astype(numpy.float32)
d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=abig)
d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=bbig)
d_c = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, h_C.nbytes)
print('Starting GPU tiling iterations...')
kernelsource = open("../tiling.cl").read()
program = cl.Program(context, kernelsource).build()
kronecker = program.kronecker
kronecker.set_scalar_arg_dtypes([numpy.int32, None, None, None, None, None])
d_a = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=a)
d_b = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=b)
h_C = numpy.empty(c.shape).astype(numpy.float32)
startTime = time()
for n in range(num_iter):
# Work-group computes a block of C. This size is also set
# in a #define inside the kernel function. Note this blocksize
# must evenly divide the matrix order
blocksize = 16
A_block = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize*blocksize*blocksize)
B_block = cl.LocalMemory(numpy.dtype(numpy.float32).itemsize*blocksize*blocksize)
kronecker(queue, (N*N,N*N), (blocksize,blocksize), N, d_a, d_b, d_c, A_block, B_block)
queue.finish()
endTime = time()
cl.enqueue_copy(queue, h_C, d_c)
print(numpy.allclose(c,h_C))
print(endTime-startTime)
print()
内核代码:
__kernel void kronecker(
const unsigned int N,
__global const float* restrict A,
__global const float* restrict B,
__global float* restrict C,
__local float* restrict Awrk,
__local float* restrict Bwrk)
{
// This work-item will compute element C(i,j)
const int i = get_global_id(0);
const int j = get_global_id(1);
// Element C(i,j) is in block C(Iblk,Jblk)
const int Iblk = get_group_id(0);
const int Jblk = get_group_id(1);
// C(i,j) is element C(iloc, jloc) of block C(Iblk, Jblk)
const int iloc = get_local_id(0);
const int jloc = get_local_id(1);
// Construct an element of A and B to be shared between threads one element per thread.
Awrk[Iblk+Jblk*N] = A[Iblk+Jblk*N];
Bwrk[iloc+jloc*N] = B[iloc+jloc*N];
barrier(CLK_LOCAL_MEM_FENCE);
C[i+j*N*N] = Awrk[Iblk+Jblk*N] * Bwrk[iloc+jloc*N];
}
当N=16时(或当我可以选择blocksize=N
时),此代码100%工作。但由于硬件的限制,16是工作组维度的最大大小。我需要一种方法来泛化这段代码,以便处理任何显然不需要太多内存的N。问题是如果块大小
与N
不匹配,那么选择A和B元素的逻辑就不再合理了。具体而言,块和本地id移动得太快(在C
的blocksize-by-blocksize块之后,A
和B
的元素改变,而不是在N-by-N块之后)。解决这一问题的最佳方法是在处理id时在内核中添加模逻辑,还是嵌套内核调用,以便进一步划分每个工作组以匹配块大小