矩阵向量积CUDA通过平铺和共享内存提高性能
你好,我在编写一个关于矩阵向量积的CUDA内核,希望通过分块(tiling)和共享内存来提高性能。问题在于,使用此代码时,M矩阵或N向量的加载不正确。您知道如何将M和N的分块加载到共享内存数组中吗?其中M是矩阵,N是向量,P是矩阵向量积的结果。
// Tiled matrix-vector product: P = M * N.
//   M : (gridDim.y * BLOCK_SIZE) x Mw row-major matrix
//   N : vector of length Mw
//   P : result vector of length gridDim.y * BLOCK_SIZE
// Launch with block = (BLOCK_SIZE, BLOCK_SIZE) and grid = (1, rows/BLOCK_SIZE).
// Assumes Mw is a multiple of BLOCK_SIZE (no tail guard). Nw is unused here
// (N is a vector); the parameter is kept for interface compatibility.
__global__ void matrixMul( float* P, float* M, float* N, int Mw, int Nw)
{
    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;

    __shared__ float Ms[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Ns[BLOCK_SIZE];

    // ===================================================================
    // Code segment 1
    // Determine the update values for the tile indices in the loop
    // ===================================================================
    int mBegin = Mw * BLOCK_SIZE * by;   // first element of this block's rows
    int mEnd   = mBegin + Mw - 1;        // last element of the block's first row
    int mStep  = BLOCK_SIZE;             // M advances one tile of columns
    int nBegin = BLOCK_SIZE * bx;        // bx is 0 for a vector (grid is 1 wide)
    // N must advance in lockstep with M's column tiles, not one element at
    // a time — this was the original bug (nStep = 1).
    int nStep  = BLOCK_SIZE;

    float Psub = 0.0f;

    // ===================================================================
    // Code segment 2
    // Accumulate the dot product one BLOCK_SIZE-wide tile at a time
    // ===================================================================
    for (int m = mBegin, n = nBegin; m <= mEnd; m += mStep, n += nStep) {
        // Cooperative loads: thread (ty,tx) fetches one element of the
        // BLOCK_SIZE x BLOCK_SIZE tile of M (row ty, column tx of the tile);
        // the ty == 0 row fetches the matching BLOCK_SIZE slice of N.
        Ms[ty][tx] = M[m + ty * Mw + tx];
        if (ty == 0)
            Ns[tx] = N[n + tx];
        // Tiles must be fully populated before any thread reads them.
        __syncthreads();
        // Partial dot product of row ty of the M tile with the N tile.
        for (int i = 0; i < BLOCK_SIZE; i++) {
            Psub += Ms[ty][i] * Ns[i];
        }
        // Finish reading the tiles before the next iteration overwrites them.
        __syncthreads();
    }

    // ===================================================================
    // Code segment 3
    // Store the data back to global memory
    // ===================================================================
    // One result per matrix row; a single thread per row performs the store.
    if (tx == 0)
        P[BLOCK_SIZE * by + ty] = Psub;
}
__global__ void matrixMul( float* P, float* M, float* N, int Mw, int Nw)
{
    int bx = blockIdx.x; int by = blockIdx.y;
    int tx = threadIdx.x; int ty = threadIdx.y;
    __shared__ float Ms[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Ns[BLOCK_SIZE];
    // ===================================================================
    // 代码段1:确定循环中分块索引的更新值
    // ===================================================================
    int mBegin = Mw * BLOCK_SIZE * by;
    int mEnd = mBegin + Mw - 1;
    int mStep = BLOCK_SIZE;
    int nBegin = BLOCK_SIZE * bx;
    //int nStep = BLOCK_SIZE*Nw;
    int nStep = 1;
    float Psub = 0.0f;
    // ===================================================================
    // 代码段2:在分块内进行矩阵乘法
    // ===================================================================
    for (int m = mBegin, n = nBegin; m <= mEnd; m += mStep, n += nStep) { /* ……同上,此处为上方内核的重复 */ }

我发现了一个类似的例子(注意,它处理的是大小相同的方阵),它也会将部分矩阵加载到共享内存中。看来您的声明是正确的,问题可能只是归结为您用来确定哪些元素去向何处的索引代数。
// Tiled matrix-matrix multiply: Pd = Md * Nd, all Width x Width, row-major.
// Launch with block = (TILE_WIDTH, TILE_WIDTH) and
// grid = (Width/TILE_WIDTH, Width/TILE_WIDTH).
// Assumes Width is a multiple of TILE_WIDTH (no boundary guard).
// Note: the scraped original read "__shared__float" (missing space), which
// does not compile; fixed here.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width){
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH]; // Shared memory
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH]; // declarations
    int bx = blockIdx.x; int by = blockIdx.y; // ID thread
    int tx = threadIdx.x; int ty = threadIdx.y;
    // Identify the row and column of the Pd element to work on
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;
    float Pvalue = 0; // REGISTER!
    // Loop over the Md and Nd tiles required to compute the Pd element
    for (int m = 0; m < Width/TILE_WIDTH; ++m) {
        // Collaborative loading of Md and Nd tiles into shared memory:
        // each thread fetches one element of each tile.
        Mds[ty][tx] = Md[Row*Width + (m*TILE_WIDTH + tx)];
        Nds[ty][tx] = Nd[Col + (m*TILE_WIDTH + ty)*Width];
        __syncthreads(); // tiles fully loaded before any thread reads them
        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += Mds[ty][k] * Nds[k][tx];
        __syncthreads(); // finish reading before the next iteration reloads
    }
    Pd[Row*Width+Col] = Pvalue;
}
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width){
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH]; // 共享内存
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH]; // 声明
    int bx = blockIdx.x; int by = blockIdx.y; // 线程ID
    int tx = threadIdx.x; int ty = threadIdx.y;
    // 确定要处理的Pd元素的行和列
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;
    float Pvalue = 0; // 寄存器!
    // 循环遍历计算Pd元素所需的Md和Nd分块
    for (int m = 0; m < Width/TILE_WIDTH; ++m) { /* ……同上,此处为上方内核的重复 */ }