Matrix 访问矩阵作为其在CUDA中的平铺矩阵交互应用中的转换

Matrix 访问矩阵作为其在CUDA中的平铺矩阵交互应用中的转换,matrix,cuda,multiplication,Matrix,Cuda,Multiplication,我目前正在试验CUDA,我从矩阵乘法的答案中发现了这个内核: 我不想做A*B,而是做A_转置*A,但不保存A_转置(仅将矩阵A作为内核的输入)。我必须正确地设置索引,但是我被这个矩阵表示法弄糊涂了。任何帮助都将不胜感激。您需要的大部分是和 在第一个链接中,确定AxAT涉及矩阵A行的内积,类似地,ATxA将涉及矩阵A列的内积。还请注意对称声明。在第二个链接中(在编程指南中从该点向下滚动一点),您将发现一个完整的平铺矩阵乘法。您只需要按列索引到这两个分幅中 下面是一个工作示例,使用以下代码: $ca

我目前正在试验CUDA,我从矩阵乘法的答案中发现了这个内核:

我不想做A*B,而是做A_转置*A,但不保存A_转置(仅将矩阵A作为内核的输入)。我必须正确地设置索引,但是我被这个矩阵表示法弄糊涂了。任何帮助都将不胜感激。

您需要的大部分是和

在第一个链接中,确定AxAT涉及矩阵A行的内积,类似地,ATxA将涉及矩阵A列的内积。还请注意对称声明。在第二个链接中(在编程指南中从该点向下滚动一点),您将发现一个完整的平铺矩阵乘法。您只需要按列索引到这两个分幅中

下面是一个工作示例,使用以下代码:

$ cat t1654.cu
#include <iostream>
#include <cstdio>
#include <cstdlib>

const int TILE_DIM = 32;
// Computes C = A^T * A without ever materializing A^T, using shared-memory tiling.
// A is ARows x ACols, row-major; C is ACols x ACols (row-major).
// Launch precondition: blockDim must be (TILE_DIM, TILE_DIM) — the code uses
// blockDim.x/blockDim.y interchangeably with TILE_DIM — and the grid must cover
// the ACols x ACols output.
template <typename T>
__global__ void ATA(const T * __restrict__  A, T * __restrict__  C, int ARows, int ACols)
{
    T CValue = 0;   // per-thread accumulator for one element of C

    // Both Row and Col select *columns* of A, because
    // C[Row][Col] = dot(column Row of A, column Col of A).
    int Row = blockIdx.y*TILE_DIM + threadIdx.y;
    int Col = blockIdx.x*TILE_DIM + threadIdx.x;

    __shared__ T As[TILE_DIM][TILE_DIM];
    __shared__ T Bs[TILE_DIM][TILE_DIM];

    // March down the rows of A — the reduction dimension of A^T*A —
    // one TILE_DIM-high tile per iteration (ceil(ARows / TILE_DIM) steps).
    for (int k = 0; k < (TILE_DIM + ARows - 1)/TILE_DIM; k++) {

         // As: the tile of A whose columns are chosen by blockIdx.y (these act
         // as the "rows" of A^T). Out-of-range elements are zero-padded so the
         // inner product below needs no bounds checks.
         if (k*TILE_DIM + threadIdx.y < ARows && blockIdx.y*blockDim.y+threadIdx.x < ACols)
             As[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + blockIdx.y*blockDim.y+threadIdx.x];
         else
             As[threadIdx.y][threadIdx.x] = 0.0;

         // Bs: the tile of A whose columns are chosen by blockIdx.x (via Col).
         if (k*TILE_DIM + threadIdx.y < ARows && Col < ACols)
             Bs[threadIdx.y][threadIdx.x] = A[(k*TILE_DIM + threadIdx.y)*ACols + Col];
         else
             Bs[threadIdx.y][threadIdx.x] = 0.0;

         __syncthreads();  // tiles fully written before any thread reads them

         // Column-by-column inner product: both tiles are traversed down their
         // first (row) index n, i.e. along columns of A — this is what makes the
         // product A^T*A rather than A*B.
         for (int n = 0; n < TILE_DIM; ++n)
             CValue += As[n][threadIdx.y] * Bs[n][threadIdx.x];

         __syncthreads();  // all reads done before the next iteration overwrites the tiles
    }

    // Guard the grid tail; the write index is equivalent to Row*ACols + Col.
    if (Row < ACols && Col < ACols)
        C[((blockIdx.y * blockDim.y + threadIdx.y)*ACols) +
           (blockIdx.x * blockDim.x)+ threadIdx.x] = CValue;
}

// Out-of-place transpose of a square dim x dim row-major matrix,
// one thread per element: out[c][r] = in[r][c].
// Reads are coalesced (column index varies with threadIdx.x); writes are strided.
template <typename T>
__global__ void transpose_naive(const T * __restrict__ in, T * __restrict__ out, const int dim){
  const int x = threadIdx.x + blockDim.x * blockIdx.x;   // source column
  const int y = threadIdx.y + blockDim.y * blockIdx.y;   // source row
  if ((x >= dim) || (y >= dim)) return;                  // guard the grid tail
  out[x * dim + y] = in[y * dim + x];
}

// Reference matrix multiply C = A * B, one thread per output element.
// A is rowA x colA, B is colA x colB, C is rowA x colB (all row-major).
template <typename T>
__global__ void mm_naive(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int rowA, const int colA, const int colB){
  const int c = threadIdx.x + blockDim.x * blockIdx.x;   // output column
  const int r = threadIdx.y + blockDim.y * blockIdx.y;   // output row
  if ((r >= rowA) || (c >= colB)) return;                // guard the grid tail
  T acc = 0;
  for (int k = 0; k < colA; ++k)
    acc += A[r * colA + k] * B[k * colB + c];
  C[r * colB + c] = acc;
}


typedef float mt;   // element type used throughout the test driver

// Abort with file/line context when a CUDA runtime call fails.
// Kernel launches return no status directly, so each launch is followed by
// CUDA_CHECK(cudaGetLastError()) to surface launch-configuration errors.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Test driver: computes A^T*A two ways — the fused ATA kernel vs. an explicit
// transpose followed by a naive multiply — and compares the results elementwise.
int main(){

  mt *d_A, *d_B, *d_C, *h_A, *h_C, *h_C1;
  int m = 64;   // rows of A
  int n = 64;   // cols of A, and the dimension of the n x n result
  h_A  = new mt[m*n];
  h_C  = new mt[n*n];
  h_C1 = new mt[n*n];
  CUDA_CHECK(cudaMalloc(&d_A, m*n*sizeof(d_A[0])));
  CUDA_CHECK(cudaMalloc(&d_B, m*n*sizeof(d_A[0])));
  CUDA_CHECK(cudaMalloc(&d_C, n*n*sizeof(d_C[0])));
  // test 1: A = identity, so A^T*A should be the identity as well
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      h_A[i*n+j] = (i==j)?1.0f:0.0f;
  CUDA_CHECK(cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice));
  dim3 block(TILE_DIM, TILE_DIM);                          // ATA requires blockDim == (TILE_DIM, TILE_DIM)
  dim3 grid((n+block.x-1)/block.x, (n+block.y-1)/block.y); // cover the n x n output
  ATA<<<grid,block>>>(d_A, d_C, m, n);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost));
#ifdef DEBUG
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++)
      std::cout << h_C[i*n+j] << " ";
    std::cout << std::endl;}
  std::cout << std::endl;
#endif
  // test 2: small random integer entries — all products and sums here are
  // exactly representable in float, so exact equality below is meaningful
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      h_A[i*n+j] = rand()%10;
  CUDA_CHECK(cudaMemcpy(d_A, h_A, m*n*sizeof(d_A[0]), cudaMemcpyHostToDevice));
  ATA<<<grid,block>>>(d_A, d_C, m, n);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpy(h_C, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost));
#ifdef DEBUG
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++)
      std::cout << h_C[i*n+j] << " ";
    std::cout << std::endl;}
  std::cout << std::endl;
#endif
  // reference result: explicit transpose, then naive multiply C1 = (A^T) * A
  // (the n x n grid also covers the transpose since m == n here)
  transpose_naive<<<grid,block>>>(d_A, d_B, n);
  CUDA_CHECK(cudaGetLastError());
  mm_naive<<<grid,block>>>(d_B, d_A, d_C, n, n, n);
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpy(h_C1, d_C, n*n*sizeof(d_C[0]), cudaMemcpyDeviceToHost));
#ifdef DEBUG
  for (int i = 0; i < n; i++){
    for (int j = 0; j < n; j++)
      std::cout << h_C1[i*n+j] << " ";
    std::cout << std::endl;}
  std::cout << std::endl;
#endif
  // report only the first mismatch, as the original did, then fall through to cleanup
  for (int i = 0; i < n*n; i++)
    if (h_C[i] != h_C1[i]) {
      std::cout << "mismatch at: " << i << " was: " << h_C[i] << " should be: " << h_C1[i] << std::endl;
      break;
    }
  // release device and host resources (the original leaked all six allocations)
  CUDA_CHECK(cudaFree(d_A));
  CUDA_CHECK(cudaFree(d_B));
  CUDA_CHECK(cudaFree(d_C));
  delete[] h_A;
  delete[] h_C;
  delete[] h_C1;
  return 0;
}
$ nvcc -o t1654 t1654.cu
$ cuda-memcheck ./t1654
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$