Cuda 如何使用WMMA函数？；_Cuda_Gpgpu

Cuda 如何使用WMMA函数？；

cuda

Cuda 如何使用WMMA函数？,cuda,gpgpu,Cuda,Gpgpu,我已经运行了代码，但得到了意想不到的结果。那么应该如何使用wmma函数呢？是我的wmma::load_matrix_sync用错了吗？还是有其他需要注意的地方？ WMMA_M,WMMA_N,WMMA_K = 16 __global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size) { //Declare the fragment wmma::fragment<wmma::matrix_a, WMMA_M

我已经运行了，但是我得到了意想不到的结果。那么如何使用wmma函数呢？我的

wmma::load_matrix_sync

错误吗？或者其他我们应该注意的事情

WMMA_M,WMMA_N,WMMA_K = 16

// Multiplies one WMMA_M x WMMA_K tile of `a` by one WMMA_K x WMMA_N tile of `b`
// with the WMMA API and writes the float result tile to `c`.
//
//   a           - input tile, read as column-major half values.
//   b           - input tile, read as row-major half values.
//   c           - output tile, written as row-major float values.
//   matrix_size - kept for interface compatibility; not used by this kernel.
//
// The multiply-accumulate is repeated kRepeats times into the same
// accumulator, so the values stored to `c` are kRepeats * (a * b), not a * b.
__global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size)
{
  // Integer trip count. The original compared the int counter against the
  // double literal 1e4, which forces an int -> double conversion and an FP64
  // comparison on every iteration in device code.
  const int kRepeats = 10000;

  // Declare the fragments
  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;

  // Load the input tiles. The third argument is the leading dimension of the
  // matrix in memory:
  //   A is column-major M x K -> consecutive columns are WMMA_M elements apart.
  //   B is row-major    K x N -> consecutive rows are WMMA_N elements apart
  //     (the original passed WMMA_K, which is only right while K == N).
  wmma::load_matrix_sync(a_frag, a, WMMA_M);
  wmma::load_matrix_sync(b_frag, b, WMMA_N);

  // Start from a zero accumulator
  wmma::fill_fragment(acc_frag, 0.0f);

  // Each pass computes acc = a * b + acc
  for (int i = 0; i < kRepeats; ++i)
    {
      wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
    }

  // Store the result. C is row-major M x N, so its leading dimension is
  // WMMA_N (the original passed WMMA_M, which is only right while M == N).
  wmma::store_matrix_sync(c, acc_frag, WMMA_N, wmma::mem_row_major);
}

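WMMA_M,WMMA_N,WMMA_K = 16

__global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size)
{
  //Declare the fragment
  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,float> acc_frag;

  //Load the matrix to fragment
  wmma::load_matrix_sync(a_frag, a, WMMA_M);
  wmma::load_matrix_sync(b_frag, b, WMMA_K);

  //perform mma
  wmma::fill_fragment(acc_frag, 0.0f);

  for(int i=0; i<1e4; i++)
    {
      wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
    }

  //store the result
  wmma::store_matrix_sync(c, acc_frag, WMMA_M, wmma::mem_row_major);
}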
在主机端不能简单地直接给 half 类型的变量赋值。
我建议切换到CUDA 10，它让 half 数据类型的使用方便了很多。
不过，无论使用CUDA 9.2还是CUDA 10，下面的示例都应该同样可以正常运行：
$ cat t304.cu
#include <mma.h>
#include <iostream>

using namespace nvcuda;

// Computes c = a * b for a single 16x16x16 tile using the WMMA API.
//
//   a - 16x16 half tile, read as column-major with leading dimension 16.
//   b - 16x16 half tile, read as row-major with leading dimension 16.
//   c - 16x16 float tile, written as row-major with leading dimension 16.
//
// NOTE: the accompanying listing launches this with one block of 32 threads
// and builds it with -arch=sm_70.
__global__ void wmma_ker(half *a, half *b, float *c) {
   // Edge length shared by the M, N and K dimensions, and by every
   // leading dimension used below.
   const int kTile = 16;

   // Operand and accumulator fragments
   wmma::fragment<wmma::matrix_a, kTile, kTile, kTile, half, wmma::col_major> lhs_tile;
   wmma::fragment<wmma::matrix_b, kTile, kTile, kTile, half, wmma::row_major> rhs_tile;
   wmma::fragment<wmma::accumulator, kTile, kTile, kTile, float> out_tile;

   // Bring both inputs into their fragments
   wmma::load_matrix_sync(lhs_tile, a, kTile);
   wmma::load_matrix_sync(rhs_tile, b, kTile);

   // Zero the accumulator, then out = lhs * rhs + out
   wmma::fill_fragment(out_tile, 0.0f);
   wmma::mma_sync(out_tile, lhs_tile, rhs_tile, out_tile);

   // Write the product back to memory
   wmma::store_matrix_sync(c, out_tile, kTile, wmma::mem_row_major);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Host driver: fills two 16x16 half tiles with 1.0, multiplies them on the
// device with wmma_ker, and prints the 256 float results separated by commas.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main(){

  const int kElems = 16*16;

  // Device pointers start out null so the cleanup below is safe even when an
  // allocation fails part-way through.
  half *d_a = 0, *d_b = 0;
  float *d_c = 0;

  float *h_c = new float[kElems];
  half *h_b = new half[kElems];
  half *h_a = new half[kElems];
  for (int i = 0; i < kElems; i++) {
    h_a[i] = 1.0f;
    h_b[i] = 1.0f;}

  // && short-circuits, so the first failure stops the chain and is reported.
  bool ok = cudaOk(cudaMalloc(&d_a, kElems*sizeof(half)), "cudaMalloc(d_a)")
         && cudaOk(cudaMalloc(&d_b, kElems*sizeof(half)), "cudaMalloc(d_b)")
         && cudaOk(cudaMalloc(&d_c, kElems*sizeof(float)), "cudaMalloc(d_c)")
         && cudaOk(cudaMemcpy(d_a, h_a, kElems*sizeof(half), cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
         && cudaOk(cudaMemcpy(d_b, h_b, kElems*sizeof(half), cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

  if (ok) {
    wmma_ker<<<1,32>>>(d_a, d_b, d_c);
    // A launch does not return an error itself: cudaGetLastError() catches a
    // bad launch, and the blocking copy back surfaces faults raised while the
    // kernel was running.
    ok = cudaOk(cudaGetLastError(), "wmma_ker launch")
      && cudaOk(cudaMemcpy(h_c, d_c, kElems*sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
  }

  if (ok) {
    for (int i = 0; i < kElems; i++) std::cout << h_c[i] << ",";
    std::cout << std::endl;
  }

  // Release everything on both the success and the failure path.
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete[] h_a;
  delete[] h_b;
  delete[] h_c;
  return ok ? 0 : 1;
}
$ nvcc -arch=sm_70 -o t304 t304.cu
$ cuda-memcheck ./t304
========= CUDA-MEMCHECK
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
========= ERROR SUMMARY: 0 errors
$

对于耗时测量，你的 cudaEventSynchronize(stop)
和 cudaEventRecord(stop) 的调用顺序反了。好的，我已经修复了这个问题。在主机端不能直接给 half 类型的变量赋值。在把矩阵从主机复制到设备时，请注意 float 和 half 两种类型的大小不同。
$ cat t304.cu
#include <mma.h>
#include <iostream>

using namespace nvcuda;

// Computes c = a * b for a single 16x16x16 tile using the WMMA API.
//
//   a - 16x16 half tile, read as column-major with leading dimension 16.
//   b - 16x16 half tile, read as row-major with leading dimension 16.
//   c - 16x16 float tile, written as row-major with leading dimension 16.
//
// NOTE: the accompanying listing launches this with one block of 32 threads
// and builds it with -arch=sm_70.
__global__ void wmma_ker(half *a, half *b, float *c) {
   // Edge length shared by the M, N and K dimensions, and by every
   // leading dimension used below.
   const int kTile = 16;

   // Operand and accumulator fragments
   wmma::fragment<wmma::matrix_a, kTile, kTile, kTile, half, wmma::col_major> lhs_tile;
   wmma::fragment<wmma::matrix_b, kTile, kTile, kTile, half, wmma::row_major> rhs_tile;
   wmma::fragment<wmma::accumulator, kTile, kTile, kTile, float> out_tile;

   // Bring both inputs into their fragments
   wmma::load_matrix_sync(lhs_tile, a, kTile);
   wmma::load_matrix_sync(rhs_tile, b, kTile);

   // Zero the accumulator, then out = lhs * rhs + out
   wmma::fill_fragment(out_tile, 0.0f);
   wmma::mma_sync(out_tile, lhs_tile, rhs_tile, out_tile);

   // Write the product back to memory
   wmma::store_matrix_sync(c, out_tile, kTile, wmma::mem_row_major);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Host driver: fills two 16x16 half tiles with 1.0, multiplies them on the
// device with wmma_ker, and prints the 256 float results separated by commas.
// Returns 0 on success and 1 if any CUDA call or the kernel launch failed.
int main(){

  const int kElems = 16*16;

  // Device pointers start out null so the cleanup below is safe even when an
  // allocation fails part-way through.
  half *d_a = 0, *d_b = 0;
  float *d_c = 0;

  float *h_c = new float[kElems];
  half *h_b = new half[kElems];
  half *h_a = new half[kElems];
  for (int i = 0; i < kElems; i++) {
    h_a[i] = 1.0f;
    h_b[i] = 1.0f;}

  // && short-circuits, so the first failure stops the chain and is reported.
  bool ok = cudaOk(cudaMalloc(&d_a, kElems*sizeof(half)), "cudaMalloc(d_a)")
         && cudaOk(cudaMalloc(&d_b, kElems*sizeof(half)), "cudaMalloc(d_b)")
         && cudaOk(cudaMalloc(&d_c, kElems*sizeof(float)), "cudaMalloc(d_c)")
         && cudaOk(cudaMemcpy(d_a, h_a, kElems*sizeof(half), cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
         && cudaOk(cudaMemcpy(d_b, h_b, kElems*sizeof(half), cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

  if (ok) {
    wmma_ker<<<1,32>>>(d_a, d_b, d_c);
    // A launch does not return an error itself: cudaGetLastError() catches a
    // bad launch, and the blocking copy back surfaces faults raised while the
    // kernel was running.
    ok = cudaOk(cudaGetLastError(), "wmma_ker launch")
      && cudaOk(cudaMemcpy(h_c, d_c, kElems*sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
  }

  if (ok) {
    for (int i = 0; i < kElems; i++) std::cout << h_c[i] << ",";
    std::cout << std::endl;
  }

  // Release everything on both the success and the failure path.
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete[] h_a;
  delete[] h_b;
  delete[] h_c;
  return ok ? 0 : 1;
}
$ nvcc -arch=sm_70 -o t304 t304.cu
$ cuda-memcheck ./t304
========= CUDA-MEMCHECK
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
========= ERROR SUMMARY: 0 errors
$

$ nvprof ./t304
==28135== NVPROF is profiling process 28135, command: ./t304
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
==28135== Profiling application: ./t304
==28135== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   42.97%  3.2320us         2  1.6160us  1.4080us  1.8240us  [CUDA memcpy HtoD]
                   28.52%  2.1450us         1  2.1450us  2.1450us  2.1450us  [CUDA memcpy DtoH]
                   28.51%  2.1440us         1  2.1440us  2.1440us  2.1440us  wmma_ker(__half*, __half*, float*)
      API calls:   98.42%  498.63ms         3  166.21ms  5.2170us  498.61ms  cudaMalloc
                    1.06%  5.3834ms       384  14.019us     347ns  568.79us  cuDeviceGetAttribute
                    0.38%  1.9473ms         4  486.83us  250.95us  1.1810ms  cuDeviceTotalMem
                    0.10%  493.31us         4  123.33us  109.62us  140.63us  cuDeviceGetName
                    0.01%  68.566us         1  68.566us  68.566us  68.566us  cudaLaunchKernel
                    0.01%  67.104us         3  22.368us  9.6850us  30.563us  cudaMemcpy
                    0.00%  22.628us         4  5.6570us  3.1910us  9.2200us  cuDeviceGetPCIBusId
                    0.00%  8.6020us         8  1.0750us     540ns  1.6570us  cuDeviceGet
                    0.00%  5.8370us         3  1.9450us     443ns  3.7760us  cuDeviceGetCount
                    0.00%  2.7590us         4     689ns     600ns     843ns  cuDeviceGetUuid