Cuda 如何使用WMMA函数?;
我已经运行了,但是我得到了意想不到的结果。那么如何使用wmma函数呢?我的Cuda 如何使用WMMA函数?;,cuda,gpgpu,Cuda,Gpgpu,我已经运行了,但是我得到了意想不到的结果。那么如何使用wmma函数呢?我的wmma::load\u matrix\u sync错误吗?或者其他我们应该注意的事情 WMMA_M,WMMA_N,WMMA_K = 16 __global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size) { //Declare the fragment wmma::fragment<wmma::matrix_a, WMMA_M
wmma::load\u matrix\u sync
错误吗?或者其他我们应该注意的事情
WMMA_M,WMMA_N,WMMA_K = 16
__global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size)
{
//Declare the fragment
wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K,float> acc_frag;
//Load the matrix to fragment
wmma::load_matrix_sync(a_frag, a, WMMA_M);
wmma::load_matrix_sync(b_frag, b, WMMA_K);
//perform mma
wmma::fill_fragment(acc_frag, 0.0f);
for(int i=0; i<1e4; i++)
{
wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
}
//store the result
wmma::store_matrix_sync(c, acc_frag, WMMA_M, wmma::mem_row_major);
}
WMMA_M,WMMA_N,WMMA_K=16
__全局无效wmma内核(半*a,半*b,浮点*c,整数矩阵大小)
{
//声明片段
wmma::片段a_frag;
wmma::片段b_frag;
wmma::碎片附件;
//将矩阵加载到片段
wmma::加载矩阵同步(a_frag,a,wmma_M);
wmma::加载矩阵同步(b_frag,b,wmma_K);
//表演mma
wmma::填充碎片(附件碎片,0.0f);
对于(int i=0;i
无法直接为主机上的半变量赋值
我建议切换到CUDA 10。它已使数据类型减少了一半
但是,无论是使用CUDA 9.2还是CUDA 10,以下示例都应同样适用:
$ cat t304.cu
#include <mma.h>
#include <iostream>
using namespace nvcuda;
__global__ void wmma_ker(half *a, half *b, float *c) {
// Declare the fragments
wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
// Initialize the output to zero
wmma::fill_fragment(c_frag, 0.0f);
// Load the inputs
wmma::load_matrix_sync(a_frag, a, 16);
wmma::load_matrix_sync(b_frag, b, 16);
// Perform the matrix multiplication
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
// Store the output
wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
}
int main(){
half *d_a, *h_a, *d_b, *h_b;
float *d_c, *h_c;
h_c = new float[16*16];
h_b = new half[16*16];
h_a = new half[16*16];
cudaMalloc(&d_a, 16*16*sizeof(half));
cudaMalloc(&d_b, 16*16*sizeof(half));
cudaMalloc(&d_c, 16*16*sizeof(float));
for (int i = 0; i < 16*16; i++) {
h_a[i] = 1.0f;
h_b[i] = 1.0f;}
cudaMemcpy(d_a, h_a, 16*16*sizeof(half), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, 16*16*sizeof(half), cudaMemcpyHostToDevice);
wmma_ker<<<1,32>>>(d_a, d_b, d_c);
cudaMemcpy(h_c, d_c, 16*16*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 16*16; i++) std::cout << h_c[i] << ",";
std::cout << std::endl;
}
$ nvcc -arch=sm_70 -o t304 t304.cu
$ cuda-memcheck ./t304
========= CUDA-MEMCHECK
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
========= ERROR SUMMARY: 0 errors
$
对于已用时间测量,您的cudaEventSynchronize(stop)
和cudaEventRecord(stop)
顺序错误。好的,我修复了这个问题。无法直接为主机上的半变量赋值。在将矩阵从主机复制到设备时,请注意浮点和半变量的大小。
$ cat t304.cu
#include <mma.h>
#include <iostream>
using namespace nvcuda;
__global__ void wmma_ker(half *a, half *b, float *c) {
// Declare the fragments
wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag;
wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;
// Initialize the output to zero
wmma::fill_fragment(c_frag, 0.0f);
// Load the inputs
wmma::load_matrix_sync(a_frag, a, 16);
wmma::load_matrix_sync(b_frag, b, 16);
// Perform the matrix multiplication
wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
// Store the output
wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major);
}
int main(){
half *d_a, *h_a, *d_b, *h_b;
float *d_c, *h_c;
h_c = new float[16*16];
h_b = new half[16*16];
h_a = new half[16*16];
cudaMalloc(&d_a, 16*16*sizeof(half));
cudaMalloc(&d_b, 16*16*sizeof(half));
cudaMalloc(&d_c, 16*16*sizeof(float));
for (int i = 0; i < 16*16; i++) {
h_a[i] = 1.0f;
h_b[i] = 1.0f;}
cudaMemcpy(d_a, h_a, 16*16*sizeof(half), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, 16*16*sizeof(half), cudaMemcpyHostToDevice);
wmma_ker<<<1,32>>>(d_a, d_b, d_c);
cudaMemcpy(h_c, d_c, 16*16*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < 16*16; i++) std::cout << h_c[i] << ",";
std::cout << std::endl;
}
$ nvcc -arch=sm_70 -o t304 t304.cu
$ cuda-memcheck ./t304
========= CUDA-MEMCHECK
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
========= ERROR SUMMARY: 0 errors
$
$ nvprof ./t304
==28135== NVPROF is profiling process 28135, command: ./t304
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
==28135== Profiling application: ./t304
==28135== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 42.97% 3.2320us 2 1.6160us 1.4080us 1.8240us [CUDA memcpy HtoD]
28.52% 2.1450us 1 2.1450us 2.1450us 2.1450us [CUDA memcpy DtoH]
28.51% 2.1440us 1 2.1440us 2.1440us 2.1440us wmma_ker(__half*, __half*, float*)
API calls: 98.42% 498.63ms 3 166.21ms 5.2170us 498.61ms cudaMalloc
1.06% 5.3834ms 384 14.019us 347ns 568.79us cuDeviceGetAttribute
0.38% 1.9473ms 4 486.83us 250.95us 1.1810ms cuDeviceTotalMem
0.10% 493.31us 4 123.33us 109.62us 140.63us cuDeviceGetName
0.01% 68.566us 1 68.566us 68.566us 68.566us cudaLaunchKernel
0.01% 67.104us 3 22.368us 9.6850us 30.563us cudaMemcpy
0.00% 22.628us 4 5.6570us 3.1910us 9.2200us cuDeviceGetPCIBusId
0.00% 8.6020us 8 1.0750us 540ns 1.6570us cuDeviceGet
0.00% 5.8370us 3 1.9450us 443ns 3.7760us cuDeviceGetCount
0.00% 2.7590us 4 689ns 600ns 843ns cuDeviceGetUuid