Cuda matrixMulCUBLAS.cpp中的几个cublasSgemm调用

Cuda matrixMulCUBLAS.cpp中的几个cublasSgemm调用,cuda,Cuda,我对matrixMulCUBLAS.cpp（CUDA 9.0）代码有疑问 在这段代码的matrixMultiply函数中，有几个cublasSgemm调用来计算矩阵乘法 但是，我不知道为什么这些调用要执行多次，看起来没有用处 cublasSgemm预热调用和循环中的cublasSgemm调用之间有什么区别 此外，为什么cublasSgemm调用中不存在归纳变量j int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &ma

我对matrixMulCUBLAS.cpp（CUDA 9.0）代码有疑问

在这段代码的matrixMultiply函数中，有几个cublasSgemm调用来计算矩阵乘法

但是，我不知道为什么这些调用要执行多次。看起来没有用处

cublasSgemm预热调用和循环中的cublasSgemm调用之间有什么区别

此外,为什么cublasSgemm调用中不存在归纳变量j

int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;

checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

int block_size = 32;

// set seed for rand()
srand(2006);

// allocate host memory for matrices A and B
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);

// set seed for rand()
srand(2006);

// initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);

// allocate device memory
float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;

// allocate host memory for the result
float *h_C      = (float *) malloc(mem_size_C);
float *h_CUBLAS = (float *) malloc(mem_size_C);

checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));

// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

// create and start timer
printf("Computing result using CUBLAS...");

// execute the kernel
int nIter = 30;

// CUBLAS version 2.0
{
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    cublasHandle_t handle;
    cudaEvent_t start, stop;

    checkCudaErrors(cublasCreate(&handle));

    //Perform warmup operation with cublas
    checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));

    // Allocate CUDA events that we'll use for timing
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, NULL));

    for (int j = 0; j < nIter; j++)
    {
        //note cublas is column primary!
        //need to transpose the order
        checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
    }

    printf("done.\n");
但是，我不知道为什么这些调用要执行多次。看起来没有用处

该代码是基准测试代码,多个调用只是为了能够计算CUBLAS-gemm调用多次执行的平均运行时间

cublasSgemm预热调用和循环中的cublasSgemm调用之间有什么区别

预热调用正如其名所示。对CUDA代码的第一次调用可能会产生与CUDA运行时API的延迟上下文建立机制相关的额外一次性开销。通过先执行一次预热调用，可以将这部分开销从性能测量中排除

此外,为什么cublasSgemm调用中不存在归纳变量j

int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size)
{
cudaDeviceProp deviceProp;

checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));

int block_size = 32;

// set seed for rand()
srand(2006);

// allocate host memory for matrices A and B
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = (float *)malloc(mem_size_A);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = (float *)malloc(mem_size_B);

// set seed for rand()
srand(2006);

// initialize host memory
randomInit(h_A, size_A);
randomInit(h_B, size_B);

// allocate device memory
float *d_A, *d_B, *d_C;
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(float) * size_C;

// allocate host memory for the result
float *h_C      = (float *) malloc(mem_size_C);
float *h_CUBLAS = (float *) malloc(mem_size_C);

checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A));
checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B));
checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C));

// setup execution parameters
dim3 threads(block_size, block_size);
dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y);

// create and start timer
printf("Computing result using CUBLAS...");

// execute the kernel
int nIter = 30;

// CUBLAS version 2.0
{
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    cublasHandle_t handle;
    cudaEvent_t start, stop;

    checkCudaErrors(cublasCreate(&handle));

    //Perform warmup operation with cublas
    checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));

    // Allocate CUDA events that we'll use for timing
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Record the start event
    checkCudaErrors(cudaEventRecord(start, NULL));

    for (int j = 0; j < nIter; j++)
    {
        //note cublas is column primary!
        //need to transpose the order
        checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
    }

    printf("done.\n");
因为循环的唯一目的是允许调用运行多次以获得平均性能度量