CUDA: the memset in cuBLAS gemm is always launched in the default stream


I noticed that when cublasSgemm is called from the host, each gemm call results in three kernel launches: a memset, scal_kernel, and the gemm kernel itself (e.g. sgemm_large). This happens even though I use alpha/beta constants allocated in device memory. While the overhead of the memset and scal_kernel is relatively small, the problem is that the memset is always launched in the default stream, which causes unnecessary synchronization.

The code:

#include <cassert>

#include <cublas_v2.h>
#include <thrust/device_vector.h>

// alpha and beta kept in device memory (cf. the pointerMode discussion in the comments below)
__constant__ __device__ float alpha = 1;
__constant__ __device__ float beta = 1;

int main()
{
    // ... memory allocation skipped: cbh is the cublasHandle_t; x, y, mat are
    // thrust::device_vector<float>; crow/ccol/cshared are the matrix dimensions ...
    float* px = thrust::raw_pointer_cast(x.data());
    float* py = thrust::raw_pointer_cast(y.data());
    float* pmat = thrust::raw_pointer_cast(mat.data());
    for (int iter = 0; iter < 3; ++iter)
    {
        cublasStatus_t cbstatus = cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared,
                                              &alpha, px, crow, py, cshared, &beta, pmat, crow);
        assert(0 == cbstatus);
    }
}
This is what I see in the profiler:

The question: is there a way to avoid the memset, or to make it run in the stream assigned to the cuBLAS handle?
One idea is to use dynamic parallelism and call the device version of the gemm function, but that is only available on CC 3.5 and higher.
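
For reference, a minimal sketch (hypothetical helper; host-side alpha/beta assumed) of what "the stream assigned to the cuBLAS handle" means: binding a non-default stream with cublasSetStream so that the subsequent gemm kernels launch there.

#include <cublas_v2.h>

// Hypothetical helper: bind a non-default stream to the handle, then launch the gemm.
// Per the discussion below, in CUBLAS 5.5 the extra memset still goes to the default
// stream in the k >> m,n case, regardless of this binding.
void sgemm_in_stream(cublasHandle_t cbh, cudaStream_t stream,
                     const float* px, const float* py, float* pmat,
                     int crow, int ccol, int cshared)
{
    const float alpha = 1.0f, beta = 1.0f;   // host-side scalars for simplicity
    cublasSetStream(cbh, stream);            // subsequent cuBLAS calls use 'stream'
    cublasSgemm(cbh, CUBLAS_OP_N, CUBLAS_OP_N, crow, ccol, cshared,
                &alpha, px, crow, py, cshared, &beta, pmat, crow);
}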

Please try the code below. Apart from the unavoidable memory allocations and copies, the code contains nothing but cublasSgemm calls. You will see that:

  • only one kernel (gemm_kernel1x1_core) is launched per call;
  • the two cublasSgemm calls run perfectly in two different streams;
  • the figure shows the Visual Profiler timeline.

    My system: GeForce 540M, Windows 7, CUDA 5.5

    #include <conio.h>
    #include <stdio.h>
    #include <assert.h>
    
    #include <cublas_v2.h> 
    
    /********************/
    /* CUDA ERROR CHECK */
    /********************/
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess) 
        {
            fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
            if (abort) { getchar(); exit(code); }
        }
    }
    
    /**********************/
    /* cuBLAS ERROR CHECK */
    /**********************/
    #ifndef cublasSafeCall
    #define cublasSafeCall(err)     __cublasSafeCall(err, __FILE__, __LINE__)
    #endif
    
    inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line)
    {
        if( CUBLAS_STATUS_SUCCESS != err) {
            fprintf(stderr, "CUBLAS error in file '%s', line %d\n \nerror %d \nterminating!\n",__FILE__, __LINE__,err); 
            getch(); cudaDeviceReset(); assert(0); 
        }
    }
    
    /********/
    /* MAIN */
    /********/
    int main()
    {
        int N = 5;
    
        float *A1, *A2, *B1, *B2, *C1, *C2;
        float *d_A1, *d_A2, *d_B1, *d_B2, *d_C1, *d_C2;
    
        A1 = (float*)malloc(N*N*sizeof(float));
        B1 = (float*)malloc(N*N*sizeof(float));
        C1 = (float*)malloc(N*N*sizeof(float));
    
        A2 = (float*)malloc(N*N*sizeof(float));
        B2 = (float*)malloc(N*N*sizeof(float));
        C2 = (float*)malloc(N*N*sizeof(float));
    
        gpuErrchk(cudaMalloc((void**)&d_A1,N*N*sizeof(float)));
        gpuErrchk(cudaMalloc((void**)&d_B1,N*N*sizeof(float)));
        gpuErrchk(cudaMalloc((void**)&d_C1,N*N*sizeof(float)));
        gpuErrchk(cudaMalloc((void**)&d_A2,N*N*sizeof(float)));
        gpuErrchk(cudaMalloc((void**)&d_B2,N*N*sizeof(float)));
        gpuErrchk(cudaMalloc((void**)&d_C2,N*N*sizeof(float)));
    
        for (int i=0; i<N*N; i++) {
            A1[i] = ((float)rand()/(float)RAND_MAX);
            A2[i] = ((float)rand()/(float)RAND_MAX);
            B1[i] = ((float)rand()/(float)RAND_MAX);
            B2[i] = ((float)rand()/(float)RAND_MAX);
        }
        gpuErrchk(cudaMemcpy(d_A1, A1, N*N*sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_B1, B1, N*N*sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_A2, A2, N*N*sizeof(float), cudaMemcpyHostToDevice));
        gpuErrchk(cudaMemcpy(d_B2, B2, N*N*sizeof(float), cudaMemcpyHostToDevice));
    
        cublasHandle_t handle;
        cublasSafeCall(cublasCreate(&handle));
    
        cudaStream_t stream1, stream2;
        gpuErrchk(cudaStreamCreate(&stream1));
        gpuErrchk(cudaStreamCreate(&stream2));
    
        float alpha = 1.f;
        float beta  = 1.f;    // note: d_C1/d_C2 are left uninitialized; acceptable for a profiling-only example
    
        cublasSafeCall(cublasSetStream(handle,stream1));
        cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A1, N, d_B1, N, &beta, d_C1, N));
        cublasSafeCall(cublasSetStream(handle,stream2));
        cublasSafeCall(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A2, N, d_B2, N, &beta, d_C2, N));
    
        gpuErrchk(cudaDeviceSynchronize());   // make sure both gemms have finished before tearing down
        gpuErrchk(cudaDeviceReset());
    
        return 0;
    
     }
    

There was a flaw in CUBLAS 5.5: in a specialized path for k >> m,n, a cudaMemset was used instead of cudaMemsetAsync. It is fixed in CUBLAS 6.0 RC, which you can access if you are a registered developer.

By the way, I wonder why you use __constant__ __device__ for alpha and beta. Are you using pointerMode = DEVICE? If not, you could simply use alpha and beta on the host.
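
For reference, a minimal sketch (hypothetical helpers, error checking omitted) contrasting the two pointer modes: with device-resident alpha/beta the handle must be switched to CUBLAS_POINTER_MODE_DEVICE and the symbol addresses retrieved via cudaGetSymbolAddress; with plain host floats the default CUBLAS_POINTER_MODE_HOST is enough.

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    __constant__ __device__ float d_alpha = 1.0f;   // device-resident scalars
    __constant__ __device__ float d_beta  = 1.0f;

    // Hypothetical: pass the device-resident scalars (pointerMode = DEVICE).
    void sgemm_device_scalars(cublasHandle_t h, const float* A, const float* B, float* C, int n)
    {
        float *p_alpha, *p_beta;
        cudaGetSymbolAddress((void**)&p_alpha, d_alpha);   // device address of the symbol
        cudaGetSymbolAddress((void**)&p_beta,  d_beta);
        cublasSetPointerMode(h, CUBLAS_POINTER_MODE_DEVICE);
        cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, p_alpha, A, n, B, n, p_beta, C, n);
    }

    // Hypothetical: plain host scalars (the default pointerMode = HOST).
    void sgemm_host_scalars(cublasHandle_t h, const float* A, const float* B, float* C, int n)
    {
        const float alpha = 1.0f, beta = 1.0f;
        cublasSetPointerMode(h, CUBLAS_POINTER_MODE_HOST);
        cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n, &alpha, A, n, B, n, &beta, C, n);
    }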

How can you say that the memset always runs in the default stream? I don't see any cublasSetStream in your code before the cublasSgemm call.

Thanks, this is one more reason for me to migrate to 6.0. As Philippe pointed out, the problem is a bug in CUBLAS 5.5 that occurs when the shared dimension is much larger than the row/column dimensions. If you set k to 10000 and m, n to 1000, you will see the problem I described. Sorry for not providing full information in my question.
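
A minimal sketch of the reproducer described above (dimensions taken from the comment; error checking omitted and the input matrices left uninitialized, since this is for profiling only): a gemm with the shared dimension k much larger than m and n, launched through a non-default stream.

    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    int main()
    {
        // m = n = 1000, k = 10000: the shared dimension dominates, which is the
        // shape reported to hit the synchronous cudaMemset path in CUBLAS 5.5.
        const int m = 1000, n = 1000, k = 10000;

        float *d_A, *d_B, *d_C;
        cudaMalloc(&d_A, (size_t)m * k * sizeof(float));
        cudaMalloc(&d_B, (size_t)k * n * sizeof(float));
        cudaMalloc(&d_C, (size_t)m * n * sizeof(float));
        cudaMemset(d_C, 0, (size_t)m * n * sizeof(float));   // contents otherwise uninitialized

        cublasHandle_t handle;
        cublasCreate(&handle);

        cudaStream_t stream;
        cudaStreamCreate(&stream);
        cublasSetStream(handle, stream);   // gemm kernels should go to 'stream'...

        const float alpha = 1.0f, beta = 1.0f;
        // ...but with CUBLAS 5.5 the profiler reportedly shows an extra memset in the
        // default stream for this shape; CUBLAS 6.0 RC uses cudaMemsetAsync instead.
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k,
                    &alpha, d_A, m, d_B, k, &beta, d_C, m);

        cudaDeviceSynchronize();
        cublasDestroy(handle);
        cudaStreamDestroy(stream);
        cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
        return 0;
    }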