Cuda 奇怪的cuBLAS gemm批量性能_Cuda_Gpu_Gpgpu_Cublas

Cuda 奇怪的cuBLAS gemm批量性能

cuda

Cuda 奇怪的cuBLAS gemm批量性能,cuda,gpu,gpgpu,cublas,Cuda,Gpu,Gpgpu,Cublas,我注意到 cublasSgemmStridedBatched 的一些奇怪的性能表现，我正在寻找解释。矩阵大小固定为20x20。以下是一些不同批量大小的计时（仅乘法，无数据传输）：批次=100，时间=0.2毫秒；批次=1000，时间=1.9毫秒；批次=10000，时间=18.3毫秒；批次=100000，时间=5.3毫秒；批次=1000000，时间=52.8毫秒。前几个批量大小与我预期的一样，随着批量大小增加十倍，时间线性增加。但是，使用100000个矩阵时，会突然出现3.4倍的加速。如果矩阵大小固定为1

我注意到 cublasSgemmStridedBatched 的一些奇怪的性能表现，我正在寻找解释。矩阵大小固定为20x20。以下是一些不同批量大小的计时（仅乘法，无数据传输）：

批次=100，时间=0.2毫秒
批次=1000，时间=1.9毫秒
批次=10000，时间=18.3毫秒
批次=100000，时间=5.3毫秒
批次=1000000，时间=52.8毫秒

前几个批量大小与我预期的一样，随着批量大小增加十倍，时间线性增加。但是，使用100000个矩阵时，会突然出现3.4倍的加速

如果矩阵大小固定为10x10，并且再次执行试验，我发现：

批次=100，时间=0.2毫秒
批次=1000，时间=2.0毫秒
批次=10000，时间=20.0毫秒
批次=100000，时间=0.9毫秒
批次=1000000，时间=8.9毫秒

同样，在100000批次的情况下，惊人的速度提高了22倍？我想知道为什么批量大小为1000和10000比批量大小100000慢，因为矩阵大小仍然是10x10

不同的批量大小是否使用了不同的算法？我觉得这种性能表现很奇怪。当我用 cublasSgemmBatched 进行这项试验时，也会出现类似的结果。这些试验在 GeForce GTX 1080 Ti 上进行。下面给出最小可运行代码：

#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include <cuda_runtime_api.h>
#include "cublas_v2.h"
//nvcc -lcublas cublas.c -o cublas.out

int main(int argc, char* argv[])
{
int i,j,k,index;

// Linear dimension of matrices
int dim = 20;
int batch_count = 10*10*10*10*10*1;
// Allocate host storage for batch_count A,B,C square matrices
float* h_A = malloc(sizeof(float) * dim * dim * batch_count);
float* h_B = malloc(sizeof(float) * dim * dim * batch_count);
float* h_C = malloc(sizeof(float) * dim * dim * batch_count);
    for(k=0; k<batch_count; k++) {
        for(j=0; j<dim; j++) {
                for(i=0; i<dim; i++) {
                index = i*dim + j + k*dim*dim;
                  h_A[index] = index*index + 0.0f;
                  h_B[index] = index + 1.0f;
                  h_C[index] = 0.0f;
        }
    }
}


float *d_A, *d_B, *d_C;
cudaMalloc(&d_A, sizeof(float) * dim * dim * batch_count);
cudaMalloc(&d_B, sizeof(float) * dim * dim * batch_count);
cudaMalloc(&d_C, sizeof(float) * dim * dim * batch_count);
cudaMemcpy(h_A,d_A,sizeof(float) * dim * dim * batch_count,cudaMemcpyDeviceToHost);
cudaMemcpy(h_B,d_B,sizeof(float) * dim * dim * batch_count,cudaMemcpyDeviceToHost);
cudaMemcpy(h_C,d_C,sizeof(float) * dim * dim * batch_count,cudaMemcpyDeviceToHost);

cublasHandle_t handle;
cublasCreate(&handle);

// Do the actual multiplication 
float time_cuda_event;
cudaEvent_t start, stop;    
cudaEventCreate(&start);
cudaEventCreate(&stop) ;
cudaEventRecord(start, 0);
float alpha = 1.0f;  float beta = 1.0f;
cublasSgemmStridedBatched(handle,
                              CUBLAS_OP_N, 
                              CUBLAS_OP_N,
                              dim, dim, dim,
                              &alpha,
                              (const float*)d_A, dim,
                              dim*dim,
                              (const float*)d_B, dim,
                              dim*dim,
                              &beta,
                              d_C, dim, 
                              dim*dim, 
                              batch_count);
( cudaEventRecord(stop, 0) );
( cudaEventSynchronize(stop) );
( cudaEventElapsedTime(&time_cuda_event, start, stop) );              
printf("Time :  %3.1f ms \n", time_cuda_event);  

cudaMemcpy(h_C,d_C,sizeof(float) * dim * dim * batch_count,cudaMemcpyDeviceToHost);
// Destroy the handle
cublasDestroy(handle);


cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
    return 0;
}

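#include <stdio.h>
#include <stdlib.h>
#include "math.h"
#include "cublas_v2.h"
//nvcc -lcublas cublas.c -o cublas.out
int main(int argc, char* argv[])
{
int i,j,k,index;
// Linear dimension of matrices
int dim = 20;
int batch_count = 10*10*10*10*10*1;
// Allocate host storage for batch_count A,B,C square matrices
float* h_A = malloc(sizeof(float) * dim * dim * batch_count);
float* h_B = malloc(sizeof(float) * dim * dim * batch_count);
float* h_C = malloc(sizeof(float) * dim * dim * batch_count);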
for(k=0; k<batch_count; k++) { /* ……（此处代码被截断，完整代码见上文） */

这似乎只是 cuBLAS 内部启发式策略的结果。如果我运行修改后（且可正常工作）的代码版本，在5x5的情况下会得到如下计时：
Batch size :           10   Time :  0.019104 ms 
Batch size :          100   Time :  0.038304 ms 
Batch size :         1000   Time :  0.163520 ms 
Batch size :        10000   Time :  1.410944 ms 
Batch size :       100000   Time :  1.614144 ms 
Batch size :      1000000   Time :  16.057407 ms 

分析表明，在多达10000个条目的批次的情况下，库运行一个内核：
1.10759s  16.831us             (1 1 10)       (128 1 1)       120  12.250KB        0B         -           -           -           -  GeForce GTX 970         1         7  maxwell_sgemm_128x64_nn [3939]
1.10766s  19.168us            (1 1 100)       (128 1 1)       120  12.250KB        0B         -           -           -           -  GeForce GTX 970         1         7  maxwell_sgemm_128x64_nn [3971]
1.10773s  147.71us           (1 1 1000)       (128 1 1)       120  12.250KB        0B         -           -           -           -  GeForce GTX 970         1         7  maxwell_sgemm_128x64_nn [4003]
1.10791s  1.4064ms          (1 1 10000)       (128 1 1)       120  12.250KB        0B         -           -           -           -  GeForce GTX 970         1         7  maxwell_sgemm_128x64_nn [4035]

在较大的规模下，它会运行多个对另一个内核的调用，以服务于该调用：
1.10935s  1.1518ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4063]
1.11050s  606.54us          (1 1 34465)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4087]
1.11113s  1.1498ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4115]
1.11228s  1.1501ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4139]
1.11344s  1.1511ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4163]
1.11459s  1.1494ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4187]
1.11574s  1.1507ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4211]
1.11689s  1.1503ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4235]
1.11804s  1.1499ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4259]
1.11919s  1.1507ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4283]
1.12035s  1.1507ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4307]
1.12150s  1.1509ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4331]
1.12265s  1.1489ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4355]
1.12380s  1.1496ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4379]
1.12495s  1.1500ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4403]
1.12610s  1.1494ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4427]
1.12726s  1.1503ms          (1 1 65535)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4451]
1.12841s  299.35us          (1 1 16975)       (16 16 1)        31  2.1250KB        0B         -           -           -           -  GeForce GTX 970         1         7  void batch_gemm_kernel1x1_core<float, float, float, bool=0, bool=0, bool=0, bool=0, bool=0, bool=1, bool=1>(float* const *, float const * const *, float const * const *, float*, float const *, float const *, int, int, int, int, int, int, __int64, __int64, __int64, float const *, float const *, float, float, int, int) [4475]

1.10935s1.1518ms（1165535）（16161）312.1250KB 0B--GeForce GTX 97017无效批次gemm内核1x1内核（浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，【4063】
1.11050s 606.54us（1 1 34465）（16 16 1）31 2.1250KB 0B----GeForce GTX 970 1 7无效批次gemm内核1x1内核（浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，64，整数，整数，64，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数）[4087]
1.11113s 1.1498ms（1 1 65535）（16 1）31 2.1250KB 0B--GeForce GTX 970 1 7无效批次gemm内核1x1芯（浮点数*常数*、浮点数*常数*、浮点数*常数*、浮点数*、浮点数*常数*、浮点数*常数*、int、int、int、int、int、int、u int64、u int64、u int64、浮点数*常数*、浮点数、浮点数、int、int）[4115]
1.11228s 1.1501ms（1 1 65535）（16 1）31 2.1250KB 0B--GeForce GTX 970 1 7无效批次gemm内核1x1芯（浮点数*常数*、浮点数*常数*、浮点数*常数*、浮点数*、浮点数*常数*、浮点数*常数*、int、int、int、int、int、int、u int64、u int64、u int64、浮点数*常数*、浮点数、浮点、int、int）[4139]
1.11344s 1.1511ms（1 1 65535）（16 1）31 2.1250KB 0B--GeForce GTX 970 1 7无效批次gemm内核1x1芯（浮点*常数*，浮点*常数*，浮点*常数*，浮点*，浮点*常数*，浮点*常数*，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，整型，【4163】
1.11459s1.1494ms（1165535）（16161）312.1250KB 0B--GeForce GTX 97017无效批次gemm内核1x1内核（浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，浮点*常数*，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数，64，整数，整数，64，整数，整数，整数，整数，整数，整数，整数，整数，整数，整数
1.11574s 1.1507ms（1 1 65535）（16 1）31 2.1250KB 0B--GeForce GTX 970 1 7无效批次gemm内核1x1芯（浮点数*常数*、浮点数*常数*、浮点数*常数*、浮点数*、浮点数*常数*、浮点数*常数*、int、int、int、int、int、int、u int64、u int64、u int64、浮点数*常数*、浮点数、浮点、int、int）[4211]
1.11689s1.1503ms（16165535）（16161）312.1250KB 0B--GeForce GTX 970 1 7无效批次gemm内核1x1芯（浮点数*常数*、浮点数*常数*、浮点数*常数*、浮点数*、浮点数*常数*、浮点数*常数*、int、int、int、int、int、int、u int64、u int64、u int64、浮点数*常数*、浮点数、浮点数、int、int）[4235]
1.11804s 1.1499ms（1 1 65535）（16 16 1）31 2.1250KB 0B--GeForce GTX 970 1 7无效批处理内核1x1内核（浮点*常量*，浮点常量*常量*，浮点常量*常量*，浮点*，浮点常量*，浮点常量*，浮点常量*，浮点常量*，浮点常量*，int，int，int，int，i