Time 测量CUDA程序和CUDA内核的运行时间时出现问题_Time_Cuda

Time 测量CUDA程序和CUDA内核的运行时间时出现问题

time cuda

Time 测量CUDA程序和CUDA内核的运行时间时出现问题,time,cuda,Time,Cuda,目前，我有三种测量运行时间的方法，两种使用CUDA事件，另一种记录开始和结束UNIX。使用CUDA事件的方法测量两件事，一个测量整个外部循环时间，另一个测量所有内核执行时间的总和代码如下： int64 x1, x2; cudaEvent_t start; cudaEvent_t end; cudaEvent_t s1, s2; float timeValue; #define timer_s cudaEventRecord(start, 0); #define timer_e cud

目前，我有三种测量运行时间的方法：两种使用CUDA事件，另一种记录开始和结束时的UNIX时间戳。使用CUDA事件的两种方法分别测量两件事：一个测量整个外部循环的时间，另一个测量所有内核执行时间的总和。

代码如下：

// Timing harness from the question. It measures the same loop nest three ways:
//   1. s1 -> s2        : one pair of CUDA events around the whole loop nest
//   2. sum             : per-launch CUDA event times accumulated via timer_s/timer_e
//   3. x1 -> x2        : host timestamps returned by GetTimeMs64()
int64 x1, x2;

// start/end bracket a single kernel launch; s1/s2 bracket the whole loop nest.
cudaEvent_t start;
cudaEvent_t end;
cudaEvent_t s1, s2;
float timeValue;


// timer_s: record `start` on stream 0.
// timer_e: record `end` on stream 0, block the host until `end` has completed,
//          store the start->end time in timeValue and print it.
// NOTE(review): both macros expand to full statements (trailing ';' included),
// which is why the call sites below carry no semicolon.
 #define timer_s cudaEventRecord(start, 0);
 #define timer_e cudaEventRecord(end, 0);   cudaEventSynchronize(end); cudaEventElapsedTime( &timeValue, start, end ); printf("time:  %f  ms \n", timeValue);


cudaEventCreate( &start );
cudaEventCreate( &end );
cudaEventCreate( &s1 );
cudaEventCreate( &s2 );

// Begin the two whole-loop measurements: GPU event first, then host timestamp.
cudaEventRecord(s1, 0);   
x1 = GetTimeMs64();

for(int r = 0 ; r < 2 ; r++)
{
    // Every launch below follows the same pattern: record start, launch,
    // synchronise the host with the device, record end, add the elapsed
    // time to `sum`.
    // NOTE(review): `sum` is not declared in this excerpt; presumably a float
    // initialised to 0 elsewhere - confirm.
    // NOTE(review): cudaThreadSynchronize() is deprecated in favour of
    // cudaDeviceSynchronize(). Because it runs before timer_e, the host-side
    // wait and printf happen between launches and are counted by s1->s2 and
    // x1->x2 but not by `sum`.
    timer_s
    kernel1<<<1, x>>>(gl_devdata_ptr);
    cudaThreadSynchronize();
    timer_e
    sum += timeValue;

    for(int j = 0 ; j < 5; j++)
    {
        timer_s
        kernel2<<<1,x>>>(gl_devdata_ptr);
        cudaThreadSynchronize();
        timer_e
        sum += timeValue;

        timer_s
        kernel3<<<1,x>>>(gl_devdata_ptr);
        cudaThreadSynchronize();
        timer_e
        sum += timeValue;
    }

    timer_s
    kernel4<<<y, x>>> (gl_devdata_ptr);
    cudaThreadSynchronize();
    timer_e
    sum += timeValue;
}

// Host end timestamp is taken before s2 is recorded and synchronised.
x2 = GetTimeMs64();

// Close the whole-loop GPU measurement and report all three totals.
cudaEventRecord(s2, 0);   
cudaEventSynchronize(s2); 
cudaEventElapsedTime( &timeValue, s1, s2 ); 
printf("elapsed cuda :       %f  ms \n", timeValue);
printf("elapsed sum :       %f  ms \n", sum);
// NOTE(review): x2-x1 has type int64 but is printed with %d, which expects
// an int; a 64-bit format specifier would match the operand.
printf("elapsed win :       %d  ms \n", x2-x1);

这些不是真正的变量名，也不是正确的内核名，我只是删除了一些以使代码更小

所以问题是，每种测量方法都给了我不同的总时间

我刚才运行的一些示例：

elapsed cuda : 21.076832    
elapsed sum :  4.177984     
elapsed win :  27

那么为什么会有如此巨大的差异呢？所有内核调用的总和约为4毫秒，其他18毫秒在哪里？CPU时间？

cudaThreadSynchronize是一个开销非常大的操作，因为它必须等待GPU上的所有工作完成

如果按照以下方式构造代码，则应得到正确的结果：

// Times the whole kernel sequence with CUDA events and with a host clock,
// without synchronising the host between launches. One event is recorded
// before every kernel launch, plus one leading and one trailing event, so
// consecutive events delimit individual kernels.
int64 x1, x2;

const int k_outerIters = 2;   // iterations of the r loop
const int k_innerIters = 5;   // iterations of the j loop
// 1 leading event + 1 trailing event
// + per outer iteration: kernel1, kernel4 and (kernel2 + kernel3) per inner iteration.
const int k_maxEvents = 2 + k_outerIters * (2 + 2 * k_innerIters);
cudaEvent_t events[k_maxEvents];
int eIdx = 0;                 // index of the next event to record
float timeValue;

// Every event that will be recorded must be created first.
for (int e = 0; e < k_maxEvents; ++e)
{
    cudaEventCreate(&events[e]);
}

x1 = GetTimeMs64();
cudaEventRecord(events[eIdx++], 0);
for(int r = 0 ; r < k_outerIters ; r++)
{
    cudaEventRecord(events[eIdx++], 0);
    kernel1<<<1, x>>>(gl_devdata_ptr);

    for(int j = 0 ; j < k_innerIters; j++)
    {
        cudaEventRecord(events[eIdx++], 0);
        kernel2<<<1,x>>>(gl_devdata_ptr);

        cudaEventRecord(events[eIdx++], 0);
        kernel3<<<1,x>>>(gl_devdata_ptr);
    }

    cudaEventRecord(events[eIdx++], 0);
    kernel4<<<y, x>>> (gl_devdata_ptr);
}

// Trailing event: marks the end of the last kernel.
cudaEventRecord(events[eIdx++], 0);
// Single host/device synchronisation for the whole sequence; after it every
// recorded event has completed and can be queried.
cudaDeviceSynchronize();

x2 = GetTimeMs64();

// Measure from the first to the last event that was actually recorded.
cudaEventElapsedTime( &timeValue, events[0], events[eIdx - 1] );
printf("elapsed cuda :       %f  ms \n", timeValue);
// TODO the time between each events is the time to execute each kernel.
// On WDDM a context switch may occur between any of the kernels leading
// to higher than expected results.
// printf("elapsed sum :       %f  ms \n", sum);
// x2 - x1 is a 64-bit value: print it with a 64-bit format specifier.
printf("elapsed win :       %lld  ms \n", (long long)(x2 - x1));

// Release the events once the measurements have been read.
for (int e = 0; e < k_maxEvents; ++e)
{
    cudaEventDestroy(events[e]);
}

// Times the whole kernel sequence with CUDA events and with a host clock,
// without synchronising the host between launches. One event is recorded
// before every kernel launch, plus one leading and one trailing event, so
// consecutive events delimit individual kernels.
int64 x1, x2;

const int k_outerIters = 2;   // iterations of the r loop
const int k_innerIters = 5;   // iterations of the j loop
// 1 leading event + 1 trailing event
// + per outer iteration: kernel1, kernel4 and (kernel2 + kernel3) per inner iteration.
const int k_maxEvents = 2 + k_outerIters * (2 + 2 * k_innerIters);
cudaEvent_t events[k_maxEvents];
int eIdx = 0;                 // index of the next event to record
float timeValue;

// Every event that will be recorded must be created first.
for (int e = 0; e < k_maxEvents; ++e)
{
    cudaEventCreate(&events[e]);
}

x1 = GetTimeMs64();
cudaEventRecord(events[eIdx++], 0);
for(int r = 0 ; r < k_outerIters ; r++)
{
    cudaEventRecord(events[eIdx++], 0);
    kernel1<<<1, x>>>(gl_devdata_ptr);

    for(int j = 0 ; j < k_innerIters; j++)
    {
        cudaEventRecord(events[eIdx++], 0);
        kernel2<<<1,x>>>(gl_devdata_ptr);

        cudaEventRecord(events[eIdx++], 0);
        kernel3<<<1,x>>>(gl_devdata_ptr);
    }

    cudaEventRecord(events[eIdx++], 0);
    kernel4<<<y, x>>> (gl_devdata_ptr);
}

// Trailing event: marks the end of the last kernel.
cudaEventRecord(events[eIdx++], 0);
// Single host/device synchronisation for the whole sequence; after it every
// recorded event has completed and can be queried.
cudaDeviceSynchronize();

x2 = GetTimeMs64();

// Measure from the first to the last event that was actually recorded.
cudaEventElapsedTime( &timeValue, events[0], events[eIdx - 1] );
printf("elapsed cuda :       %f  ms \n", timeValue);
// TODO the time between each events is the time to execute each kernel.
// On WDDM a context switch may occur between any of the kernels leading
// to higher than expected results.
// printf("elapsed sum :       %f  ms \n", sum);
// x2 - x1 is a 64-bit value: print it with a 64-bit format specifier.
printf("elapsed win :       %lld  ms \n", (long long)(x2 - x1));

// Release the events once the measurements have been read.
for (int e = 0; e < k_maxEvents; ++e)
{
    cudaEventDestroy(events[e]);
}

在Windows上，一种更容易测量时间的方法是使用QueryPerformanceCounter和QueryPerformanceFrequency

如果您将上述示例改写为不使用事件的形式，如下所示：

// NVTX header; the file shipped with the toolkit is spelled "nvToolsExt.h",
// which matters on case-sensitive file systems.
#include "nvToolsExt.h"
// Open a named NVTX range so the profiler shows the host-side time spent
// issuing the launches and waiting for them to finish.
nvtxRangePushA("CPU Time");
for(int r = 0 ; r < 2 ; r++)
{
    kernel1<<<1, x>>>(gl_devdata_ptr);

    for(int j = 0 ; j < 5; j++)
    {
        kernel2<<<1,x>>>(gl_devdata_ptr); 
        kernel3<<<1,x>>>(gl_devdata_ptr);
    }
    kernel4<<<y, x>>> (gl_devdata_ptr);
}

// Wait for all queued kernels before closing the range, so the range covers
// the complete execution rather than only the launch calls.
cudaDeviceSynchronize(); 
nvtxRangePop();

// NVTX header; the file shipped with the toolkit is spelled "nvToolsExt.h",
// which matters on case-sensitive file systems.
#include "nvToolsExt.h"
// Open a named NVTX range so the profiler shows the host-side time spent
// issuing the launches and waiting for them to finish.
nvtxRangePushA("CPU Time");
for(int r = 0 ; r < 2 ; r++)
{
    kernel1<<<1, x>>>(gl_devdata_ptr);

    for(int j = 0 ; j < 5; j++)
    {
        kernel2<<<1,x>>>(gl_devdata_ptr);
        kernel3<<<1,x>>>(gl_devdata_ptr);
    }
    kernel4<<<y, x>>> (gl_devdata_ptr);
}

// Wait for all queued kernels before closing the range, so the range covers
// the complete execution rather than only the launch calls.
cudaDeviceSynchronize();
nvtxRangePop();

然后在 Nsight Visual Studio Edition 1.5–2.2 的 CUDA Trace Activity 或 Visual Profiler 4.0+ 中运行，即可获得所有计时数据。这样得到的GPU时间比使用cudaEvent API收集的时间更精确。使用nvtxRangePush测量CPU时间范围是可选的；也可以改为测量从示例中第一个CUDA API调用开始、到cudaDeviceSynchronize返回为止的时间。

阅读您的答案后，我尝试了QueryPerformanceCounter，它工作了，从这里获得了函数：谢谢。

// NVTX header; the file shipped with the toolkit is spelled "nvToolsExt.h",
// which matters on case-sensitive file systems.
#include "nvToolsExt.h"
// Open a named NVTX range so the profiler shows the host-side time spent
// issuing the launches and waiting for them to finish.
nvtxRangePushA("CPU Time");
for(int r = 0 ; r < 2 ; r++)
{
    kernel1<<<1, x>>>(gl_devdata_ptr);

    for(int j = 0 ; j < 5; j++)
    {
        kernel2<<<1,x>>>(gl_devdata_ptr); 
        kernel3<<<1,x>>>(gl_devdata_ptr);
    }
    kernel4<<<y, x>>> (gl_devdata_ptr);
}

// Wait for all queued kernels before closing the range, so the range covers
// the complete execution rather than only the launch calls.
cudaDeviceSynchronize(); 
nvtxRangePop();