使用cudaEvent****（）与clock_gettime（）进行计时_Cuda_Timing

使用cudaEvent****（）与clock_gettime（）进行计时

cuda

使用cudaEvent****（）与clock_gettime（）进行计时,cuda,timing,Cuda,Timing,我正在尝试计时我的代码。我被告知cudaEvent****（）可以应用。同时，我的原始代码使用clock_gettime（）来计时。我打印cudaEvent****（）和clock_gettime（）测量的结果，如下所示。这就是我真正困惑的地方由cudaEvent****（）测量初始数据结构：1971.517578ms 建立上下文：0.007296ms 重新排列数据：234.271423ms 复制数据：53.402176毫秒时间步长：17221.333984ms 通过时钟_gettim

我正在尝试计时我的代码。我被告知cudaEvent****（）可以应用。同时，我的原始代码使用clock_gettime（）来计时。我打印cudaEvent****（）和clock_gettime（）测量的结果，如下所示。这就是我真正困惑的地方

由cudaEvent****（）测量

初始化数据结构：1971.517578ms
建立上下文：0.007296ms
重新排列数据：234.271423ms
复制数据：53.402176ms
时间步进：17221.333984ms

由clock_gettime（）测量

初始化数据结构：1.802874s
建立上下文：20.541891s
重新排列数据：0.235464s
复制数据：0.051851s
时间步进：8.429955s

注:

初始化数据结构：完全在CPU上工作
建立上下文：仅一行：cudaFree（（void*）0）
重新排列数据：完全在CPU上工作
复制数据：将数据从主机传输到设备
时间步进：涉及两个内核函数

Q1：cudaEvent****（）测得的“建立上下文”耗时（0.0072ms）与clock_gettime（）测得的耗时（约20.5s）相差很大。实际上，这一部分只有一行用来建立上下文的代码：cudaFree（0）。这种巨大的差异是怎么产生的？

Q2：cudaEvent****（）测得的“时间步进”耗时（约17.221s）大约是clock_gettime（）测得耗时（约8.43s）的两倍。有人告诉我原因可能是异步执行，但我并不真正理解。有人能帮我解答吗？

Q3：实际经过的挂钟时间与clock_gettime（）测得的时间非常接近。但是，我听说在为CUDA代码计时时cudaEvent****（）更可取。我不知道该选哪一个。

===========================================更新=================================== 下面是我的代码的一部分，其中定义了一些计时函数和宏

#define TIMING 1
#if TIMING
/*
 * Return a host-side timestamp in seconds (with nanosecond resolution
 * folded into the fractional part).
 *
 * Only the difference between two return values is meaningful. The
 * monotonic clock is used so that the measured interval is not distorted
 * when the system wall clock is stepped or slewed (NTP, manual changes)
 * while the program runs. If the monotonic clock is unavailable the
 * function falls back to the realtime clock.
 */
double get_time()
{
    struct timespec now = {0, 0};

    if (clock_gettime(CLOCK_MONOTONIC, &now) != 0) {
        /* Fallback keeps the helper usable on systems without a monotonic clock. */
        clock_gettime(CLOCK_REALTIME, &now);
    }
    return (double)now.tv_sec + (double)now.tv_nsec * 1.0e-9;
}
#endif
/*
 * Event-based timing helpers. Set CUDATIMING to 1 to enable them.
 *
 * Usage (each macro is written as a statement followed by ';'):
 *     cuda_timing_init;            declare and create the two events
 *     cuda_timing_begin;           record the start event on stream 0
 *     cuda_timing_stop("label");   record the stop event, wait for it, print
 *     cuda_timing_destroy;         release both events
 *
 * The macros carry no trailing semicolon and, where possible, are wrapped
 * in do { } while (0) so that they expand to exactly one statement and stay
 * correct under an unbraced if/else.
 *
 * NOTE(review): the events are created by the first macro invoked, so if
 * cuda_timing_init is the first CUDA runtime call in the program it will
 * presumably absorb the one-time context creation cost -- confirm when
 * interpreting the first measured interval.
 */
#define CUDATIMING 0
#if CUDATIMING
/* Declares startEvent, stopEvent and timeEvent in the enclosing scope, so it
 * cannot be wrapped in a do/while block. */
#define cuda_timing_init \
    cudaEvent_t startEvent, stopEvent;\
    float timeEvent;\
    cudaEventCreate(&startEvent);\
    cudaEventCreate(&stopEvent)
/* Marks the beginning of a timed interval on stream 0. */
#define cuda_timing_begin \
    do {\
        cudaEventRecord(startEvent, 0);\
    } while (0)
/* Ends the interval, waits for the stop event, and prints the elapsed time
 * (the value is printed with an "ms" suffix) labelled with str. */
#define cuda_timing_stop(str) \
    do {\
        cudaEventRecord(stopEvent, 0);\
        cudaEventSynchronize(stopEvent);\
        cudaEventElapsedTime(&timeEvent, startEvent, stopEvent);\
        printf("time spent of %s: %fms\n", (str), timeEvent);\
    } while (0)
/* Releases the events created by cuda_timing_init. */
#define cuda_timing_destroy \
    do {\
        cudaEventDestroy(startEvent);\
        cudaEventDestroy(stopEvent);\
    } while (0)
#endif

我使用这些函数和宏来计时

========================================更新20150823===============================

下面是我的代码的基本结构，包括计时。我不确定这是否有助于解决我的时间问题

/*
 * Allocate a device buffer and fill it from a host float array.
 *
 * h_p   host source buffer
 * d_pp  out: receives the address of the newly allocated device buffer
 * size  passed unchanged as the byte count to both cudaMalloc and cudaMemcpy
 *
 * Both runtime calls are routed through if_error.
 */
void
copy_float_from_host_to_dev(float *h_p, float **d_pp, int size)
{
    // Allocation: the device address is written through d_pp.
    if_error(cudaMalloc((void **)d_pp, size));

    // Upload the host data into the allocation just made.
    float *d_dst = *d_pp;
    if_error(cudaMemcpy(d_dst, h_p, size, cudaMemcpyHostToDevice));
}

/*
 * Allocate a device buffer and fill it from a host int array.
 *
 * h_p   host source buffer
 * d_pp  out: receives the address of the newly allocated device buffer
 * size  passed unchanged as the byte count to both cudaMalloc and cudaMemcpy
 *
 * Both runtime calls are routed through if_error.
 */
void
copy_int_from_host_to_dev(int *h_p, int **d_pp, int size)
{
    // Allocation: the device address is written through d_pp.
    if_error(cudaMalloc((void **)d_pp, size));

    // Upload the host data into the allocation just made.
    int *d_dst = *d_pp;
    if_error(cudaMemcpy(d_dst, h_p, size, cudaMemcpyHostToDevice));
}

/*
 * Program skeleton showing where each timed phase begins and ends.
 *
 * Phases: init data structure (CPU), establish context, rearrange data
 * (CPU), copy data host -> device, time stepping (two kernels per step).
 * With TIMING enabled the phases are measured with get_time(); with
 * CUDATIMING enabled they are measured with the cuda_timing_* macros.
 *
 * Kernel launches return to the host before the kernels have run, so the
 * host clock must not be read for the "time stepping" phase until the
 * device has finished; otherwise the host-side figure only covers launch
 * overhead plus whatever work happened to complete.
 */
int
main(int argc, char **argv)
{
    // init 
    // totally CPU codes        
    // ......
#if TIMING
    double t1, t2, t3, t4, t5, t6; 
    t1 = get_time();
#endif
#if CUDATIMING
    // NOTE(review): if these are the first CUDA runtime calls, the context
    // is presumably created here rather than by cudaFree below -- confirm
    // before comparing the "establish context" figures of the two timers.
    cuda_timing_init;
    cuda_timing_begin;
#endif
    // init data structure
    // totally CPU codes
    // ......
#if TIMING
    t2 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("init data structure");
    cuda_timing_begin;
#endif
    // establish context
    cudaFree((void*)0);
#if TIMING
    t3 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("establish context");
    cuda_timing_begin;
#endif
    // rearrange data
    // totally CPU codes
    // data on CPU side has different structure
    // compared to data on GPU side, so I need
    // to rearrange it.
    // ......
#if TIMING
    t4 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("rearrange data");
    cuda_timing_begin;
#endif
    // copy data from host to device
    // about 10 copies. the following are 2 of them
    // all use copy_float/int_from_host_to_dev 
    // h_lap --> d_lap
    copy_float_from_host_to_dev(h_lap, &d_lap, lapsize); 
    // h_etol --> d_etol
    copy_int_from_host_to_dev(h_etol, &d_etol, etolsize); 
    // ......
#if TIMING
    t5 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("copy data");
    cuda_timing_begin;
#endif
    // time stepping
    for(step = 1; step < para->nstep; step++)
    {
        /* kernel_1: matrix-vector multiplication.
         * The matrix is special, so multiplication 
         * can be very fast.  
         * atomic operations are involved
         * no data transfers between host and device */
        kernel_1<<<32768, 128>>>(......);
        /* kernel_2: vector operations.
         * Assuming that a,b,c,d are vectors,
         * what kernel_2 does is: a=2*a-b+c*d 
         * no data transfers between host and device */
        kernel_2<<<16384, 128>>>(......);
    }
#if TIMING
    // Wait for every queued kernel to finish before sampling the host
    // clock, so that t6-t5 covers kernel execution and not just the
    // launches. Routing the result through if_error also surfaces any
    // error raised while the kernels were running.
    if_error(cudaDeviceSynchronize());
    t6 = get_time();
    printf("total time: %fs\n", t6-t1);
    printf("  init data structure: %fs\n", t2-t1);
    printf("  establish context: %fs\n", t3-t2);
    printf("  rearrange data: %fs\n", t4-t3);
    printf("  copy data: %fs\n", t5-t4);
    printf("  time stepping: %fs\n", t6-t5);
#endif
#if CUDATIMING
    cuda_timing_stop("time stepping");
    cuda_timing_destroy;
#endif

    // destroy data structure
    // totally CPU codes
    // ......

    return 0;
}

void
copy_float_from_host_to_dev(float *h_p, float **d_pp, int size)
{
    if_error(cudaMalloc(d_pp, size));
    if_error(cudaMemcpy(*d_pp, h_p, size, cudaMemcpyHostToDevice));
}
void
copy_int_from_host_to_dev(int *h_p, int **d_pp, int size)
{
    if_error(cudaMalloc(d_pp, size));
    if_error(cudaMemcpy(*d_pp, h_p, size, cudaMemcpyHostToDevice));
}
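int
main(int argc, char **argv)
{
    // init
    // totally CPU codes
    // ......
#if TIMING
    double t1, t2, t3, t4, t5, t6;
    t1 = get_time();
#endif
#if CUDATIMING
    cuda_timing_init;
    cuda_timing_begin;
#endif
    // init data structure
    // totally CPU codes
    // ......
#if TIMING
    t2 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("init data structure");
    cuda_timing_begin;
#endif
    // establish context
    cudaFree((void*)0);
#if TIMING
    t3 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("establish context");
    cuda_timing_begin;
#endif
    // rearrange data
    // totally CPU codes
    // data on CPU side has different structure
    // compared to data on GPU side, so I need
    // to rearrange it.
    // ......
#if TIMING
    t4 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("rearrange data");
    cuda_timing_begin;
#endif
    // copy data from host to device
    // about 10 copies. the following are 2 of them
    // all use copy_float/int_from_host_to_dev
    // h_lap --> d_lap
    copy_float_from_host_to_dev(h_lap, &d_lap, lapsize);
    // h_etol --> d_etol
    copy_int_from_host_to_dev(h_etol, &d_etol, etolsize);
    // ......
#if TIMING
    t5 = get_time();
#endif
#if CUDATIMING
    cuda_timing_stop("copy data");
    cuda_timing_begin;
#endif
    // time stepping
    for(step = 1; step < para->nstep; step++)
    {
    /* kernel_1: matrix-vector multiplication.
     * The matrix is special, so multiplication
     * can be very fast.
     * atomic operations are involved
     * no data transfers between host and device */
    kernel_1<<<32768, 128>>>(......);
    /* kernel_2: vector operations.
     * Assuming that a,b,c,d are vectors,
     * what kernel_2 does is: a=2*a-b+c*d
     * no data transfers between host and device */
    kernel_2<<<16384, 128>>>(......);
    }
#if TIMING
    t6 = get_time();
    printf("total time: %fs\n", t6-t1);
    printf("  init data structure: %fs\n", t2-t1);
    printf("  establish context: %fs\n", t3-t2);
    printf("  rearrange data: %fs\n", t4-t3);
    printf("  copy data: %fs\n", t5-t4);
    printf("  time stepping: %fs\n", t6-t5);
#endif
#if CUDATIMING
    cuda_timing_stop("time stepping");
    cuda_timing_destroy;
#endif
    // destroy data structure
    // totally CPU codes
    // ......
    return 0;
}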

您只提供了一个代码示例，因此我只能提供一个答案：

cudaEvent****（）（0.0072ms）测量的“建立上下文”所花费的时间与clock_gettime（）测量的时间（约20.5s）相差很大。实际上，这一部分只有一行建立上下文。cudaFree（0）这种巨大的差异是如何发生的

您认为 cudaFree 调用会建立CUDA上下文，这个假设是不正确的。上下文是延迟建立的：它发生在第一个需要直接与上下文交互的调用中。在这种情况下，是您的事件计时代码建立了上下文，因此 cudaFree 调用基本上不花时间。这就是为什么两种计时方法测得的挂钟时间相差如此之大。

如果看不到代码，几乎不可能说。代码有点长，但我想我可以更新我的问题，向您展示如何计时“建立上下文”部分@Talonmies非常感谢！你真的解决了我的第一个问题！I h