CUDA基准测试中的执行时间问题_Cuda

CUDA基准测试中的执行时间问题

cuda

CUDA基准测试中的执行时间问题,cuda,Cuda,我试图分析一些CUDA Rodinia基准测试，从SM和内存利用率、功耗等方面进行分析。为此，我同时执行基准测试和分析程序，这实际上产生了一个pthread，以使用NVML库分析GPU执行情况问题是，如果我不同时调用探查器，那么基准测试的执行时间要比使用探查器执行基准测试的时间高得多（大约3倍）。CPU的频率调节器是用户空间，所以我认为CPU的频率不会改变。这是因为GPU频率的闪烁吗？下面是探查器的代码 #include <pthread.h> #include <stdi

我试图分析一些CUDA Rodinia基准测试，从SM和内存利用率、功耗等方面进行分析。为此，我同时执行基准测试和分析程序，这实际上产生了一个pthread，以使用NVML库分析GPU执行情况

问题是，如果我不同时调用探查器，那么基准测试的执行时间要比使用探查器执行基准测试的时间高得多（大约3倍）。CPU的频率调节器是用户空间，所以我认为CPU的频率不会改变。这是因为GPU频率的闪烁吗？下面是探查器的代码

#include <pthread.h>
#include <stdio.h>
#include "nvml.h"
#include "unistd.h"
#define NUM_THREADS     1

void *PrintHello(void *threadid)
{
   long tid;
   tid = (long)threadid;
  // printf("Hello World! It's me, thread #%ld!\n", tid);

nvmlReturn_t result;
nvmlDevice_t device;
nvmlUtilization_t utilization;
nvmlClockType_t jok;
unsigned int device_count, i,powergpu,clo;
char version[80];
result = nvmlInit();
result = nvmlSystemGetDriverVersion(version,80);
printf("\n Driver version: %s \n\n", version);
result = nvmlDeviceGetCount(&device_count);
printf("Found %d device%s\n\n", device_count,
device_count != 1 ? "s" : "");
printf("Listing devices:\n");
result = nvmlDeviceGetHandleByIndex(0, &device);

while(1)

{
result = nvmlDeviceGetPowerUsage(device,&powergpu );
result = nvmlDeviceGetUtilizationRates(device, &utilization);
printf("\n%d\n",powergpu);




        if (result == NVML_SUCCESS)
        {
           printf("%d\n",  utilization.gpu);
           printf("%d\n",  utilization.memory);
        }
result=nvmlDeviceGetClockInfo(device,NVML_CLOCK_SM,&clo);
if(result==NVML_SUCCESS)
{
printf("%d\n",clo);
}
usleep(500000);
}


pthread_exit(NULL);
}

int main (int argc, char *argv[])
{
   pthread_t threads[NUM_THREADS];

int rc;
   long t;
   for(t=0; t<NUM_THREADS; t++){
      printf("In main: creating thread %ld\n", t);
      rc = pthread_create(&threads[t], NULL, PrintHello, (void *)t);
      if (rc){
         printf("ERROR; return code from pthread_create() is %d\n", rc);
         exit(-1);
      }
   }

   /* Last thing that main() should do */
   pthread_exit(NULL);

}

#包括
#包括
#包括“nvml.h”
#包括“unistd.h”
#定义NUM_线程1
void*PrintHello（void*threadid）
{
长tid；
tid=（长）线程ID；
//printf（“你好，世界！是我，线程#%ld！\n”，tid）；
nvml返回结果；
NVMLT装置；
NVMLUt利用率；
nvmlClockType_t jok；
无符号整数设备计数，i，powergpu，clo；
字符版本[80]；
结果=nvmlInit（）；
结果=nvmlSystemGetDriverVersion（版本，80）；
printf（“\n驱动程序版本：%s\n\n”，版本）；
结果=nvmlDeviceGetCount（&设备计数）；
printf（“找到%d个设备%s\n\n”，设备计数，
设备计数！=1？“s”：“）；
printf（“列出设备：\n”）；
结果=nvmlDeviceGetHandleByIndex（0，&设备）；
而(1)
{
结果=nvmlDeviceGetPowerUsage（设备和powergpu）；
结果=nvmlDeviceGetUtilizationRates（设备和利用率）；
printf（“\n%d\n”，powergpu）；
如果（结果==NVML\U成功）
{
printf（“%d\n”，utilization.gpu）；
printf（“%d\n”，利用率.memory）；
}
结果=nvmlDeviceGetClockInfo（设备、NVML时钟和clo）；
如果（结果==NVML\U成功）
{
printf（“%d\n”，clo）；
}
美国LEEP（500000）；
}
pthread_exit（NULL）；
}
int main（int argc，char*argv[]）
{
pthread_t threads[NUM_threads]；
int rc；
长t；
对于（t=0；t，当您的探查器运行时，GPU将从睡眠状态中退出（由于访问了从GPU查询数据的nvml
API）。这使它们对CUDA应用程序的响应速度更快，因此如果您对整个应用程序执行计时，应用程序的运行速度似乎“更快”（例如，使用linuxtime
命令）
一种解决方案是使用nvidia smi
命令将GPU置于“持久化模式”（使用nvidia smi--help
获取命令行帮助）
另一种解决方案是从应用程序内部进行计时，并从计时测量中排除CUDA启动时间，可能通过执行CUDA命令，例如cudaFree（0）
开始计时之前。
当GPU处于空闲或睡眠状态时，GPU可能需要大量时间准备处理工作。当您运行“探查器代码”时，您正在将GPU从睡眠状态中拉出来，因此您的基准测试可能运行得更快。您在这个问题中提供的数据太少，以至于很难猜测发生了什么，甚至您的观察结果是什么。您可以尝试将GPU设置为持久化模式，这应该与运行“探查器代码”具有类似的效果顺便说一下，你似乎没有接受任何关于你以前问题的答案。正如RobertCrovella所说，尝试将GPU设置为持久模式：因此，当没有活动客户端连接到GPU时，NVIDIA驱动程序保持加载，并且避免显著的GPU初始化开销。在Linux上，这可以通过执行<代码>英伟达实现。-smi-pm 1

（

将其停用）。您的GPU可能不支持此选项。Robert Crovella-使用nvidia smi将GPU设置为持久化模式是否需要根访问权限？我已接受您对前面问题的回答。我不知道存在这种情况。根据手册页（

man nvidia smi

）是的，它需要根访问权限。