CUDA性能分析:nvprof结果中“API calls”(API调用)部分的含义是什么?

CUDA评测-nvprof结果中API调用的含义是什么?,cuda,Cuda,下面是我的nvprof结果,我试图理解API调用部分的意思。API调用中的第一个需要4.67456s,这比GPU活动中的第一个要长得多,这是为什么 ==25972== Profiling application: python view.py ==25972== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities:

下面是我的 `nvprof` 结果,我试图理解其中 `API calls`(API调用)部分的含义。`API calls` 中的第一项(cudaMalloc)总共耗时 4.67456s,这比 `GPU activities`(GPU活动)中的第一项(97.765ms)要长得多,这是为什么?

==25972== Profiling application: python view.py
==25972== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   98.62%  97.765ms     16999  5.7510us  2.6560us  11.744us  _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::nms_forward_kernel(float*, float const *, float, int, int)
                    1.09%  1.0835ms        90  12.039us     992ns  48.799us  [CUDA memcpy HtoD]
                    0.06%  58.240us         5  11.648us  11.392us  12.256us  void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int)
                    0.06%  56.352us         2  28.176us  26.720us  29.632us  void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>*, bool=0 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**>)
                    0.05%  52.672us         3  17.557us  16.576us  19.136us  void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>*, bool=1 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**>)
                    0.03%  27.136us         1  27.136us  27.136us  27.136us  _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_preprocess_kernel(float const *, float*, int, int*)
                    0.03%  26.527us         2  13.263us  13.216us  13.311us  void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=0*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
                    0.02%  19.744us         3  6.5810us  5.4720us  8.5120us  void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
                    0.02%  18.528us         2  9.2640us  9.0880us  9.4400us  [CUDA memcpy DtoH]
                    0.01%  8.2240us         1  8.2240us  8.2240us  8.2240us  _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_postprocess_kernel(float const *, float*, int, int*)
                    0.00%  3.7120us         1  3.7120us  3.7120us  3.7120us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
                    0.00%  3.3600us         1  3.3600us  3.3600us  3.3600us  void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
                    0.00%  2.9760us         1  2.9760us  2.9760us  2.9760us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
                    0.00%  2.5600us         1  2.5600us  2.5600us  2.5600us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(float*, thrust::cuda_cub::__transform::no_stencil_tag)
                    0.00%  2.3680us         1  2.3680us  2.3680us  2.3680us  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>)
      API calls:   69.38%  4.67456s         8  584.32ms  21.948us  4.66813s  cudaMalloc
                   19.85%  1.33738s         1  1.33738s  1.33738s  1.33738s  cudaDeviceReset
                    6.85%  461.19ms     16999  27.130us  4.3450us  2.3428ms  cudaStreamCreate
                    2.18%  146.78ms     17019  8.6240us  5.5850us  590.15us  cudaLaunchKernel
                    0.78%  52.472ms     16998  3.0860us  2.3880us  491.82us  cudaEventRecord
                    0.48%  32.347ms     16998  1.9030us  1.6020us  579.51us  cudaStreamWaitEvent
                    0.41%  27.471ms     16998  1.6160us  1.0150us  501.06us  cudaEventCreate
                    0.02%  1.0187ms        47  21.674us  8.9530us  82.099us  cudaMemcpyAsync
                    0.01%  859.57us        45  19.101us  6.6610us  60.919us  cudaMemcpy
                    0.01%  737.22us        47  15.685us  3.5030us  54.214us  cudaStreamSynchronize
                    0.01%  513.43us       278  1.8460us     427ns  69.612us  cuDeviceGetAttribute
                    0.01%  391.43us       430     910ns     571ns  12.840us  cudaGetDevice
                    0.01%  353.59us         3  117.86us  116.03us  120.19us  cuDeviceTotalMem
                    0.00%  258.63us         2  129.32us  128.63us  130.00us  cudaFree
                    0.00%  223.59us         2  111.79us  95.946us  127.64us  cudaGetDeviceProperties
                    0.00%  139.32us       147     947ns     715ns  7.0800us  cudaSetDevice
                    0.00%  130.12us       240     542ns     390ns  2.9830us  cudaGetDeviceCount
                    0.00%  113.01us         3  37.669us  23.669us  49.539us  cuDeviceGetName
                    0.00%  101.80us         1  101.80us  101.80us  101.80us  cudaDeviceSynchronize
                    0.00%  67.069us         2  33.534us  27.864us  39.205us  cudaLaunch
                    0.00%  22.799us         6  3.7990us  2.7200us  6.9700us  cudaFuncGetAttributes
                    0.00%  12.063us        12  1.0050us     822ns  1.9320us  cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
                    0.00%  11.027us        23     479ns     403ns     754ns  cudaPeekAtLastError
                    0.00%  5.5760us         5  1.1150us     493ns  2.9760us  cuDeviceGetCount
                    0.00%  4.6710us         2  2.3350us  1.3820us  3.2890us  cuInit
                    0.00%  4.6090us         6     768ns     683ns  1.0360us  cudaDeviceGetAttribute
                    0.00%  3.9340us         1  3.9340us  3.9340us  3.9340us  cuDeviceGetPCIBusId
                    0.00%  3.5570us         5     711ns     463ns  1.1720us  cudaSetupArgument
                    0.00%  3.0960us         4     774ns     446ns  1.2680us  cuDeviceGet
                    0.00%  3.0570us         2  1.5280us  1.2220us  1.8350us  cudaConfigureCall
                    0.00%  2.2150us         2  1.1070us     975ns  1.2400us  cuDriverGetVersion
                    0.00%     624ns         1     624ns     624ns     624ns  cudaGetLastError
                    0.00%     526ns         1     526ns     526ns     526ns  cuDeviceGetUuid
==25972== 正在分析的应用程序:python view.py
==25972== 分析结果:
类型 时间(%) 时间 调用次数 平均 最小 最大 名称
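GPU活动: 98.62% 97.765ms 16999 5.7510us 2.6560us 11.744us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::nms_forward_kernel(float*, float const *, float, int, int)
1.09% 1.0835ms 90 12.039us 992ns 48.799us [CUDA memcpy HtoD]
0.06% 58.240us 5 11.648us 11.392us 12.256us void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int)
0.06% 56.352us 2 28.176us 26.720us 29.632us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>*, bool=0 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**>)
0.05% 52.672us 3 17.557us 16.576us 19.136us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>*, bool=1 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**>)
0.03% 27.136us 1 27.136us 27.136us 27.136us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_preprocess_kernel(float const *, float*, int, int*)
0.03% 26.527us 2 13.263us 13.216us 13.311us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=0*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 19.744us 3 6.5810us 5.4720us 8.5120us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 18.528us 2 9.2640us 9.0880us 9.4400us [CUDA memcpy DtoH]
0.01% 8.2240us 1 8.2240us 8.2240us 8.2240us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_postprocess_kernel(float const *, float*, int, int*)
0.00% 3.7120us 1 3.7120us 3.7120us 3.7120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
0.00% 3.3600us 1 3.3600us 3.3600us 3.3600us void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
0.00% 2.9760us 1 2.9760us 2.9760us 2.9760us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.5600us 1 2.5600us 2.5600us 2.5600us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(float*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.3680us 1 2.3680us 2.3680us 2.3680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>)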
API调用: 69.38% 4.67456s 8 584.32ms 21.948us 4.66813s cudaMalloc
19.85% 1.33738s 1 1.33738s 1.33738s 1.33738s cudaDeviceReset
6.85% 461.19ms 16999 27.130us 4.3450us 2.3428ms cudaStreamCreate
2.18% 146.78ms 17019 8.6240us 5.5850us 590.15us cudaLaunchKernel
0.78% 52.472ms 16998 3.0860us 2.3880us 491.82us cudaEventRecord
0.48% 32.347ms 16998 1.9030us 1.6020us 579.51us cudaStreamWaitEvent
0.41% 27.471ms 16998 1.6160us 1.0150us 501.06us cudaEventCreate
0.02% 1.0187ms 47 21.674us 8.9530us 82.099us cudaMemcpyAsync
0.01% 859.57us 45 19.101us 6.6610us 60.919us cudaMemcpy
0.01% 737.22us 47 15.685us 3.5030us 54.214us cudaStreamSynchronize
0.01% 513.43us 278 1.8460us 427ns 69.612us cuDeviceGetAttribute
0.01% 391.43us 430 910ns 571ns 12.840us cudaGetDevice
0.01% 353.59us 3 117.86us 116.03us 120.19us cuDeviceTotalMem
0.00% 258.63us 2 129.32us 128.63us 130.00us cudaFree
0.00% 223.59us 2 111.79us 95.946us 127.64us cudaGetDeviceProperties
0.00% 139.32us 147 947ns 715ns 7.0800us cudaSetDevice
0.00% 130.12us 240 542ns 390ns 2.9830us cudaGetDeviceCount
0.00% 113.01us 3 37.669us 23.669us 49.539us cuDeviceGetName
0.00% 101.80us 1 101.80us 101.80us 101.80us cudaDeviceSynchronize
0.00% 67.069us 2 33.534us 27.864us 39.205us cudaLaunch
0.00% 22.799us 6 3.7990us 2.7200us 6.9700us cudaFuncGetAttributes
0.00% 12.063us 12 1.0050us 822ns 1.9320us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00%11.027us 23 479ns 403ns