CUDA 性能分析:nvprof 结果中 API 调用(API calls)部分的含义是什么?
CUDA 性能分析:nvprof 结果中 API 调用的含义是什么?,cuda,Cuda,下面是我的 nvprof 结果,我试图理解 API 调用(API calls)部分的意思。API 调用中的第一项需要 4.67456s,这比 GPU 活动(GPU activities)中的第一项要长得多,这是为什么? ==25972== Profiling application: python view.py ==25972== Profiling result: Type Time(%) Time Calls Avg Min Max Name GPU activities:
下面是我的 nvprof 结果,我试图理解其中 API 调用(API calls)部分的意思。API 调用中的第一项(cudaMalloc)需要 4.67456s,这比 GPU 活动(GPU activities)中的第一项(nms_forward_kernel,共 97.765ms)要长得多,这是为什么?
==25972== Profiling application: python view.py
==25972== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 98.62% 97.765ms 16999 5.7510us 2.6560us 11.744us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::nms_forward_kernel(float*, float const *, float, int, int)
1.09% 1.0835ms 90 12.039us 992ns 48.799us [CUDA memcpy HtoD]
0.06% 58.240us 5 11.648us 11.392us 12.256us void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int)
0.06% 56.352us 2 28.176us 26.720us 29.632us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>*, bool=0 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**>)
0.05% 52.672us 3 17.557us 16.576us 19.136us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>*, bool=1 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**>)
0.03% 27.136us 1 27.136us 27.136us 27.136us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_preprocess_kernel(float const *, float*, int, int*)
0.03% 26.527us 2 13.263us 13.216us 13.311us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=0*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 19.744us 3 6.5810us 5.4720us 8.5120us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 18.528us 2 9.2640us 9.0880us 9.4400us [CUDA memcpy DtoH]
0.01% 8.2240us 1 8.2240us 8.2240us 8.2240us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_postprocess_kernel(float const *, float*, int, int*)
0.00% 3.7120us 1 3.7120us 3.7120us 3.7120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
0.00% 3.3600us 1 3.3600us 3.3600us 3.3600us void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
0.00% 2.9760us 1 2.9760us 2.9760us 2.9760us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.5600us 1 2.5600us 2.5600us 2.5600us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(float*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.3680us 1 2.3680us 2.3680us 2.3680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>)
API calls: 69.38% 4.67456s 8 584.32ms 21.948us 4.66813s cudaMalloc
19.85% 1.33738s 1 1.33738s 1.33738s 1.33738s cudaDeviceReset
6.85% 461.19ms 16999 27.130us 4.3450us 2.3428ms cudaStreamCreate
2.18% 146.78ms 17019 8.6240us 5.5850us 590.15us cudaLaunchKernel
0.78% 52.472ms 16998 3.0860us 2.3880us 491.82us cudaEventRecord
0.48% 32.347ms 16998 1.9030us 1.6020us 579.51us cudaStreamWaitEvent
0.41% 27.471ms 16998 1.6160us 1.0150us 501.06us cudaEventCreate
0.02% 1.0187ms 47 21.674us 8.9530us 82.099us cudaMemcpyAsync
0.01% 859.57us 45 19.101us 6.6610us 60.919us cudaMemcpy
0.01% 737.22us 47 15.685us 3.5030us 54.214us cudaStreamSynchronize
0.01% 513.43us 278 1.8460us 427ns 69.612us cuDeviceGetAttribute
0.01% 391.43us 430 910ns 571ns 12.840us cudaGetDevice
0.01% 353.59us 3 117.86us 116.03us 120.19us cuDeviceTotalMem
0.00% 258.63us 2 129.32us 128.63us 130.00us cudaFree
0.00% 223.59us 2 111.79us 95.946us 127.64us cudaGetDeviceProperties
0.00% 139.32us 147 947ns 715ns 7.0800us cudaSetDevice
0.00% 130.12us 240 542ns 390ns 2.9830us cudaGetDeviceCount
0.00% 113.01us 3 37.669us 23.669us 49.539us cuDeviceGetName
0.00% 101.80us 1 101.80us 101.80us 101.80us cudaDeviceSynchronize
0.00% 67.069us 2 33.534us 27.864us 39.205us cudaLaunch
0.00% 22.799us 6 3.7990us 2.7200us 6.9700us cudaFuncGetAttributes
0.00% 12.063us 12 1.0050us 822ns 1.9320us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00% 11.027us 23 479ns 403ns 754ns cudaPeekAtLastError
0.00% 5.5760us 5 1.1150us 493ns 2.9760us cuDeviceGetCount
0.00% 4.6710us 2 2.3350us 1.3820us 3.2890us cuInit
0.00% 4.6090us 6 768ns 683ns 1.0360us cudaDeviceGetAttribute
0.00% 3.9340us 1 3.9340us 3.9340us 3.9340us cuDeviceGetPCIBusId
0.00% 3.5570us 5 711ns 463ns 1.1720us cudaSetupArgument
0.00% 3.0960us 4 774ns 446ns 1.2680us cuDeviceGet
0.00% 3.0570us 2 1.5280us 1.2220us 1.8350us cudaConfigureCall
0.00% 2.2150us 2 1.1070us 975ns 1.2400us cuDriverGetVersion
0.00% 624ns 1 624ns 624ns 624ns cudaGetLastError
0.00% 526ns 1 526ns 526ns 526ns cuDeviceGetUuid
==25972== 正在分析的应用程序:python view.py
==25972== 分析结果:
类型 时间(%) 时间 调用次数 平均 最小 最大 名称
GPU 活动:98.62% 97.765ms 16999 5.7510us 2.6560us 11.744us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::nms_forward_kernel(float*, float const *, float, int, int)
1.09% 1.0835ms 90 12.039us 992ns 48.799us [CUDA memcpy HtoD]
0.06% 58.240us 5 11.648us 11.392us 12.256us void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int)
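0.06% 56.352us 2 28.176us 26.720us 29.632us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>*, bool=0 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int, int>**>)
0.05% 52.672us 3 17.557us 16.576us 19.136us void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>*, bool=1 const *, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int, int>**>)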
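0.03% 27.136us 1 27.136us 27.136us 27.136us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_preprocess_kernel(float const *, float*, int, int*)
0.03% 26.527us 2 13.263us 13.216us 13.311us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=0, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=0*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)
0.02% 19.744us 3 6.5810us 5.4720us 8.5120us void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, bool=1, bool=1, float, int>(thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, bool=1*, thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *, int, int, thrust::cuda_cub::cub::GridEvenShare<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700 const *>)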
0.02%18.528us 2 9.2640us 9.0880us 9.4400us[CUDA memcpy DtoH]
0.01% 8.2240us 1 8.2240us 8.2240us 8.2240us _GLOBAL__N__61_tmpxft_00006356_00000000_9_nms_cuda_kernel_compute_52_cpp1_ii_4795a1ea::data_postprocess_kernel(float const *, float*, int, int*)
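0.00% 3.7120us 1 3.7120us 3.7120us 3.7120us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<int>, int>, unsigned long>(thrust::device_ptr<int>, int)
0.00% 3.3600us 1 3.3600us 3.3600us 3.3600us void kernelPointwiseApply1<TensorFillOp<float>, float, unsigned int, int=1>(OffsetInfo<TensorFillOp<float>, float, unsigned int>, float, float)
0.00% 2.9760us 1 2.9760us 2.9760us 2.9760us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(int*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.5600us 1 2.5600us 2.5600us 2.5600us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(float*, thrust::cuda_cub::__transform::no_stencil_tag)
0.00% 2.3680us 1 2.3680us 2.3680us 2.3680us void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>, thrust::cuda_cub::__tabulate::functor<thrust::detail::normal_iterator<thrust::device_ptr<int>>, thrust::system::detail::generic::sequence_detail::sequence_functor<int>, long>, long>(thrust::device_ptr<int>, thrust::detail::normal_iterator<thrust::device_ptr<int>>)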
API 调用:69.38% 4.67456s 8 584.32ms 21.948us 4.66813s cudaMalloc
19.85% 1.33738s 1 1.33738s 1.33738s 1.33738s cudaDeviceReset
6.85%461.19ms 16999 27.130us 4.3450us 2.3428ms cudaStreamCreate
2.18%146.78ms 17019 8.6240us 5.5850us 590.15us cudaLaunchKernel
0.78%52.472ms 16998 3.0860us 2.3880us 491.82us cudaEventRecord
0.48% 32.347ms 16998 1.9030us 1.6020us 579.51us cudaStreamWaitEvent
0.41%27.471ms 16998 1.6160us 1.0150us 501.06us cudaEventCreate
0.02%1.0187ms 47 21.674us 8.9530us 82.099us cudaMemcpyAsync
0.01% 859.57us 45 19.101us 6.6610us 60.919us cudaMemcpy
0.01% 737.22us 47 15.685us 3.5030us 54.214us cudaStreamSynchronize
0.01%513.43us 278 1.8460us 427ns 69.612us cuDeviceGetAttribute
0.01% 391.43us 430 910ns 571ns 12.840us cudaGetDevice
0.01% 353.59us 3 117.86us 116.03us 120.19us cuDeviceTotalMem
0.00%258.63us 2 129.32us 128.63us 130.00us cudaFree
0.00% 223.59us 2 111.79us 95.946us 127.64us cudaGetDeviceProperties
0.00% 139.32us 147 947ns 715ns 7.0800us cudaSetDevice
0.00%130.12us 240 542ns 390ns 2.9830us cudaGetDeviceCount
0.00%113.01us 3 37.669us 23.669us 49.539us cuDeviceGetName
0.00% 101.80us 1 101.80us 101.80us 101.80us cudaDeviceSynchronize
0.00%67.069us 2 33.534us 27.864us 39.205us cudaLaunch
0.00% 22.799us 6 3.7990us 2.7200us 6.9700us cudaFuncGetAttributes
0.00% 12.063us 12 1.0050us 822ns 1.9320us cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
0.00%11.027us 23 479ns 403ns