Cuda UVA中的PCI-e事务
在CUDA的统一虚拟寻址(UVA)中,从CPU到GPU以及从GPU到CPU的内存拷贝调用是在内部调度的。但是,nvprof CUDA性能分析器不会报告UVA的PCI-e总线事务。是否有办法了解从主机到设备以及从设备到主机的数据传输情况?是的,可以让
nvprof
报告这类活动。您可以通过以下命令查看可用的选项:
nvprof --help
如果将--print-gpu-trace
和--unified-memory-profiling per-process-device
选项组合在一起,则应该会得到一些指示UM活动的结果。
以下是一个例子:
$ cat t476.cu
#include <stdio.h>
// cudaCheckErrors(msg): fetch the CUDA runtime's last error with
// cudaGetLastError() (which also clears it) and, if it is anything other than
// cudaSuccess, print `msg`, the runtime's error string and the file/line of the
// macro use to stderr, then terminate the process with exit status 1.
// Wrapped in do { ... } while (0) so it behaves as a single statement.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t lastStatus_ = cudaGetLastError(); \
        if (cudaSuccess != lastStatus_) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(lastStatus_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
// Debug kernel: prints the int that d_data points to, then overwrites it with 0.
// There is no thread indexing or guard, so every launched thread performs the
// same print and the same store; the callers in this example launch it <<<1,1>>>.
// d_data must be dereferenceable from device code.
__global__ void mykernel(int *d_data){
    const int observed = *d_data;      // read the value before it is cleared
    printf("Data = %d\n", observed);
    *d_data = 0;                       // leave a marker the host can read back
}
// Demonstrates explicit cudaMemcpy transfers versus managed (unified) memory so
// that both kinds of host<->device traffic show up in a profiler trace.
//
// Steps:
//   1. Enumerate all devices and print whether each supports unified
//      addressing and managed memory.
//   2. Select the last enumerated device.
//   3. Round-trip one int through cudaMalloc/cudaMemcpy around mykernel.
//   4. Repeat with a cudaMallocManaged allocation, reading the result directly
//      on the host after synchronizing.
//
// Returns 0 on success. Any CUDA runtime failure aborts the process through
// cudaCheckErrors (exit status 1); finding zero devices returns 1.
int main(){
    cudaDeviceProp myprop;
    int numdevices = 0;

    cudaGetDeviceCount(&numdevices);
    cudaCheckErrors("get dev count fail");
    if (numdevices < 1) {
        // Guard against selecting device -1 below.
        fprintf(stderr, "no CUDA devices found\n");
        return 1;
    }

    for (int dev = 0; dev < numdevices; dev++){
        cudaGetDeviceProperties(&myprop, dev);
        cudaCheckErrors("get dev properties fail");
        printf("device %d: %s\n", dev, myprop.name);
        printf("device %d supports unified addressing: ", dev);
        if (myprop.unifiedAddressing) printf(" yes\n");
        else printf(" no\n");
        printf("device %d supports managed memory: ", dev);
        if (myprop.managedMemory) printf(" yes\n");
        else printf(" no\n");
    }

    // Use the last enumerated device (same choice as before, stated explicitly
    // instead of relying on the loop counter's final value).
    int mydevice = numdevices - 1;
    cudaSetDevice(mydevice);
    cudaCheckErrors("set device fail");
    printf("using device %d\n", mydevice);

    // --- explicit device allocation + cudaMemcpy ---
    int h_data = 1;
    int *d_data = NULL;
    cudaMalloc(&d_data, sizeof(int));
    cudaCheckErrors("device malloc fail");
    cudaMemcpy(d_data, &h_data, sizeof(int), cudaMemcpyHostToDevice);
    cudaCheckErrors("memcpy H2D fail");
    mykernel<<<1,1>>>(d_data);
    cudaCheckErrors("kernel launch fail");
    // Blocking copy: also waits for the kernel and surfaces its execution errors.
    cudaMemcpy(&h_data, d_data, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("kernel execution or memcpy D2H fail");
    printf("data = %d\n", h_data);

    // --- managed (unified) memory ---
    printf("now testing managed memory\n");
    int *m_data = NULL;
    cudaMallocManaged(&m_data, sizeof(int));
    cudaCheckErrors("managed mem fail");
    *m_data = 1;
    mykernel<<<1,1>>>(m_data);
    cudaCheckErrors("managed kernel launch fail");
    // The host must not touch managed memory until the kernel has finished.
    cudaDeviceSynchronize();
    cudaCheckErrors("managed kernel execution fail");
    // Print the value, not the pointer (the original passed m_data to %d).
    printf("data = %d\n", *m_data);

    cudaFree(m_data);
    cudaFree(d_data);
    cudaCheckErrors("some error");
    return 0;
}
$ nvcc -arch=sm_35 -o t476 t476.cu
$ nvprof --print-gpu-trace --unified-memory-profiling per-process-device ./t476
==5114== NVPROF is profiling process 5114, command: ./t476
device 0: GeForce GT 640
device 0 supports unified addressing: yes
device 0 supports managed memory: yes
using device 0
Data = 1
data = 0
now testing managed memory
Data = 1
data = 0
==5114== Profiling application: ./t476
==5114== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Unified Memory Name
1.10622s 1.1200us - - - - - 4B 3.5714MB/s GeForce GT 640 1 7 - [CUDA memcpy HtoD]
1.10687s 64.481us (1 1 1) (1 1 1) 32 0B 0B - - GeForce GT 640 1 7 - mykernel(int*) [102]
1.10693s 2.3360us - - - - - 4B 1.7123MB/s GeForce GT 640 1 7 - [CUDA memcpy DtoH]
1.12579s - - - - - - - - GeForce GT 640 - - 0 [Unified Memory CPU page faults]
1.12579s - - - - - - - - GeForce GT 640 - - 0 B [Unified Memory Memcpy DtoH]
1.12579s - - - - - - - - GeForce GT 640 - - 0 B [Unified Memory Memcpy HtoD]
1.12590s 64.097us (1 1 1) (1 1 1) 32 0B 0B - - GeForce GT 640 1 7 - mykernel(int*) [108]
1.12603s - - - - - - - - GeForce GT 640 - - 4096 B [Unified Memory Memcpy DtoH]
1.12603s - - - - - - - - GeForce GT 640 - - 4096 B [Unified Memory Memcpy HtoD]
1.12603s - - - - - - - - GeForce GT 640 - - 1 [Unified Memory CPU page faults]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
$
$ cat t476.cu
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void mykernel(int *d_data){
printf("Data = %d\n", *d_data);
*d_data = 0;
}
int main(){
cudaDeviceProp myprop;
int mydevice;
int numdevices;
cudaGetDeviceCount(&numdevices);
cudaCheckErrors("get dev count fail");
for (mydevice = 0; mydevice
您所说的“PCI-e事务”是指cudaMemcpy
操作吗?nvprof
即使启用了UVA,也肯定会报告这些操作。请看这个例子。请注意,UVA(统一虚拟寻址)与UM(统一内存)不同。你是指UM吗?谢谢你的回复。我想知道统一虚拟寻址(UVA)中CPU和GPU之间发生的PCI传输。UVA中是否有任何数据传输?nvprof不报告任何此类传输。您会注意到,我在您上面的问题下面提了一个问题以便澄清。你是指与零拷贝相关的传输吗?“UVA的传输”具体指什么并不清楚。也许你