CUDA:Fermi(费米)架构上的虚假依赖(false dependency)问题
标签:cuda, nsight
我正在尝试使用 3 个流实现“三路重叠”(3-way overlap),如示例所示(原文链接缺失),但我没能做到。
我有Geforce GT 550M(带有一个复制引擎的费米体系结构),我使用的是Windows 7(64位)
这是我写的代码
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions
#include <stdio.h>
using namespace std;
#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
// Squares every element of `in` into `out` for indices [0, dataSize).
// Each thread begins at its flat global index and then advances by the total
// number of launched threads (blockDim.x * gridDim.x), so any 1D launch
// configuration covers the whole range.
__global__ void kernel(const int *in, int *out, int dataSize)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    while (idx < dataSize)
    {
        const int value = in[idx];
        out[idx] = value * value;
        idx += stride;
    }
}
// Breadth-first multi-stream test program.
//
// Splits a DATA_SIZE-element squaring job into NUM_STREAMS chunks and issues
// the work in three passes: all host-to-device copies, then all kernels
// (two launches per stream, each covering half of the chunk), then all
// device-to-host copies. The elapsed GPU time is measured with events and
// the result is compared against a CPU reference.
//
// Returns 0; CUDA failures are reported through checkCudaErrors.
int main()
{
    const int dataSize = DATA_SIZE;

    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for (int i = 0; i < dataSize; i++)
        h_in[i] = 5;
    for (int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for (int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors(cudaSetDevice(0));

    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);

    checkCudaErrors(cudaMalloc((void **)&d_in, memSize));
    checkCudaErrors(cudaMalloc((void **)&d_out, memSize));

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS, 1, 1);
    dim3 nBlocks = dim3(NUM_BLOCKS, 1, 1);
    // Each half-chunk launch uses half of the blocks, rounded up
    // (integer ceil-div, no floating-point ceil needed).
    dim3 subKernelBlock = dim3((nBlocks.x + 1) / 2);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // Per-stream chunk layout. The last stream also takes the remainder so
    // that no tail element is skipped when dataSize is not a multiple of
    // NUM_STREAMS.
    int chunkOffset[NUM_STREAMS];
    int chunkCount[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        chunkOffset[i] = i * streamSize;
        chunkCount[i] = (i == NUM_STREAMS - 1) ? (dataSize - chunkOffset[i]) : streamSize;
    }

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start, 0));

    // Pass 1: queue every host-to-device copy.
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        const int offset = chunkOffset[i];
        checkCudaErrors(cudaMemcpyAsync(&d_in[offset], &h_in[offset],
                                        chunkCount[i] * sizeof(int),
                                        cudaMemcpyHostToDevice, streams[i]));
    }

    // Pass 2: queue two kernels per stream, one per half of the chunk.
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        const int offset = chunkOffset[i];
        const int firstHalf = chunkCount[i] / 2;
        const int secondHalf = chunkCount[i] - firstHalf; // absorbs an odd element

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], firstHalf);
        checkCudaErrors(cudaGetLastError()); // launch-configuration errors

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + firstHalf], &d_out[offset + firstHalf], secondHalf);
        checkCudaErrors(cudaGetLastError());
    }

    // Pass 3: queue every device-to-host copy.
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        const int offset = chunkOffset[i];
        checkCudaErrors(cudaMemcpyAsync(&h_out[offset], &d_out[offset],
                                        chunkCount[i] * sizeof(int),
                                        cudaMemcpyDeviceToHost, streams[i]));
    }

    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaStreamSynchronize(0));
    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));
    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    checkCudaErrors(cudaDeviceReset());

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;

    // GPU output check: accumulate absolute differences so that positive and
    // negative mismatches cannot cancel each other out.
    long long sum = 0;
    for (int i = 0; i < dataSize; i++)
    {
        const long long diff = (long long)h_groundTruth[i] - h_out[i];
        sum += (diff < 0) ? -diff : diff;
    }
    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;
    return 0;
}
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
//包括,项目
#包括“helper_cuda.h”
#包括“helper\u functions.h”//helper实用程序函数
#包括
使用名称空间std;
#定义数据大小为6000000
#定义NUM_线程32
#定义NUM_块16
#定义NUM_流3
__全局无效内核(常量int*in、int*out、int数据大小)
{
int start=blockIdx.x*blockDim.x+threadIdx.x;
int end=dataSize;
for (int i = start; i …(机器翻译的代码副本在此处被截断,完整代码见上文)
从上面的评论来看,OP 遇到的似乎是虚假依赖(false dependency)问题:Fermi 架构会受到该问题的影响,而 Kepler 架构的 Hyper-Q 特性解决了这个问题。
总而言之,OP 指出了这样一个事实:第一次 D2H 传输(流 1)并没有在最后一次 H2D 传输(流 3)完成后立即开始,而原则上它是可以立即开始的。下图中的红色圆圈标出了这段时间间隔(除非另有说明,下文所有测试均在属于 Fermi 系列的 GeForce GT540M 上进行):
OP的方法是广度优先方法,根据以下方案操作:
// Breadth-first issue order (schematic, arguments elided):
// the same operation is queued on every stream before moving to the next one.
// Pass 1: host-to-device copy on each stream.
for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[i]);
// Pass 2: two kernel launches on each stream.
for(int i = 0; i < NUM_STREAMS; i++)
{
kernel_launch_1<<<..., 0, streams[i]>>>(...);
kernel_launch_2<<<..., 0, streams[i]>>>(...);
}
// Pass 3: device-to-host copy on each stream.
for(int i = 0; i < NUM_STREAMS; i++)
cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, streams[i]);
for(int i=0;i
使用深度优先的方法,按照以下方案操作
// Depth-first issue order (schematic, arguments elided):
// the full copy-in / compute / copy-out sequence is queued on one stream
// before anything is queued on the next stream.
for(int i = 0; i < NUM_STREAMS; i++)
{
cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]);
kernel_launch_1<<<...., 0, streams[i]>>>(....);
kernel_launch_2<<<...., 0, streams[i]>>>(....);
cudaMemcpyAsync(...., cudaMemcpyDeviceToHost, streams[i]);
}
for(int i=0;i
从下面的时间线来看(深度优先方法的完整代码附在答案末尾),情况并没有得到改善,重叠效果反而更差:
在广度优先的方法下,如果把第二次内核启动注释掉,第一个 D2H 拷贝就会立即开始,如下面的时间线所示:
最后,在开普勒K20c上运行代码时,问题不会出现,如下图所示:
以下是深度优先方法的代码:
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions
#include <stdio.h>
using namespace std;
#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3
// Element-wise square: out[k] = in[k] * in[k] for every k in [0, dataSize).
// A thread handles index k = (its flat global index) + m * (total launched
// threads) for m = 0, 1, 2, ... while k stays below dataSize.
__global__ void kernel(const int *in, int *out, int dataSize)
{
    const unsigned int totalThreads = blockDim.x * gridDim.x;
    int k = blockIdx.x * blockDim.x + threadIdx.x;
    while (k < dataSize)
    {
        const int v = in[k];
        out[k] = v * v;
        k += totalThreads;
    }
}
// Depth-first multi-stream test program.
//
// Splits a DATA_SIZE-element squaring job into NUM_STREAMS chunks. For each
// stream in turn it queues the host-to-device copy, two kernel launches (one
// per half of the chunk) and the device-to-host copy before moving on to the
// next stream. The elapsed GPU time is measured with events and the result
// is compared against a CPU reference.
//
// Returns 0; CUDA failures are reported through checkCudaErrors.
int main()
{
    const int dataSize = DATA_SIZE;

    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for (int i = 0; i < dataSize; i++)
        h_in[i] = 5;
    for (int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for (int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors(cudaSetDevice(0));

    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);

    checkCudaErrors(cudaMalloc((void **)&d_in, memSize));
    checkCudaErrors(cudaMalloc((void **)&d_out, memSize));

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS, 1, 1);
    dim3 nBlocks = dim3(NUM_BLOCKS, 1, 1);
    // Each half-chunk launch uses half of the blocks, rounded up
    // (integer ceil-div, no floating-point ceil needed).
    dim3 subKernelBlock = dim3((nBlocks.x + 1) / 2);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    checkCudaErrors(cudaEventRecord(start, 0));

    // Queue copy-in, both kernels and copy-out for one stream at a time.
    for (int i = 0; i < NUM_STREAMS; i++)
    {
        const int offset = i * streamSize;
        // The last stream also takes the remainder so that no tail element
        // is skipped when dataSize is not a multiple of NUM_STREAMS.
        const int count = (i == NUM_STREAMS - 1) ? (dataSize - offset) : streamSize;
        const int firstHalf = count / 2;
        const int secondHalf = count - firstHalf; // absorbs an odd element
        const size_t chunkBytes = count * sizeof(int);

        checkCudaErrors(cudaMemcpyAsync(&d_in[offset], &h_in[offset], chunkBytes,
                                        cudaMemcpyHostToDevice, streams[i]));

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], firstHalf);
        checkCudaErrors(cudaGetLastError()); // launch-configuration errors

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + firstHalf], &d_out[offset + firstHalf], secondHalf);
        checkCudaErrors(cudaGetLastError());

        checkCudaErrors(cudaMemcpyAsync(&h_out[offset], &d_out[offset], chunkBytes,
                                        cudaMemcpyDeviceToHost, streams[i]));
    }

    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaStreamSynchronize(0));
    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));

    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));
    for (int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));
    checkCudaErrors(cudaDeviceReset());

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;

    // GPU output check: accumulate absolute differences so that positive and
    // negative mismatches cannot cancel each other out.
    long long sum = 0;
    for (int i = 0; i < dataSize; i++)
    {
        const long long diff = (long long)h_groundTruth[i] - h_out[i];
        sum += (diff < 0) ? -diff : diff;
    }
    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;
    return 0;
}
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
//包括,项目
#包括“helper_cuda.h”
#包括“helper\u functions.h”//helper实用程序函数
#包括
使用名称空间std;
#定义数据大小为6000000
#定义NUM_线程32
#定义NUM_块16
#定义NUM_流3
__全局无效内核(常量int*in、int*out、int数据大小)
{
int start=blockIdx.x*blockDim.x+threadIdx.x;
int end=dataSize;
for (int i = start; i …(机器翻译的代码副本在此处被截断,完整代码见上文)
我认为这是一个虚假依赖(false dependency)问题,另请参见相关链接(原文链接缺失)。您能否具体说明您所说的“三路重叠”是什么意思?您是否想看到 D2H memcpy、计算内核和 H2D memcpy……(原文在此处被截断)