Cuda 为什么Hyper-Q在我的cc5.2硬件上选择性地重叠异步HtoD和DtoH传输?
有一个示例演示了如何使用流(stream)和异步内存拷贝(cudaMemcpyAsync)让内核执行与内存拷贝之间、以及HtoD与DtoH拷贝之间相互重叠。因此,我在GTX Titan X上运行了完整的异步示例,结果如下:如您所见,当HtoD、内核和DtoH在同一个循环中背靠背地发起时,HtoD和DtoH传输之间没有任何重叠;然而,当它们分别在三个循环中发起时,HtoD和DtoH之间存在重叠。如果Hyper-Q确实如其所宣称的那样工作,那么在第一种循环启动方式中也应该出现HtoD和DtoH重叠(就像在Tesla K20c上一样)。我的理解是,在支持Hyper-Q的compute capability 3.5及以上的设备上,用户不必再为调整启动顺序而操心。我还运行了CUDA 7.0的
simpleHyperQ
示例。当CUDA_DEVICE_MAX_CONNECTIONS
设置为32时,我可以运行32个并发内核,因此Hyper-Q在这种情况下工作
我使用64位Windows 8.1、驱动程序版本353.06和CUDA 7.0,使用Visual Studio 2013编译,目标是x64平台发布模式,代码生成属性为compute_52、sm_52
CUDA_DEVICE_MAX_CONNECTIONS设置为32。
由于我无法发布更多链接,下面将发布异步示例的完整代码(稍加修改)
// Copyright 2012 NVIDIA Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>
// Pass-through checker for CUDA runtime API return codes.
//
// Wrap any runtime call in checkCuda(...). The status is always returned
// unchanged to the caller. When DEBUG or _DEBUG is defined, a non-success
// status is additionally printed to stderr and then trips an assert; in all
// other builds the function performs no checking at all.
inline cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    // Fast path: nothing to report.
    if (result == cudaSuccess) {
        return result;
    }
    fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
    assert(result == cudaSuccess);
#endif
    return result;
}
// Adds sqrt(sin(i)^2 + cos(i)^2) to a[i], where i is the thread's flat 1D
// global index shifted by `offset`.
//
// Launch layout: 1D grid of 1D blocks, one element per thread.
// Precondition: there is no bounds check, so every index in
// [offset, offset + gridDim.x * blockDim.x) must be a valid element of `a`.
__global__ void kernel(float *a, int offset)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x + offset;
    const float angle = (float)idx;
    const float sinVal = sinf(angle);
    const float cosVal = cosf(angle);
    a[idx] += sqrtf(sinVal * sinVal + cosVal * cosVal);
}
// Returns the largest absolute deviation from 1.0f among the first `n`
// elements of host array `a`. Returns 0 when `n` is zero or negative.
float maxError(float *a, int n)
{
    float worst = 0.0f;
    int idx = 0;
    while (idx < n) {
        const float deviation = fabsf(a[idx] - 1.0f);
        worst = (deviation > worst) ? deviation : worst;
        ++idx;
    }
    return worst;
}
// Benchmarks three ways of pushing `n` floats through `kernel`:
//   1. sequential: one blocking HtoD copy, one launch, one blocking DtoH copy;
//   2. async V1:   per stream, issue {HtoD copy, kernel, DtoH copy} back-to-back;
//   3. async V2:   issue all HtoD copies, then all kernels, then all DtoH copies.
// Each variant is timed with CUDA events recorded on the default stream, and
// the result is validated with maxError() (the host buffer starts at 0 and
// the kernel adds sqrt(sin^2 + cos^2), so every element is expected near 1).
//
// argv[1] (optional): index of the CUDA device to use; defaults to 0.
// Always returns 0.
int main(int argc, char **argv)
{
    // Set before any CUDA runtime call is made in this process.
    // _putenv_s exists only on Windows; setenv is the POSIX equivalent
    // (last argument 1 == overwrite an existing value, matching _putenv_s).
#if defined(_WIN32)
    _putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32");
#else
    setenv("CUDA_DEVICE_MAX_CONNECTIONS", "32", 1);
#endif

    const int blockSize = 256, nStreams = 4;
    const int n = 4 * 1024 * blockSize * nStreams;
    const int streamSize = n / nStreams;                    // elements per stream
    const size_t streamBytes = streamSize * sizeof(float);  // bytes per stream
    const size_t bytes = n * sizeof(float);                 // bytes in total

    int devId = 0;
    if (argc > 1) devId = atoi(argv[1]);

    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, devId));
    printf("Device : %s\n", prop.name);
    checkCuda(cudaSetDevice(devId));

    // allocate pinned host memory and device memory
    float *a, *d_a;
    checkCuda(cudaMallocHost((void**)&a, bytes)); // host pinned
    checkCuda(cudaMalloc((void**)&d_a, bytes));   // device

    float ms; // elapsed time in milliseconds

    // create events and streams
    cudaEvent_t startEvent, stopEvent;
    cudaStream_t stream[nStreams];
    checkCuda(cudaEventCreate(&startEvent));
    checkCuda(cudaEventCreate(&stopEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamCreate(&stream[i]));

    // baseline case - sequential transfer and execute
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    kernel<<<n / blockSize, blockSize>>>(d_a, 0);
    // A launch returns no status itself; fetch launch-configuration errors here.
    checkCuda(cudaGetLastError());
    checkCuda(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for sequential transfer and execute (ms): %f\n", ms);
    printf(" max error: %e\n", maxError(a, n));

    // asynchronous version 1: loop over {copy, kernel, copy}
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
                                  streamBytes, cudaMemcpyHostToDevice,
                                  stream[i]));
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
        // Host-side query only: does not wait for the device, so the order in
        // which work is issued to the streams is unchanged.
        checkCuda(cudaGetLastError());
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
                                  streamBytes, cudaMemcpyDeviceToHost,
                                  stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
    printf(" max error: %e\n", maxError(a, n));

    // asynchronous version 2:
    // loop over copy, loop over kernel, loop over copy
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
                                  streamBytes, cudaMemcpyHostToDevice,
                                  stream[i]));
    }
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
        checkCuda(cudaGetLastError());
    }
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
                                  streamBytes, cudaMemcpyDeviceToHost,
                                  stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
    printf(" max error: %e\n", maxError(a, n));

    // cleanup
    checkCuda(cudaEventDestroy(startEvent));
    checkCuda(cudaEventDestroy(stopEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamDestroy(stream[i]));
    checkCuda(cudaFree(d_a));
    checkCuda(cudaFreeHost(a));
    checkCuda(cudaDeviceReset());
    return 0;
}
//版权所有2012 NVIDIA公司
//根据Apache许可证2.0版(以下简称“许可证”)获得许可;
//除非遵守许可证,否则不得使用此文件。
//您可以通过以下方式获得许可证副本:
// http://www.apache.org/licenses/LICENSE-2.0
//除非适用法律要求或书面同意,软件
//根据许可证进行的分发是按“原样”进行分发的,
//无任何明示或暗示的保证或条件。
//请参阅许可证以了解管理权限和权限的特定语言
//许可证下的限制。
#包括
#包括
#包括
#包括
//用于检查CUDA运行时API结果的便利函数
//可以围绕任何运行时API调用进行包装。发布版本中没有op。
内联
cudaError\u t checkCuda(cudaError\u t结果)
{
#如果已定义(调试)| |已定义(_调试)
如果(结果!=cudaSuccess){
fprintf(stderr,“CUDA运行时错误:%s\n”,cudaGetErrorString(结果));
断言(结果==cudaSuccess);
}
#恩迪夫
返回结果;
}
__全局无效内核(浮点*a,整数偏移)
{
int i=偏移量+线程IDX.x+块IDX.x*块DIM.x;
浮点数x=(浮点数)i;
浮点数s=sinf(x);
浮点数c=cosf(x);
a[i]=a[i]+sqrtf(s*s+c*c);
}
浮点最大错误(浮点*a,整数n)
{
浮点最大值=0;
对于(int i=0;i最大值)最大值=错误;
}
返回最大值;
}
int main(int argc,字符**argv)
{
_putenv_s(“CUDA_设备_最大_连接”,“32”);
常数int blockSize=256,n流=4;
常量int n=4*1024*块大小*n流;
const int streamSize=n/n流;
const int streamBytes=streamSize*sizeof(float);
const int bytes=n*sizeof(浮点);
int-devId=0;
如果(argc>1)设备=atoi(argv[1]);
cudaDeviceProp支柱;
检查CUDA(cudaGetDeviceProperties(&prop,devId));
printf(“设备:%s\n”,属性名称);
检查CUDA(cudaSetDevice(设备));
//分配固定主机内存和设备内存
浮动*a,*d_a;
checkCuda(cudaMallocHost((void**)和a,bytes));//主机固定
checkCuda(cudamaloc((void**)和d_a,bytes));//设备
float ms;//以毫秒为单位的运行时间
//创建事件和流
cudaEvent\u t StarteEvent、stopEvent、DummeyEvent;
cudaStream_t stream[n流];
选中CUDA(cudaEventCreate(&startEvent));
选中CUDA(cudaEventCreate(&stopEvent));
选中CUDA(cudaEventCreate(&dummyEvent));
对于(int i=0;i(d_a,0);
检查cuda(cudaMemcpy(a,d_a,bytes,cudaMemcpyDeviceToHost));
检查CUDA(cudaEventRecord(停止事件,0));
检查CUDA(cudaEventSynchronize(stopEvent));
检查CUDA(CUDAEventReleasedTime(&ms、startEvent、stopEvent));
printf(“顺序传输和执行的时间(毫秒):%f\n”,毫秒);
printf(“最大错误:%e\n”,最大错误(a,n));
//异步版本1:在{copy,kernel,copy}上循环
memset(a,0,字节);
检查CUDA(cudaEventRecord(StarteEvent,0));
对于(int i=0;i(d_a,偏移量);
检查CUDA(CUDAMEMCPIASYNC(&a[偏移],&d_a[偏移],
streamBytes,cudaMemcpyDeviceToHost,
溪流(i);;
}
检查CUDA(cudaEventRecord(停止事件,0));
检查CUDA(cudaEventSynchronize(stopEvent));
检查CUDA(CUDAEventReleasedTime(&ms、startEvent、stopEvent));
printf(“异步V1传输和执行的时间(毫秒):%f\n”,毫秒);
printf(“最大错误:%e\n”,最大错误(a,n));
//异步版本2:
//循环复制,循环内核,循环复制
memset(a,0,字节);
检查CUDA(cudaEventRecord(StarteEvent,0));
对于(int i=0;i(d_a,偏移量);
}
对于(int i=0;i