Cuda 为什么Hyper-Q在我的cc5.2硬件上选择性地重叠异步HtoD和DtoH传输？_Cuda

Cuda 为什么Hyper-Q在我的cc5.2硬件上选择性地重叠异步HtoD和DtoH传输？

cuda

Cuda 为什么Hyper-Q在我的cc5.2硬件上选择性地重叠异步HtoD和DtoH传输？,cuda,Cuda,有一个示例演示了如何使用streams和async memcpys在内核和memcpys之间以及HtoD和DtoH memcpys之间生成重叠。因此，我在GTX Titan X上运行了完整的异步示例，结果如下：如您所见，当HtoD、Kernel和DtoH在单个循环中背对背调用时，HtoD和DtoH传输之间没有任何重叠。然而，当在三个循环中分别调用它们时，HtoD和DtoH之间存在重叠如果Hyper-Q做了它声称要做的事情，那么在第一个版本的loop launching中也应该有HtoD和D

有一个示例演示了如何使用streams和async memcpys在内核和memcpys之间以及HtoD和DtoH memcpys之间生成重叠。因此，我在GTX Titan X上运行了完整的异步示例，结果如下：

如您所见，当HtoD、Kernel和DtoH在同一个循环中依次连续调用时，HtoD和DtoH传输之间没有任何重叠。然而，当它们分别在三个独立的循环中调用时，HtoD和DtoH之间存在重叠。

如果Hyper-Q确实如其所宣称的那样工作，那么在第一种（单循环）启动方式中也应该出现HtoD和DtoH重叠（就像在Tesla K20c上一样）。我的理解是，在支持Hyper-Q的compute capability 3.5及以上的设备上，用户不必再操心如何安排启动顺序。

我还运行了CUDA7.0

simpleHyperQ

示例。当

CUDA_DEVICE_MAX_CONNECTIONS

设置为32时，我可以运行32个并发内核，因此Hyper-Q在这种情况下工作

我使用64位Windows 8.1、驱动程序版本353.06和CUDA 7.0，使用Visual Studio 2013编译，目标是x64平台发布模式，代码生成属性为

compute_52、sm_52

CUDA_DEVICE_MAX_CONNECTIONS 设置为32

由于我无法发布更多链接，下面将发布异步示例的完整代码（稍加修改）

// Copyright 2012 NVIDIA Corporation

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//     http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand_kernel.h>

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Convenience wrapper for checking CUDA runtime API results; can be wrapped
// around any runtime API call.
//
// On failure the error string is printed to stderr in every build
// configuration, so release builds no longer drop errors silently. Debug
// builds (DEBUG or _DEBUG defined) additionally trip an assert.
//
// result: status returned by a CUDA runtime API call.
// Returns result unchanged, so callers can still inspect it.
inline
cudaError_t checkCuda(cudaError_t result)
{
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
#if defined(DEBUG) || defined(_DEBUG)
        // Stop at the failing call while debugging.
        assert(result == cudaSuccess);
#endif
    }
    return result;
}

// Each thread handles one element: it forms the global index
// idx = offset + (flat 1D thread index), converts idx to float, and adds
// sqrtf(sin^2 + cos^2) of that value to a[idx].
//
// a:      array the kernel reads and updates in place.
// offset: index of the first element this launch should touch.
//
// There is no bounds check: the launch must supply exactly as many threads
// as there are elements to process starting at offset.
__global__ void kernel(float *a, int offset)
{
    const int idx = blockIdx.x*blockDim.x + threadIdx.x + offset;
    const float angle = (float)idx;
    const float sinVal = sinf(angle);
    const float cosVal = cosf(angle);
    a[idx] += sqrtf(sinVal*sinVal + cosVal*cosVal);
}

// Returns the largest absolute deviation from 1.0f among the first n
// elements of a. Returns 0 when n <= 0.
float maxError(float *a, int n)
{
    float worst = 0.0f;
    int idx = 0;
    while (idx < n) {
        const float deviation = fabsf(a[idx] - 1.0f);
        worst = (deviation > worst) ? deviation : worst;
        ++idx;
    }
    return worst;
}

// Times three ways of copying a float array to the device, running `kernel`
// on it and copying it back, then prints the elapsed time and the maximum
// error of the result for each:
//   1. sequential: one blocking copy in, one kernel, one blocking copy out;
//   2. asynchronous V1: per stream, issue {copy in, kernel, copy out};
//   3. asynchronous V2: issue all copies in, then all kernels, then all
//      copies out, one per stream.
//
// argv[1] (optional): index of the CUDA device to use; defaults to 0.
// Returns 0 on success, 1 if the device id is invalid or the buffers cannot
// be allocated.
int main(int argc, char **argv)
{
    // Set before the first CUDA runtime call in this program so the runtime
    // can pick the value up when it initializes.
#if defined(_WIN32)
    _putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "32");
#else
    setenv("CUDA_DEVICE_MAX_CONNECTIONS", "32", 1);
#endif

    const int blockSize = 256, nStreams = 4;
    const int n = 4 * 1024 * blockSize * nStreams;
    const int streamSize = n / nStreams;
    // Byte counts are kept in size_t, the type the copy/alloc APIs expect.
    const size_t streamBytes = (size_t)streamSize * sizeof(float);
    const size_t bytes = (size_t)n * sizeof(float);

    int devId = 0;
    if (argc > 1) devId = atoi(argv[1]);

    // Reject device ids that do not exist instead of continuing with
    // uninitialized device properties.
    int deviceCount = 0;
    if (checkCuda(cudaGetDeviceCount(&deviceCount)) != cudaSuccess ||
        devId < 0 || devId >= deviceCount) {
        fprintf(stderr, "Invalid device id %d (%d CUDA device(s) available)\n",
            devId, deviceCount);
        return 1;
    }

    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, devId));
    printf("Device : %s\n", prop.name);
    checkCuda(cudaSetDevice(devId));

    // allocate pinned host memory and device memory
    float *a = NULL, *d_a = NULL;
    if (checkCuda(cudaMallocHost((void**)&a, bytes)) != cudaSuccess ||      // host pinned
        checkCuda(cudaMalloc((void**)&d_a, bytes)) != cudaSuccess) {        // device
        fprintf(stderr, "Failed to allocate %lu bytes of host/device memory\n",
            (unsigned long)bytes);
        if (a != NULL) cudaFreeHost(a);
        cudaDeviceReset();
        return 1;
    }

    float ms; // elapsed time in milliseconds

    // create events and streams
    cudaEvent_t startEvent, stopEvent;
    cudaStream_t stream[nStreams];
    checkCuda(cudaEventCreate(&startEvent));
    checkCuda(cudaEventCreate(&stopEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamCreate(&stream[i]));

    // baseline case - sequential transfer and execute
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    checkCuda(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice));
    kernel<<<n / blockSize, blockSize>>>(d_a, 0);
    checkCuda(cudaGetLastError()); // a launch reports errors only this way
    checkCuda(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost));
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for sequential transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // asynchronous version 1: loop over {copy, kernel, copy}
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i) {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
            streamBytes, cudaMemcpyHostToDevice,
            stream[i]));
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
        checkCuda(cudaGetLastError());
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
            streamBytes, cudaMemcpyDeviceToHost,
            stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // asynchronous version 2:
    // loop over copy, loop over kernel, loop over copy
    memset(a, 0, bytes);
    checkCuda(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < nStreams; ++i)
    {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset],
            streamBytes, cudaMemcpyHostToDevice,
            stream[i]));
    }
    for (int i = 0; i < nStreams; ++i)
    {
        int offset = i * streamSize;
        kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
        checkCuda(cudaGetLastError());
    }
    for (int i = 0; i < nStreams; ++i)
    {
        int offset = i * streamSize;
        checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset],
            streamBytes, cudaMemcpyDeviceToHost,
            stream[i]));
    }
    checkCuda(cudaEventRecord(stopEvent, 0));
    checkCuda(cudaEventSynchronize(stopEvent));
    checkCuda(cudaEventElapsedTime(&ms, startEvent, stopEvent));
    printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
    printf("  max error: %e\n", maxError(a, n));

    // cleanup
    checkCuda(cudaEventDestroy(startEvent));
    checkCuda(cudaEventDestroy(stopEvent));
    for (int i = 0; i < nStreams; ++i)
        checkCuda(cudaStreamDestroy(stream[i]));
    checkCuda(cudaFree(d_a));
    checkCuda(cudaFreeHost(a));

    checkCuda(cudaDeviceReset());

    return 0;
}

//版权所有2012 NVIDIA公司
//根据Apache许可证2.0版（以下简称“许可证”）获得许可；
//除非遵守许可证，否则不得使用此文件。
//您可以通过以下方式获得许可证副本：
//     http://www.apache.org/licenses/LICENSE-2.0
//除非适用法律要求或书面同意，软件
//根据许可证进行的分发是按“原样”进行分发的，
//无任何明示或暗示的保证或条件。
//请参阅许可证以了解管理权限和权限的特定语言
//许可证下的限制。
#包括
#包括
#包括
#包括
//用于检查CUDA运行时API结果的便利函数
//可以围绕任何运行时API调用进行包装。发布版本中没有op。
内联
cudaError\u t checkCuda（cudaError\u t结果）
{
#如果已定义（调试）| |已定义（_调试）
如果（结果！=cudaSuccess）{
fprintf（stderr，“CUDA运行时错误：%s\n”，cudaGetErrorString（结果））；
断言（结果==cudaSuccess）；
}
#恩迪夫
返回结果；
}
__全局无效内核（浮点*a，整数偏移）
{
int i=偏移量+线程IDX.x+块IDX.x*块DIM.x；
浮点数x=（浮点数）i；
浮点数s=sinf（x）；
浮点数c=cosf（x）；
a[i]=a[i]+sqrtf（s*s+c*c）；
}
浮点最大错误（浮点*a，整数n）
{
浮点最大值=0；
对于（int i=0；i最大值）最大值=错误；
}
返回最大值；
}
int main（int argc，字符**argv）
{
_putenv_s（“CUDA_设备_最大_连接”，“32”）；
常数int blockSize=256，n流=4；
常量int n=4*1024*块大小*n流；
const int streamSize=n/n流；
const int streamBytes=streamSize*sizeof（float）；
const int bytes=n*sizeof（浮点）；
int-devId=0；
如果（argc>1）设备=atoi（argv[1]）；
cudaDeviceProp支柱；
检查CUDA（cudaGetDeviceProperties（&prop，devId））；
printf（“设备：%s\n”，属性名称）；
检查CUDA（cudaSetDevice（设备））；
//分配固定主机内存和设备内存
浮动*a，*d_a；
checkCuda（cudaMallocHost（（void**）和a，bytes））；//主机固定
checkCuda（cudamaloc（（void**）和d_a，bytes））；//设备
float ms；//以毫秒为单位的运行时间
//创建事件和流
cudaEvent\u t StarteEvent、stopEvent、DummeyEvent；
cudaStream_t stream[n流]；
选中CUDA（cudaEventCreate（&startEvent））；
选中CUDA（cudaEventCreate（&stopEvent））；
选中CUDA（cudaEventCreate（&dummyEvent））；
对于（int i=0；i（d_a，0）；
检查cuda（cudaMemcpy（a，d_a，bytes，cudaMemcpyDeviceToHost））；
检查CUDA（cudaEventRecord（停止事件，0））；
检查CUDA（cudaEventSynchronize（stopEvent））；
检查CUDA（CUDAEventReleasedTime（&ms、startEvent、stopEvent））；
printf（“顺序传输和执行的时间（毫秒）：%f\n”，毫秒）；
printf（“最大错误：%e\n”，最大错误（a，n））；
//异步版本1:在{copy，kernel，copy}上循环
memset（a，0，字节）；
检查CUDA（cudaEventRecord（StarteEvent，0））；
对于（int i=0；i（d_a，偏移量）；
检查CUDA（CUDAMEMCPIASYNC（&a[偏移]，&d_a[偏移]，
streamBytes，cudaMemcpyDeviceToHost，
溪流(i);；
}
检查CUDA（cudaEventRecord（停止事件，0））；
检查CUDA（cudaEventSynchronize（stopEvent））；
检查CUDA（CUDAEventReleasedTime（&ms、startEvent、stopEvent））；
printf（“异步V1传输和执行的时间（毫秒）：%f\n”，毫秒）；
printf（“最大错误：%e\n”，最大错误（a，n））；
//异步版本2：
//循环复制，循环内核，循环复制
memset（a，0，字节）；
检查CUDA（cudaEventRecord（StarteEvent，0））；
对于（int i=0；i（d_a，偏移量）；
}
对于（int i=0；i


[hbase]相关文章推荐



                                                        
HBase KeyValue.getKey（）返回额外字符
hbase 
手动删除HBase数据
hbase 
用于从HBase读取Titan顶点的分页
hbase 
java:-2:in`load'；：java.lang.UnsatifiedLinkError:在hbase中
hbase 
Hbase 设置和配置cloudera impala
hbasehive 
从客户端计算机连接到HBase
hbaseapache-zookeeper 
具有大量动态生成的列限定符的HBase性能（在列族中）
hbase 
如何为jboss fuse创建hbase捆绑包
hbase 
使用happybase将csv文件加载到hbase时出错
hbase 
Hbase是一个柱状数据库吗
hbase 
Hbase CallTimeoutException
hbase 
在hbase中添加多个列键和值筛选器
hbase 
HBase-Phoenix-Rowkey
hbase 
hbase当特定区域服务器出现故障时，从复制的群集读取数据
hbase 
HBase压缩如何影响客户端？
hbase 
无法从Kerberised HBase读取数据
hbase 
Hbase 如何在YAML中为gremlin序列化程序注册类-类未注册：org.janusgraph.graphdb.database.StandardJanusGraph
hbasegremlin 
Hbase群集复制|监视跨群集的复制延迟
hbase 
Hbase shell-查找不存在列的所有行
hbase 
将phoenix编译到HBase导致HBaseObserver停止工作
hbase 
                                       





随机文章推荐