Concurrency 推力执行策略将内核发布到默认流_Concurrency_Cuda_Thrust_Cuda Streams

Concurrency Thrust 执行策略将内核发布到默认流

concurrency cuda

Concurrency 推力执行策略将内核发布到默认流,concurrency,cuda,thrust,cuda-streams,Concurrency,Cuda,Thrust,Cuda Streams,我目前正在设计一个简短的教程，展示推力模板库的各个方面和功能不幸的是，我编写的代码中似乎存在一个问题，以便演示如何使用cuda流使用复制/计算并发性我的代码可以在这里的asynchronousLaunch目录中找到：以下是产生问题的代码摘要： //STL #include <cstdlib> #include <algorithm> #include <iostream> #include <vector> #include <fun

我目前正在编写一个简短的教程，展示 Thrust 模板库的各个方面和功能。

不幸的是，我为演示如何利用 CUDA 流实现复制/计算并发而编写的代码中似乎存在一个问题。

我的代码可以在这里的asynchronousLaunch目录中找到：

以下是产生问题的代码摘要：

//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <vector>
#include <functional>

//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>

//Cuda
#include <cuda_runtime.h>

//Local
#include "AsynchronousLaunch.cu.h"

// Aborts the program with a readable message when a CUDA runtime call fails.
//   status: value returned by the CUDA runtime call being checked
//   what:   short label identifying the call, printed with the error string
static void checkCuda( cudaError_t status, const char* what )
{
    if( status != cudaSuccess )
    {
        std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString( status ) << std::endl;
        std::exit( EXIT_FAILURE );
    }
}

// Demonstrates an alternating copy/compute scheme on two CUDA streams:
// each half of a host buffer is copied to its own device vector, transformed
// with computeFunctor through a Thrust execution policy bound to a stream,
// and copied back to the host. The whole sequence is repeated 10 times.
// Returns EXIT_SUCCESS, or terminates with EXIT_FAILURE if a CUDA runtime call fails.
int main( int argc, char* argv[] )
{
    const size_t fullSize = 1024*1024*64;
    const size_t halfSize = fullSize/2;
    const size_t halfBytes = halfSize*sizeof(float);

    //Declare one host std::vector and initialize it with random values
    std::vector<float> hostVector( fullSize );
    std::generate(hostVector.begin(), hostVector.end(), normalRandomFunctor<float>(0.f,1.f) );

    //Page-lock the vector storage: cudaMemcpyAsync on pageable host memory may be
    //performed synchronously, which would defeat the copy/compute overlap we want to show
    float* const hostData = hostVector.data();
    checkCuda( cudaHostRegister( hostData, fullSize*sizeof(float), cudaHostRegisterDefault ), "cudaHostRegister" );

    //And two device vectors of half size
    thrust::device_vector<float> deviceVector0( halfSize );
    thrust::device_vector<float> deviceVector1( halfSize );
    float* const deviceData0 = thrust::raw_pointer_cast(deviceVector0.data());
    float* const deviceData1 = thrust::raw_pointer_cast(deviceVector1.data());

    //Declare and initialize two cuda streams, one per half of the data
    cudaStream_t stream0, stream1;
    checkCuda( cudaStreamCreate( &stream0 ), "cudaStreamCreate(stream0)" );
    checkCuda( cudaStreamCreate( &stream1 ), "cudaStreamCreate(stream1)" );

    //Now, we would like to perform an alternate scheme copy/compute
    for( int i = 0; i < 10; i++ )
    {
        //Wait for the end of the previous copy to host before overwriting the device data again
        checkCuda( cudaStreamSynchronize(stream0), "cudaStreamSynchronize(stream0)" );
        //Warning: thrust::copy does not handle asynchronous behaviour for host/device copy, you must use cudaMemcpyAsync to do so
        checkCuda( cudaMemcpyAsync(deviceData0, hostData, halfBytes, cudaMemcpyHostToDevice, stream0), "HtoD copy on stream0" );
        checkCuda( cudaStreamSynchronize(stream1), "cudaStreamSynchronize(stream1)" );
        //second copy is most likely to occur sequentially after the first one
        checkCuda( cudaMemcpyAsync(deviceData1, hostData+halfSize, halfBytes, cudaMemcpyHostToDevice, stream1), "HtoD copy on stream1" );

        //Compute on device: element-wise in-place transform with computeFunctor;
        //the par.on(stream) policy asks Thrust to issue its work on the given stream
        thrust::transform( thrust::cuda::par.on(stream0), deviceVector0.begin(), deviceVector0.end(), deviceVector0.begin(), computeFunctor<float>() );
        thrust::transform( thrust::cuda::par.on(stream1), deviceVector1.begin(), deviceVector1.end(), deviceVector1.begin(), computeFunctor<float>() );

        //Copy back to host
        checkCuda( cudaMemcpyAsync(hostData, deviceData0, halfBytes, cudaMemcpyDeviceToHost, stream0), "DtoH copy on stream0" );
        checkCuda( cudaMemcpyAsync(hostData+halfSize, deviceData1, halfBytes, cudaMemcpyDeviceToHost, stream1), "DtoH copy on stream1" );
    }

    //Full Synchronize before exit; this is also where asynchronous execution errors surface
    checkCuda( cudaDeviceSynchronize(), "cudaDeviceSynchronize" );

    //Release the page-lock only once no copy can still be in flight
    checkCuda( cudaHostUnregister( hostData ), "cudaHostUnregister" );

    checkCuda( cudaStreamDestroy( stream0 ), "cudaStreamDestroy(stream0)" );
    checkCuda( cudaStreamDestroy( stream1 ), "cudaStreamDestroy(stream1)" );

    return EXIT_SUCCESS;
}

//STL
#包括
#包括
#包括
#包括
#包括
//推力
#包括
#包括
#包括
#包括
//库达
#包括
//本地的
#包括“AsynchronousLaunch.cu.h”
int main（int argc，char*argv[]）
{
const size\u t fullSize=1024*1024*64；
const size\u t halfSize=全尺寸/2；
//声明一个主机std:：vector并用随机值初始化它
标准：：矢量主机矢量（全尺寸）；
std:：generate（hostVector.begin（），hostVector.end（），normalRandomFunctor（0.f，1.f））；
//和两个一半大小的设备向量
推力：设备\矢量设备矢量0（半尺寸）；
推力：设备\矢量设备矢量1（半尺寸）；
//声明并初始化两个cuda流
cudaStream_t stream0，stream1；
cudaStreamCreate（&stream0）；
cudaStreamCreate（&stream1）；
//现在，我们要执行一个备用方案复制/计算
对于（int i=0；i<10；i++）
{
//等待复制到主机的结束，然后再开始复制回设备
cudaStreamSynchronize（stream0）；
//警告：推力：：复制不处理主机/设备复制的异步行为，必须使用cudaMemcpyAsync
cudaMemcpyAsync（推力：：原始指针\u转换（deviceVector0.data（）），推力：：原始指针\u转换（hostVector.data（）），半尺寸*sizeof（float），cudaMemcpyHostToDevice，stream0）；
cudaStreamSynchronize（stream1）；
//第二个副本最有可能在第一个副本之后依次出现
CUDAMCPyaSync（推力：：原始指针\u转换（deviceVector1.data（）），推力：：原始指针\u转换（hostVector.data（））+半尺寸，半尺寸*大小（浮点），CUDAMCPyHostToDevice，stream1）；
//在设备上计算，这里包括扫描，例如直方图均衡
推力：：变换（推力：：cuda：：par.on（stream0）、deviceVector0.begin（）、deviceVector0.end（）、deviceVector0.begin（）、computeFunctor（））；
推力：：转换（推力：：cuda：：par.on（stream1），deviceVector1.begin（），deviceVector1.end（），deviceVector1.begin（），computeFunctor（））；
//复制回主机
cudaMemcpyAsync（推力：：原始指针转换（hostVector.data（）），推力：：原始指针转换（deviceVector0.data（）），半尺寸*大小（float），cudaMemcpyDeviceToHost，stream0）；
cudaMemcpyAsync（推力：：原始指针转换（hostVector.data（））+半尺寸，推力：：原始指针转换（deviceVector1.data（）），半尺寸*sizeof（float），cudaMemcpyDeviceToHost，stream1）；
}
//退出前完全同步
cudaDeviceSynchronize（）；
cudaStreamDestroy（stream0）；
cudaStreamDestroy（stream1）；
返回退出成功；
}

以下是通过 NVIDIA Visual Profiler 观察到的该程序一次运行的结果：

正如你所看到的，cudaMemcpyAsync 调用（棕色）都被发布到了流 13 和流 14，但是由 thrust::transform 生成的内核却被发布到了默认流（在截图中为蓝色）。

顺便说一下，我使用的是cuda工具包7.0.28版，带有GTX680和gcc 4.8.2

如果有人能告诉我我的代码有什么问题，我将不胜感激

先谢谢你

编辑：以下是我认为可以作为解决方案的代码：

//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <functional>
#include <vector>


//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>


//Cuda
#include <cuda_runtime.h>

//Local definitions

// Unary functor used as an artificial compute load.
// For an input value `in` it returns cos(in) added to itself 351 times in total
// (one initial evaluation plus 350 accumulations). The repeated evaluation is
// deliberate: the point is to keep the processor busy, not to be efficient.
template<typename T>
struct computeFunctor
{
    __host__ __device__
    computeFunctor() {}

    // in: input element; returns the accumulated sum described above.
    __host__ __device__
    T operator()( T in )
    {
        //Start from one evaluation, then pile up 350 more of the same term
        T accumulated = cos(in);
        int remaining = 350;
        while( remaining-- > 0 )
        {
            accumulated += cos(in);
        }
        return accumulated;
    }
};

// Compares two ways of issuing copy/compute work on several CUDA streams.
// The data is split into nbOfStrip strips, each strip owning one stream:
//   1. depth-first issue:   for each stream { HtoD copy, transform, DtoH copy }
//   2. breadth-first issue: all HtoD copies, then all transforms, then all DtoH copies
// Each scheme is repeated 5 times. Returns EXIT_SUCCESS, or terminates with
// EXIT_FAILURE if a CUDA runtime call reports an error.
int main( int argc, char* argv[] )
{
    //Reports a failed CUDA runtime call and terminates; `what` labels the call in the message
    auto check = []( cudaError_t status, const char* what )
    {
        if( status != cudaSuccess )
        {
            std::cerr << "CUDA error during " << what << ": " << cudaGetErrorString( status ) << std::endl;
            std::exit( EXIT_FAILURE );
        }
    };

    const size_t fullSize =  1024*1024*2;
    const size_t nbOfStrip = 4;
    const size_t stripSize =  fullSize/nbOfStrip;
    const size_t stripBytes = stripSize*sizeof(float);

    //Allocate host pinned memory in order to use the asynchronous api and fill it with a constant value
    float* hostVector = nullptr;
    check( cudaMallocHost(&hostVector,fullSize*sizeof(float)), "cudaMallocHost" );
    std::fill(hostVector, hostVector+fullSize, 1.0f );

    //And one device vector of the same size
    thrust::device_vector<float> deviceVector( fullSize );
    float* const deviceData = thrust::raw_pointer_cast(deviceVector.data());

    //Declare and initialize one cuda stream per strip
    std::vector<cudaStream_t> vStream(nbOfStrip);
    for( auto& stream : vStream )
    {
        check( cudaStreamCreate( &stream ), "cudaStreamCreate" );
    }

    //Now, we would like to perform an alternate scheme copy/compute in a loop using the copyToDevice/Compute/CopyToHost for each stream scheme:
    for( int i = 0; i < 5; i++ )
    {
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            const size_t nextOffset = stripSize*(j+1);
            cudaStream_t stream = vStream[j];

            //Wait until the previous round on this stream has finished before reusing the strip
            check( cudaStreamSynchronize(stream), "cudaStreamSynchronize" );
            check( cudaMemcpyAsync(deviceData+offset, hostVector+offset, stripBytes, cudaMemcpyHostToDevice, stream), "HtoD copy" );
            thrust::transform( thrust::cuda::par.on(stream), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
            check( cudaMemcpyAsync(hostVector+offset, deviceData+offset, stripBytes, cudaMemcpyDeviceToHost, stream), "DtoH copy" );
        }
    }
    //On devices that do not possess multiple queues copy engine capability, this solution serializes all commands even if they have been issued to different streams
    //Why ? Because from the point of view of the copy engine, which is a single resource in this case, there is a time dependency between HtoD(n) and DtoH(n) which is ok, but there is also
    // a false dependency between DtoH(n) and HtoD(n+1), that precludes any copy/compute overlap

    //Full Synchronize before testing second solution
    check( cudaDeviceSynchronize(), "cudaDeviceSynchronize" );

    //Now, we would like to perform an alternate scheme copy/compute in a loop using the copyToDevice for each stream /Compute for each stream /CopyToHost for each stream scheme:
    for( int i = 0; i < 5; i++ )
    {
        //Make sure the previous round has completed on every stream
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            check( cudaStreamSynchronize(vStream[j]), "cudaStreamSynchronize" );
        }
        //Issue every host-to-device copy first
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            check( cudaMemcpyAsync(deviceData+offset, hostVector+offset, stripBytes, cudaMemcpyHostToDevice, vStream[j]), "HtoD copy" );
        }
        //Then every in-place transform, each bound to the stream of its strip
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            const size_t nextOffset = stripSize*(j+1);
            thrust::transform( thrust::cuda::par.on(vStream[j]), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
        }
        //And finally every device-to-host copy
        for( size_t j = 0; j != nbOfStrip; j++ )
        {
            const size_t offset = stripSize*j;
            check( cudaMemcpyAsync(hostVector+offset, deviceData+offset, stripBytes, cudaMemcpyDeviceToHost, vStream[j]), "DtoH copy" );
        }
    }
    //On devices that do not possess multiple queues in the copy engine, this solution yields better results; on others, it should show nearly identical results

    //Full Synchronize before exit; this is also where asynchronous execution errors surface
    check( cudaDeviceSynchronize(), "cudaDeviceSynchronize" );

    for( auto& stream : vStream )
    {
        check( cudaStreamDestroy( stream ), "cudaStreamDestroy" );
    }
    check( cudaFreeHost( hostVector ), "cudaFreeHost" );

    return EXIT_SUCCESS;
}

//STL
#包括
#包括
#包括
#包括
#包括
//推力
#包括
#包括
#包括
//库达
#包括
//本地定义
模板
结构计算函数
{
__主机设备__
computeFunctor（）{}
__主机设备__
T运算符（）（T in）
{
//生成昂贵但无用指令的朴素函子
T a=cos（in）；
对于（int i=0；i<350；i++）
{
a+=cos（in）；
}
返回a；
}
};
int main（int argc，char*argv[]）
{
const size\u t fullSize=1024*1024*2；
常数大小=4；
const size_t stripSize=全尺寸/nbOfStrip；
//分配主机固定内存以使用异步api并使用随机值初始化它
浮点*向量；
cudaMallocHost（&hostVector，全尺寸*sizeof（浮动））；
std:：fill（hostVector，hostVector+fullSize，1.0f）；
//和一个相同大小的设备向量
推力：设备矢量设备矢量（全尺寸）；
//声明并初始化两个cuda流
std：：矢量vStream（nbOfStrip）；
for（auto it=vStream.begin（）；it！=vStream.end（）；it++）
{
cudaStreamCreate（&（*it））；
}
//现在，我们想在循环中为每个流方案使用copyToDevice/compute/CopyToHost执行备用方案复制/计算：
对于（int i=0；i<5；i++）
{
对于（int j=0；j！=nbOfStrip；j++）
{
尺寸\u t偏移=条带尺寸*j；
尺寸\u t nextofset=条带尺寸*（j+1）；
cudaStreamSynchronize（vStream.at（j））；
cudaMemcpyAsync（推力：原始指针转换（deviceVector.data（））+偏移量，主机向量+偏移量，stripSize*sizeof（float），cudaMemcpyHostToDevice，vStream.at（j））；
推力：：变换（推力：：cuda：：par.on（vStream.at（j）），deviceVector.begin（）+偏移量，deviceVector.begin（）+下一个函数集，deviceVector.begin（）+偏移量，computeFunctor（））；
cudaMemcpyAsync（hostVector+offset，推力：：原始指针转换（deviceVector.data（））+offset，stripSize*sizeof（float），cudaMemcpyDeviceToHost，vStream.at（j））；
}
}
//在不具备多队列复制引擎功能的设备上，此解决方案序列化所有命令，即使这些命令已发送到不同的流
//为什么？因为从复制引擎的角度来看，在本例中，复制引擎是一个单一的ressource，HtoD（n）和ressource之间存在时间依赖关系
in order to show how to use copy/compute concurrency using cuda streams