
Thrust execution policy issues kernels to the default stream

Tags: concurrency, cuda, thrust, cuda-streams

I am currently designing a short tutorial exhibiting various aspects and capabilities of the Thrust template library.

Unfortunately, there seems to be a problem in the code I have written in order to show how to use copy/compute concurrency with CUDA streams.

My code can be found here, in the asynchronousLaunch directory:

Here is a summary of the code producing the problem:

//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <vector>
#include <functional>

//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>

//Cuda
#include <cuda_runtime.h>

//Local
#include "AsynchronousLaunch.cu.h"

int main( int argc, char* argv[] )
{
    const size_t fullSize = 1024*1024*64;
    const size_t halfSize = fullSize/2;

    //Declare one host std::vector and initialize it with random values
    std::vector<float> hostVector( fullSize );
    std::generate(hostVector.begin(), hostVector.end(), normalRandomFunctor<float>(0.f,1.f) );

    //And two device vectors of half size
    thrust::device_vector<float> deviceVector0( halfSize );
    thrust::device_vector<float> deviceVector1( halfSize );

    //Declare and initialize two CUDA streams
    cudaStream_t stream0, stream1;
    cudaStreamCreate( &stream0 );
    cudaStreamCreate( &stream1 );

    //Now, we would like to perform an alternating copy/compute scheme
    for( int i = 0; i < 10; i++ )
    {
        //Wait for the end of the copy to host before starting to copy back to device
        cudaStreamSynchronize(stream0);
        //Warning: thrust::copy does not handle asynchronous behaviour for host/device copies; you must use cudaMemcpyAsync to do so
        cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector0.data()), thrust::raw_pointer_cast(hostVector.data()), halfSize*sizeof(float), cudaMemcpyHostToDevice, stream0);
        cudaStreamSynchronize(stream1);
        //second copy is most likely to occur sequentially after the first one
        cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector1.data()), thrust::raw_pointer_cast(hostVector.data())+halfSize, halfSize*sizeof(float), cudaMemcpyHostToDevice, stream1);

        //Compute on device; here an element-wise transform (standing in, for instance, for a histogram equalization step)
        thrust::transform( thrust::cuda::par.on(stream0), deviceVector0.begin(), deviceVector0.end(), deviceVector0.begin(), computeFunctor<float>() );
        thrust::transform( thrust::cuda::par.on(stream1), deviceVector1.begin(), deviceVector1.end(), deviceVector1.begin(), computeFunctor<float>() );

        //Copy back to host
        cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data()), thrust::raw_pointer_cast(deviceVector0.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream0);
        cudaMemcpyAsync(thrust::raw_pointer_cast(hostVector.data())+halfSize, thrust::raw_pointer_cast(deviceVector1.data()), halfSize*sizeof(float), cudaMemcpyDeviceToHost, stream1);
    }

    //Full Synchronize before exit
    cudaDeviceSynchronize();

    cudaStreamDestroy( stream0 );
    cudaStreamDestroy( stream1 );

    return EXIT_SUCCESS;
}
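
One thing worth noting about this listing: hostVector is an ordinary pageable std::vector, and cudaMemcpyAsync only runs truly asynchronously when the host buffer is pinned (which is why the edit below allocates it with cudaMallocHost). As an alternative, here is a minimal sketch, assuming one wants to keep the std::vector, that pins its existing storage with cudaHostRegister:

//Minimal sketch (illustration only): pin the std::vector's storage so that
//cudaMemcpyAsync can actually overlap with computation instead of falling back
//to a synchronous staged copy on pageable memory
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>

int main( int argc, char* argv[] )
{
    const size_t fullSize = 1024*1024*64;
    std::vector<float> hostVector( fullSize );

    //Register the vector's memory with the CUDA driver (pin it)
    cudaHostRegister( hostVector.data(), fullSize*sizeof(float), cudaHostRegisterDefault );

    //... issue the cudaMemcpyAsync / thrust::transform calls as in the listing above ...

    //Unpin before the vector is destroyed
    cudaHostUnregister( hostVector.data() );

    return EXIT_SUCCESS;
}
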
Here are the results observed for one instance of the program with the NVIDIA Visual Profiler:

As you can see, the cudaMemcpy operations (in brown) are both issued to stream 13 and stream 14, but the kernels generated by Thrust from thrust::transform are issued to the default stream (in blue in the capture).

By the way, I am using CUDA Toolkit version 7.0.28, with a GTX 680 and gcc 4.8.2.
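
As a quick sanity check, the stream-aware policy thrust::cuda::par.on() is only available starting with Thrust 1.8, so it can be worth printing the Thrust version that the compiler actually picks up; a minimal sketch:

//Sanity-check sketch: print the Thrust version seen at compile time
//(thrust::cuda::par.on() requires Thrust 1.8 or later, which is the version shipped with CUDA 7.0)
#include <cstdlib>
#include <iostream>
#include <thrust/version.h>

int main( int argc, char* argv[] )
{
    std::cout << "Thrust version: "
              << THRUST_MAJOR_VERSION << "."
              << THRUST_MINOR_VERSION << "."
              << THRUST_SUBMINOR_VERSION << std::endl;
    return EXIT_SUCCESS;
}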

I would be grateful if someone could tell me what is wrong with my code.

Thank you in advance.

Edit: here is the code that I believe to be the solution:

//STL
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <functional>
#include <vector>


//Thrust
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/execution_policy.h>


//Cuda
#include <cuda_runtime.h>

//Local definitions

template<typename T>
struct computeFunctor
{
    __host__ __device__
    computeFunctor() {}

    __host__ __device__
    T operator()( T in )
    {
        //Naive functor that generates expensive but useless instructions
        T a =  cos(in);
        for(int i = 0; i < 350; i++ )
        {
            a+=cos(in);
        }
        return a;
    }
};

int main( int argc, char* argv[] )
{
    const size_t fullSize =  1024*1024*2;
    const size_t nbOfStrip = 4;
    const size_t stripSize =  fullSize/nbOfStrip;

    //Allocate pinned host memory, in order to use the asynchronous API, and initialize it
    float* hostVector;
    cudaMallocHost(&hostVector,fullSize*sizeof(float));
    std::fill(hostVector, hostVector+fullSize, 1.0f );

    //And one device vector of the same size
    thrust::device_vector<float> deviceVector( fullSize );

    //Declare and initialize one CUDA stream per strip
    std::vector<cudaStream_t> vStream(nbOfStrip);
    for( auto it = vStream.begin(); it != vStream.end(); it++ )
    {
        cudaStreamCreate( &(*it) );
    }

    //Now, we would like to perform an alternating copy/compute scheme in a loop, issuing copyToDevice/Compute/CopyToHost back to back for each stream:
    for( int i = 0; i < 5; i++ )
    {
        for( int j=0; j!=nbOfStrip; j++)
        {
            size_t offset = stripSize*j;
            size_t nextOffset = stripSize*(j+1);
            cudaStreamSynchronize(vStream.at(j));
            cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector.data())+offset, hostVector+offset, stripSize*sizeof(float), cudaMemcpyHostToDevice, vStream.at(j));
            thrust::transform( thrust::cuda::par.on(vStream.at(j)), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );
            cudaMemcpyAsync(hostVector+offset, thrust::raw_pointer_cast(deviceVector.data())+offset, stripSize*sizeof(float), cudaMemcpyDeviceToHost, vStream.at(j));
        }
    }
    //On devices that do not have multiple copy-engine queues, this scheme serializes all commands even though they were issued to different streams.
    //Why? From the point of view of the copy engine, which is a single resource in this case, there is a genuine time dependency between HtoD(n) and DtoH(n), but there is also
    //a false dependency between DtoH(n) and HtoD(n+1) that precludes any copy/compute overlap

    //Full Synchronize before testing second solution
    cudaDeviceSynchronize();

    //Now, we would like to perform the same copy/compute scheme in a loop, but issuing all copyToDevice calls, then all Compute calls, then all copyToHost calls across the streams:
    for( int i = 0; i < 5; i++ )
    {
        for( int j=0; j!=nbOfStrip; j++)
        {
            cudaStreamSynchronize(vStream.at(j));
        }
        for( int j=0; j!=nbOfStrip; j++)
        {
            size_t offset = stripSize*j;
            cudaMemcpyAsync(thrust::raw_pointer_cast(deviceVector.data())+offset, hostVector+offset, stripSize*sizeof(float), cudaMemcpyHostToDevice, vStream.at(j));
        }
        for( int j=0; j!=nbOfStrip; j++)
        {
            size_t offset = stripSize*j;
            size_t nextOffset = stripSize*(j+1);
            thrust::transform( thrust::cuda::par.on(vStream.at(j)), deviceVector.begin()+offset, deviceVector.begin()+nextOffset, deviceVector.begin()+offset, computeFunctor<float>() );

        }
        for( int j=0; j!=nbOfStrip; j++)
        {
            size_t offset = stripSize*j;
            cudaMemcpyAsync(hostVector+offset, thrust::raw_pointer_cast(deviceVector.data())+offset, stripSize*sizeof(float), cudaMemcpyDeviceToHost, vStream.at(j));
        }
    }
    //On devices that do not have multiple copy-engine queues, this scheme yields better results; on others, it should show nearly identical results

    //Full Synchronize before exit
    cudaDeviceSynchronize();

    for( auto it = vStream.begin(); it != vStream.end(); it++ )
    {
        cudaStreamDestroy( *it );
    }
    cudaFreeHost( hostVector );

    return EXIT_SUCCESS;
}
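
Whether the two issue orders above actually behave differently depends on how many copy-engine queues the device exposes; a minimal sketch, using the standard runtime API, of how to query that:

//Minimal sketch: query the number of asynchronous copy engines of device 0.
//asyncEngineCount == 1 means HtoD and DtoH transfers share a single queue,
//asyncEngineCount == 2 means the two directions can also overlap with each other
#include <cstdlib>
#include <iostream>
#include <cuda_runtime.h>

int main( int argc, char* argv[] )
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties( &prop, 0 );
    std::cout << prop.name << ": asyncEngineCount = " << prop.asyncEngineCount << std::endl;
    return EXIT_SUCCESS;
}

On a device reporting a single copy engine, the second (breadth-first) issue order is the one that avoids the false dependency between DtoH(n) and HtoD(n+1) described in the comments above.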