CUDA:调用 thrust::for_each_n 之后 thrust::reduce 变慢

CUDA:调用 thrust::for_each_n 之后 thrust::reduce 变慢,cuda,gpgpu,thrust,Cuda,Gpgpu,Thrust,我正在尝试使用 Thrust 在 GK107 [GeForce GTX 650] 上计算一组数字的总和。在显存中初始化一个 device_vector<curandState> 之后,thrust::reduce 的执行时间显著增加,这让我感到困惑。以下是示例代码: #include <iostream> #include <stack> #include <ctime> #include <thrust/device_vector.h> #include <thrust/iterator/counting_iterator.h>

我正在尝试使用
Thrust
在 GK107 [GeForce GTX 650] 上
计算一组数字的总和。在显存中初始化一个
device_vector<curandState>
之后,
thrust::reduce
的执行时间显著增加,这让我感到困惑。

以下是示例代码:

#include <iostream>
#include <stack>
#include <ctime>

#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>
#include <thrust/for_each.h>
#include <curand.h>
#include <curand_kernel.h>

/**
 * Minimal tic/toc stopwatch built on clock().
 *
 * tic() pushes the current clock() reading onto a stack; toc() prints the
 * time elapsed since the most recent unmatched tic() and pops that reading,
 * so tic()/toc() pairs may be nested.
 *
 * The elapsed value is the clock() difference divided by CLOCKS_PER_SEC,
 * printed in seconds to std::cout.
 */
struct tic_toc{
    // Pending start readings; the top belongs to the innermost open tic().
    std::stack<clock_t> tictoc_stack;

    /** Record the current clock() reading as a new start point. */
    inline void tic() { tictoc_stack.push(clock());}

    /**
     * Print the time since the matching tic() and discard that start point.
     *
     * Calling toc() without a pending tic() is reported on std::cerr and is
     * otherwise a no-op; reading top() of an empty std::stack would be
     * undefined behaviour.
     */
    inline void toc() {
        if (tictoc_stack.empty()) {
            std::cerr << "tic_toc::toc() called without a matching tic()"
                << std::endl;
            return;
        }
        std::cout << "Time elapsed: "
            << ((double)(clock() - tictoc_stack.top())) / CLOCKS_PER_SEC << "s"
            << std::endl;
        tictoc_stack.pop();
    }
};

/**
 * Device functor that initialises one curandState per (index, state) pair.
 *
 * Each invocation receives a tuple holding an integer index and a reference
 * to the curandState slot to fill. The index is handed to curand_init as its
 * second argument and 0 as its third; every invocation shares the same seed.
 */
struct curand_setup{
    // Element type the functor is applied to: (index, reference to state).
    using init_tuple = thrust::tuple<int, curandState &>;

    // Seed passed unchanged to every curand_init call.
    const unsigned long long seed;

    curand_setup(unsigned long long _seed) : seed(_seed) {}

    __device__ void operator()(init_tuple t){
        const int sequence_id = thrust::get<0>(t);

        // Initialise a local state first, then copy it into the destination
        // slot referenced by the tuple.
        curandState local_state;
        curand_init(seed, sequence_id, 0, &local_state);

        curandState &destination = thrust::get<1>(t);
        destination = local_state;
    }
};

// Reproducer: times thrust::reduce over a device vector of floats three
// times -- once before, and twice after, a vector of curandState is
// initialised with thrust::for_each_n. Each timing is printed by tic_toc.
int main(int argc, char** argv){
    // Element count: 1<<18 = 262144.
    int N = 1<<18;
    std::cout << "N " << N << std::endl;
    tic_toc tt;

    // N floats, every element set to 1.
    thrust::device_vector<float> val(N,1);

    // Timing #1: reduce before any curandState work has been issued.
    tt.tic();
    float mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();

    // Allocate N curandState objects and pair element i with the index i
    // through a zip iterator, so curand_setup receives (i, state_i).
    thrust::device_vector<curandState> rand_state(N);
    auto rand_init_it = thrust::make_zip_iterator(
            thrust::make_tuple(thrust::counting_iterator<int>(0),rand_state.begin()));
    thrust::for_each_n(rand_init_it, N, curand_setup(0));
    // NOTE(review): nothing synchronises with the device between the
    // for_each_n call above and the tic() below. If for_each_n returns before
    // its device work has finished, timing #2 also contains the remaining
    // for_each_n time -- confirm with a profiler.

    // Timing #2: first reduce issued after the curandState initialisation.
    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();

    // Timing #3: the same reduce repeated once more.
    tt.tic();
    mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
    tt.toc();

    return 0;
}
无论是我自己编写求和内核,还是把数据复制到
thrust::host_vector
再进行归约,情况都没有改变。

为什么在初始化
thrust::device_vector<curandState>
之后,thrust::reduce
会变得如此之慢?有没有办法避免这个问题?我将非常感谢你的帮助。

我的系统是
Linux Mint 18.3
,内核版本为
4.15.0-23-generic

nvcc --version 的输出:

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17

为什么在初始化
thrust::device_vector<curandState>
之后,thrust::reduce 会变得如此之慢

它并没有变慢。让你困惑的根源在于你的计时方法,这种计时方法是不正确的。

通常,在设备上执行的 Thrust API 调用相对于主机是异步的。唯一的例外是有返回值的调用(
thrust::reduce
就是其中之一)。因此,代码中间的那次计时测量的不仅是
thrust::reduce
的执行时间,还包含了它之前的
thrust::for_each_n
调用的执行时间,而真正慢得多的正是前面这个调用。

您可以通过两种方式自行确认这一点。如果您这样修改 Thrust 代码:

// Timing #1: reduce before any curandState work has been issued.
tt.tic();
float mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
tt.toc();

// Allocate the curandState vector and initialise element i from index i.
thrust::device_vector<curandState> rand_state(N);
auto rand_init_it = thrust::make_zip_iterator(
        thrust::make_tuple(thrust::counting_iterator<int>(0),rand_state.begin()));
thrust::for_each_n(rand_init_it, N, curand_setup(0));
cudaDeviceSynchronize(); // block the host until for_each_n has finished, so its runtime is not counted in the next timed reduce

// Timing #2: first reduce after the (now completed) initialisation.
tt.tic();
mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
tt.toc();

// Timing #3: the same reduce repeated once more.
tt.tic();
mean=thrust::reduce(val.begin(),val.end(),0.f,thrust::plus<float>())/N;
tt.toc();
$ nvcc -arch=sm_52 -std=c++11 -o slow_thrust slow_thrust.cu 
$ ./slow_thrust 
N 262144
Time elapsed: 0.000471s
Time elapsed: 0.000621s
Time elapsed: 0.000448s
也就是说,当使用
cudaDeviceSynchronize()
等待前一个调用执行完毕、使其运行时间不再计入后续计时之后,所有 reduce 调用的运行时间大致相同。或者,您也可以对原始代码使用性能分析工具,例如:

$ nvprof --print-gpu-trace ./slow_thrust
N 262144
==7870== NVPROF is profiling process 7870, command: ./slow_thrust
Time elapsed: 0.000521s
Time elapsed: 0.06983s
Time elapsed: 0.000538s
==7870== Profiling application: ./slow_thrust
==7870== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
214.30ms  7.6800us            (512 1 1)       (256 1 1)         8        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>(thrust::device_ptr<float>, float) [109]
214.56ms  5.8550us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [128]
214.58ms  2.7200us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [136]
214.60ms  1.1840us                    -               -         -         -         -        4B  3.2219MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
214.98ms  221.27us            (512 1 1)       (256 1 1)        20        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>(thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW) [151]
219.51ms  69.492ms            (512 1 1)       (256 1 1)       108        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>, thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>(thrust::use_default, thrust::use_default) [160]
289.00ms  9.5360us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [179]
289.01ms  3.4880us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [187]
289.07ms  1.3120us                    -               -         -         -         -        4B  2.9075MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
289.66ms  9.9520us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [211]
289.68ms  3.3280us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [219]
289.69ms  1.3120us                    -               -         -         -         -        4B  2.9075MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
$nvprof——打印gpu跟踪。/slow\u
N 262144
==7870==NVPROF正在分析过程7870,命令:./slow\u推力
所用时间:0.000521s
所用时间:0.06983s
所用时间:0.000538s
==7870==仿形应用:./slow\u推力
==7870==分析结果:
开始持续时间网格大小块大小Regs*SSMem*DSMem*大小吞吐量SrcMemType DstMemType设备上下文流名称
214.30ms 7.6800us(512 11)(256 11)8 0B 0B--GeForce GTX 970 1 7无效推力::cuda_cub::核心::_内核_代理(推力::设备_ptr,浮点)[109]
214.56ms 5.8550us(52 1 1)(256 1 1)29 44B 0B-----GeForce GTX 970 1 7无效推力::cuda_cub::cub::DeviceReduceKernel(int,float,推力::plus,推力::cuda_cub::GridEvenShare,推力::cuda_cub::cub::DeviceReducePolicy::policy 600)[128]
214.58ms 2.7200us(11)(256 11)27 44B 0B----GeForce GTX 970 1 7无效推力::cuda_cub::cub::DeviceReduceSingleTileKernel(int,float,推力::plus,推力::cuda_cub::cub::DeviceReducePolicy::Policy 600,float*)[136]
214.60ms 1.1840us----4B 3.2219MB/s设备可分页GeForce GTX 970 1 7[CUDA memcpy DtoH]
214.98ms 221.27us(512 1 1)(256 1 1 1)20 0B 0B--GeForce GTX 970 1 7无效推力::cuda_cub::核心::_内核_代理(推力::设备_ptr,CurandStateXorow)[151]
219.51ms 69.492ms(512 11)(256 11)108 0B 0B-----GeForce GTX 970 1 7无效推力::cuda_cub::内核::内核代理(推力::使用默认值,推力::使用默认值)[160]
289.00ms 9.5360us(52 1 1)(256 1 1)29 44B 0B-----GeForce GTX 970 1 7无效推力::cuda_cub::cub::DeviceReduceKernel(int,float,推力::plus,推力::cuda_cub::GridEvenShare,推力::cuda_cub::cub::DeviceReducePolicy::policy 600)[179
289.01ms 3.4880us(11)(256 11)27 44B 0B----GeForce GTX 970 1 7无效推力::cuda_cub::cub::DeviceDuceSingleTilekernel(int,float,推力::plus,推力::cuda_cub::cub::DeviceDucePolicy::Policy 600,float*)[187]
289.07ms 1.3120us----4B 2.9075MB/s设备可分页GeForce GTX 970 1 7[CUDA memcpy DtoH]
289.66ms 9.9520us(52 1 1)(256 1 1)29 44B 0B----GeForce GTX 970 1 7无效推力::cuda_cub::cub::DeviceReduceKernel(int,float,推力::plus,推力::cuda_cub::GridEvenShare,推力::cuda_cub::cub::DeviceReducePolicy::policy 600)[211
289.68ms 3.3280us(1 1 1)(256 1 1)27 44B 0B----GeForce GTX 970 1 7无效推力::cuda_cub::cub::DeviceDuceSingleTilekernel(int,float,推力::plus,推力::cuda_cub::cub::DeviceDucePolicy::Policy 600,float*)[219]
289.69ms 1.3120us----4B 2.9075MB/s设备可分页GeForce GTX 970 1 7[CUDA memcpy DtoH]

在这里可以看到,组成一次 reduce 操作的三个调用每次累计耗时 8–13 微秒,而
thrust::for_each_n 需要 69 毫秒才能完成。

现在我知道发生了什么,谢谢
$ nvprof --print-gpu-trace ./slow_thrust
N 262144
==7870== NVPROF is profiling process 7870, command: ./slow_thrust
Time elapsed: 0.000521s
Time elapsed: 0.06983s
Time elapsed: 0.000538s
==7870== Profiling application: ./slow_thrust
==7870== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput  SrcMemType  DstMemType           Device   Context    Stream  Name
214.30ms  7.6800us            (512 1 1)       (256 1 1)         8        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<float>, float>, unsigned long>(thrust::device_ptr<float>, float) [109]
214.56ms  5.8550us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [128]
214.58ms  2.7200us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [136]
214.60ms  1.1840us                    -               -         -         -         -        4B  3.2219MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
214.98ms  221.27us            (512 1 1)       (256 1 1)        20        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>, thrust::cuda_cub::__uninitialized_fill::functor<thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW>, unsigned long>(thrust::device_ptr<curandStateXORWOW>, curandStateXORWOW) [151]
219.51ms  69.492ms            (512 1 1)       (256 1 1)       108        0B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>, thrust::cuda_cub::for_each_f<thrust::zip_iterator<thrust::tuple<thrust::counting_iterator<int, thrust::use_default, thrust::use_default, thrust::use_default>, thrust::detail::normal_iterator<thrust::device_ptr<curandStateXORWOW>>, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>>, thrust::detail::wrapped_function<curand_setup, void>>, int>(thrust::use_default, thrust::use_default) [160]
289.00ms  9.5360us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [179]
289.01ms  3.4880us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [187]
289.07ms  1.3120us                    -               -         -         -         -        4B  2.9075MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]
289.66ms  9.9520us             (52 1 1)       (256 1 1)        29       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, thrust::detail::normal_iterator<thrust::device_ptr<float>>, float*, int, thrust::plus<float>>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::GridEvenShare<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600) [211]
289.68ms  3.3280us              (1 1 1)       (256 1 1)        27       44B        0B         -           -           -           -  GeForce GTX 970         1         7  void thrust::cuda_cub::cub::DeviceReduceSingleTileKernel<thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*, thrust::detail::normal_iterator<thrust::pointer<float, thrust::cuda_cub::tag, thrust::use_default, thrust::use_default>>, int, thrust::plus<float>, float>(int, float, thrust::plus<float>, thrust::cuda_cub::cub::DeviceReducePolicy<float, int, thrust::plus<float>>::Policy600, float*) [219]
289.69ms  1.3120us                    -               -         -         -         -        4B  2.9075MB/s      Device    Pageable  GeForce GTX 970         1         7  [CUDA memcpy DtoH]