How to use CUB and Thrust in the same CUDA code


I am trying to introduce some CUB into my "old" Thrust code, so I started with a small example that compares thrust::reduce_by_key with cub::DeviceReduce::ReduceByKey, both applied to a thrust::device_vector.

The Thrust part of the code works fine, but the CUB part, which naively uses raw pointers obtained via thrust::raw_pointer_cast, crashes after the CUB call. I inserted a cudaDeviceSynchronize() to try to resolve this, but it did not help. The CUB part of the code was copied from the CUB web page.

On OSX, the runtime error is:

libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
On Linux, the runtime error is:

terminate called after throwing an instance of 'thrust::system::system_error'
  what():  an illegal memory access was encountered
The first few lines of the cuda-memcheck output are:

========= CUDA-MEMCHECK
========= Invalid __global__ write of size 4
=========     at 0x00127010 in /home/sdettrick/codes/MCthrust/tests/../cub-1.3.2/cub/device/dispatch/../../block_range/block_range_reduce_by_key.cuh:1017:void cub::ReduceByKeyRegionKernel<cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, unsigned int*, float*, float*, int*, cub::ReduceByKeyScanTileState<float, int, bool=1>, cub::Equality, CustomSum, int>(unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int, cub::DeviceReduceByKeyDispatch<unsigned int*, unsigned int*, float*, float*, int*, cub::Equality, CustomSum, int>::PtxReduceByKeyPolicy, unsigned int*, int, cub::GridQueue<int>)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x7fff7dbb3e88 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
Unfortunately I am not really sure what to do about this.

Any help would be greatly appreciated. I have tried the NVIDIA dev zone but have not received any responses. Below is the complete example code, which should compile with CUDA 6.5 and CUB 1.3.2:

#include <iostream>
#include <thrust/sort.h>
#include <thrust/gather.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/discard_iterator.h>

#include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>

//========================================
// for CUB:
struct CustomSum
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __host__ __device__ __forceinline__
    //__host__ __device__ __forceinline__
    T operator()(const T &a, const T &b) const {
        return b+a;
    }
};
//========================================

int main()
{
  const int Nkey=20;
  int Nseg=9;
  int ikey[Nkey] = {0, 0, 0, 6, 8, 0, 2, 4, 6, 8, 1, 3, 5, 7, 8, 1, 3, 5, 7, 8}; 

  thrust::device_vector<unsigned int> key(ikey,ikey+Nkey);
  thrust::device_vector<unsigned int> keysout(Nkey);

  // Let's reduce x, by key:

  float xval[Nkey];
  for (int i=0; i<Nkey; i++) xval[i]=ikey[i]+0.1f;

  thrust::device_vector<float> x(xval,xval+Nkey);

  // First, sort x by key:

  thrust::sort_by_key(key.begin(),key.end(),x.begin());

  //---------------------------------------------------------------------
  std::cout<<"=================================================================="<<std::endl
       <<" THRUST reduce_by_key:"<<std::endl
       <<"=================================================================="<<std::endl;

  thrust::device_vector<float> output(Nseg,0.0f);

  thrust::reduce_by_key(key.begin(),
            key.end(),
            x.begin(),
            keysout.begin(),
            output.begin());

  for (int i=0;i<Nkey;i++) std::cout << x[i] <<" ";  std::cout<<std::endl;
  for (int i=0;i<Nkey;i++) std::cout << key[i] <<" ";  std::cout<<std::endl;
  for (int i=0;i<Nseg;i++) std::cout << output[i] <<" ";  std::cout<<std::endl;

  float ototal=thrust::reduce(output.begin(),output.end());
  float xtotal=thrust::reduce(x.begin(),x.end());
  std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;

  //---------------------------------------------------------------------
  std::cout<<"=================================================================="<<std::endl
       <<" CUB ReduceByKey:"<<std::endl
       <<"=================================================================="<<std::endl;


  unsigned int *d_keys_in   =thrust::raw_pointer_cast(&key[0]);
  float        *d_values_in =thrust::raw_pointer_cast(&x[0]);  
  unsigned int *d_keys_out  =thrust::raw_pointer_cast(&keysout[0]);
  float        *d_values_out=thrust::raw_pointer_cast(&output[0]);
  int          *d_num_segments=&Nseg;
  CustomSum   reduction_op;

  std::cout << "CUB input" << std::endl;
  for (int i=0; i<Nkey; ++i) std::cout << key[i]  << " ";  std::cout<<std::endl;
  for (int i=0; i<Nkey; ++i) std::cout << x[i] << " ";  std::cout<< std::endl;
  for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " ";  std::cout<< std::endl;
  for (int i=0; i<Nseg; ++i) std::cout << output[i] << " ";  std::cout<< std::endl;

  // Determine temporary device storage requirements
  void     *d_temp_storage = NULL;
  size_t   temp_storage_bytes = 0;
  cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);

  // Allocate temporary storage
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  std::cout << "temp_storage_bytes = " << temp_storage_bytes << std::endl;

  // Run reduce-by-key
  cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, d_values_in, d_values_out, d_num_segments, reduction_op, Nkey);
  cudaDeviceSynchronize();

  std::cout << "CUB output" << std::endl;

  std::cout<<Nkey<<" "<<Nseg<<std::endl;
  std::cout<<key.size() << " "<<x.size() << " "<<keysout.size() << " "<<output.size() << std::endl;

  // At this point onward it dies:
  //libc++abi.dylib: terminate called throwing an exception
  //Abort trap: 6  

  // If the next line is uncommented, it crashes the Mac!
  for (int i=0; i<Nkey; ++i) std::cout << key[i]  << " ";  std::cout<<std::endl;
  // for (int i=0; i<Nkey; ++i) std::cout << x[i] << " ";  std::cout<< std::endl;
  // for (int i=0; i<Nkey; ++i) std::cout << keysout[i] << " ";  std::cout<< std::endl;
  // for (int i=0; i<Nseg; ++i) std::cout << output[i] << " ";  std::cout<< std::endl;
  cudaFree(d_temp_storage);

  ototal=thrust::reduce(output.begin(),output.end());
  xtotal=thrust::reduce(x.begin(),x.end());
  std::cout << "total="<< ototal <<", should be "<<xtotal<<std::endl;
  return 1;
}

This is not going to work:

 int          *d_num_segments=&Nseg;
You cannot take the address of a host variable and use it as a device pointer.

Instead, do this:

int *d_num_segments;
cudaMalloc(&d_num_segments, sizeof(int));
This allocates space on the device for the data in question (a single integer that CUB will write to), and assigns the address of that allocation to your d_num_segments variable, which then becomes a valid device pointer.


In (ordinary, non-UM) CUDA, it is illegal to dereference a host address in device code, or a device address in host code.
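
For illustration, here is a minimal sketch (not part of the original answer) of how the corrected allocation could slot into the question's code; the surrounding names (d_keys_in, d_keys_out, d_values_in, d_values_out, reduction_op, Nkey) are the ones already defined in the example above:

int *d_num_segments = NULL;
cudaMalloc(&d_num_segments, sizeof(int));   // device-side int that CUB writes the segment count into

// Same two-phase call pattern as before: first query the temp storage size, then run.
void   *d_temp_storage     = NULL;
size_t  temp_storage_bytes = 0;
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                               d_keys_in, d_keys_out, d_values_in, d_values_out,
                               d_num_segments, reduction_op, Nkey);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
                               d_keys_in, d_keys_out, d_values_in, d_values_out,
                               d_num_segments, reduction_op, Nkey);
cudaDeviceSynchronize();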

Thanks! At the risk of stating the obvious, one also needs to add

cudaMemcpy(d_num_segments, &Nseg, sizeof(int), cudaMemcpyHostToDevice);

to achieve what I was trying to do. I do not know why, though.

CUB writes to that location/value. It is not an input to CUB, it is an output from CUB: whatever you write there will be overwritten by CUB as it computes the number of segments it found. Review the documentation (NumSegmentsIterator).

Hmm, I thought it was failing without that statement, but now that I have removed it, it works after all.
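
Since CUB writes the segment count to that device location, a small sketch of how it could be read back on the host after the second ReduceByKey call (the host variable h_num_segments is introduced here purely for illustration):

int h_num_segments = 0;
cudaMemcpy(&h_num_segments, d_num_segments, sizeof(int), cudaMemcpyDeviceToHost);
std::cout << "CUB found " << h_num_segments << " segments" << std::endl;

cudaFree(d_num_segments);

Alternatively, allocating the count as a one-element thrust::device_vector<int> and passing thrust::raw_pointer_cast of its first element to CUB would keep the memory management consistent with the rest of the Thrust-based example.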