CUDA：是否可以在映射数组上使用 thrust::device_ptr？_Cuda_Thrust

CUDA：是否可以在映射数组上使用 thrust::device_ptr？

cuda

CUDA：是否可以在映射数组上使用 thrust::device_ptr？,cuda,thrust,Cuda,Thrust,我试图在映射内存上使用 thrust::copy_if 函数。但是，由于遇到了运行时错误且无法定位原因，在花费大量时间调试之前，我想先确认一点：将指向映射内存位置的指针传递给 thrust::device_ptr 包装器是否有效。以下是我的意思的一个例子： int size=1024; int* v_locked; int* v_device; int* stencil_device; device_ptr<int> v_wrapper; device_ptr<int> v_

我试图在映射内存上使用 thrust::copy_if 函数。但是，由于遇到了运行时错误且无法定位原因，在花费大量时间调试之前，我想先确认一点：将指向映射内存位置的指针传递给 thrust::device_ptr 包装器是否有效。

以下是我的意思的一个例子：

// Question snippet: wraps a mapped (zero-copy) host allocation in a
// thrust::device_ptr and uses it as the output range of copy_if.
// NOTE(review): this is the asker's original code, kept as posted; the
// problems flagged below are the ones the answer in this document discusses.
int size=1024;

int* v_locked;        // host pointer, set by cudaHostAlloc below
int* v_device;        // intended to receive the device-side alias of v_locked
int* stencil_device;  // device buffer from cudaMalloc, used as the copy_if stencil

device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;

// NOTE(review): the trailing "))" leaves an unmatched ')' — looks like a typo
// in the post. No return code is checked on any of the runtime calls here.
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped));
// NOTE(review): the second argument is &v_locked (the address of the host
// pointer variable). The answer below passes v_locked itself and reports that
// the call as written here raises a CUDA runtime error.
cudaHostGetDevicePointer(&v_device, &v_locked, 0);

cudaMalloc((void**)&stencil_device, size*sizeof(int));
/* 
kernel assigning stencil_device elements ...
*/

// Wrap the raw pointers so they can be passed to Thrust algorithms.
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);

// Writes into v_wrapper the counting values in [0, size) whose matching
// stencil element equals 1; v_wrapper_end receives copy_if's return value.
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);

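int size=1024;
int* v_locked;
int* v_device;
int* stencil_device;
device_ptr<int> v_wrapper;
device_ptr<int> v_wrapper_end;
device_ptr<int> stencil_wrapper;
cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped));
cudaHostGetDevicePointer(&v_device, &v_locked, 0);
cudaMalloc((void**)&stencil_device, size*sizeof(int));
/* 
kernel assigning stencil_device elements ...
*/
v_wrapper = device_pointer_cast(v_device);
stencil_wrapper = device_pointer_cast(stencil_device);
v_wrapper_end = copy_if(make_counting_iterator<int>(0), make_counting_iterator<int>(size), stencil_wrapper, v_wrapper, _1 == 1);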

这是在 Thrust 库中使用映射内存的正确方式吗？

谢谢。

是的，这是可能的

我相信你的代码有几个问题

您似乎没有进行任何 CUDA 错误检查；如果进行了检查，您就会发现，尽管对 cudaHostGetDevicePointer 的调用看似能够正确编译，但其参数设置并不正确。

如上所述，对 cudaHostGetDevicePointer() 的调用设置不正确。第二个指针参数应以单指针（*）传递，而不是双指针（**）。请参阅相关文档。按您所写的方式，此调用会抛出一个可以捕获的 CUDA 运行时错误。

在调用 cudaHostAlloc 之前，应先调用 cudaSetDeviceFlags(cudaDeviceMapHost);
下面是一个示例代码，对我来说似乎工作正常，并且已修复了上述问题：
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>   

// cudaCheckErrors(msg): reads (and clears) the last CUDA runtime error via
// cudaGetLastError(). If it is not cudaSuccess, prints `msg`, the CUDA error
// string and the file/line of the macro use to stderr, then terminates the
// process with exit code 1. Place it directly after the call to be checked.
//
// The local is named without a double underscore because such identifiers are
// reserved for the implementation; `msg` is parenthesized so any expression
// can be passed safely.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/**
 * Unary predicate: returns true exactly when its argument compares equal to 1.
 * Callable from both host and device code; used as the stencil predicate for
 * copy_if in this example.
 *
 * operator() is const because the functor is stateless, which lets it be
 * invoked through const objects and const references.
 */
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x) const
    {
        return (x==1);
    }
};

/**
 * Demonstrates running thrust::copy_if on mapped (zero-copy) pinned host
 * memory that has been wrapped in thrust::device_ptr.
 *
 * Steps: enable host-memory mapping, allocate two mapped buffers (values and
 * stencil), fetch their device-side pointers, fill them on the host, run
 * copy_if on the device into a device_vector, copy the selected values back
 * to the host and print up to the first 10 of them.
 *
 * Returns 0 on success; any CUDA runtime error aborts via cudaCheckErrors.
 */
int main(){

  const int size = 1024;
  const int max_print = 10;   // upper bound on how many results are printed

  int* v_locked = 0;          // host view of the mapped value buffer
  int* v_device = 0;          // device view of the same buffer
  int* stencil_locked = 0;    // host view of the mapped stencil buffer
  int* stencil_device = 0;    // device view of the same buffer

  // Mapping must be enabled before the mapped allocations are made.
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaCheckErrors("cudaSetDeviceFlags");
  cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 1");
  // The second argument is the host pointer itself, not its address.
  cudaHostGetDevicePointer(&v_device, v_locked, 0);
  cudaCheckErrors("cudaHostGetDevicePointer 1");
  cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 2");
  cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
  cudaCheckErrors("cudaHostGetDevicePointer 2");

  // Fill through the host pointers: values 0..size-1, stencil 0,1,0,1,...
  for (int i = 0; i < size; i++){
    v_locked[i] = i;
    stencil_locked[i] = i%2;
  }

  thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
  thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
  thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;

  // Keep the values whose stencil entry equals 1.
  thrust::device_vector<int> result(size);
  thrust::device_vector<int>::iterator result_end =
      thrust::copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
  const int result_size = static_cast<int>(result_end - result.begin());

  thrust::host_vector<int> h_result(result_size);
  thrust::copy_n(result.begin(), result_size, h_result.begin());

  // Never print more elements than were actually selected.
  const int print_count = (result_size < max_print) ? result_size : max_print;
  thrust::copy_n(h_result.begin(), print_count, std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;

  // Release the pinned mapped allocations; all device work on them is done.
  cudaFreeHost(v_locked);
  cudaCheckErrors("cudaFreeHost 1");
  cudaFreeHost(stencil_locked);
  cudaCheckErrors("cudaFreeHost 2");
  return 0;

}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$

$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x)
    {
        return (x==1);
    }
};
int main(){
  int size=1024;
  int* v_locked;
  int* v_device;
  int* stencil_locked;
  int* stencil_device;
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaCheckErrors("cudaSetDeviceFlags");
  cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 1");
  cudaHostGetDevicePointer(&v_device, v_locked, 0);
  cudaCheckErrors("cudaHostGetDevicePointer 1");
  cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 2");
  cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
  cudaCheckErrors("cudaHostGetDevicePointer 2");
  for (int i = 0; i
我相信你的代码有几个问题
您似乎没有进行任何 CUDA 错误检查；如果进行了检查，您就会发现，尽管对 cudaHostGetDevicePointer 的调用看似能够正确编译，但其参数设置并不正确。
如上所述，对 cudaHostGetDevicePointer() 的调用未正确设置。第二个指针参数应以单指针（*）传递，而不是双指针（**）。请参阅相关文档。按您所写的方式，此调用将引发一个可捕获的 CUDA 运行时错误。
在调用 cudaHostAlloc 之前，应先调用 cudaSetDeviceFlags(cudaDeviceMapHost);
下面是一个示例代码，对我来说似乎工作正常，并且已修复了上述问题：
$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>   

// cudaCheckErrors(msg): reads (and clears) the last CUDA runtime error via
// cudaGetLastError(). If it is not cudaSuccess, prints `msg`, the CUDA error
// string and the file/line of the macro use to stderr, then terminates the
// process with exit code 1. Place it directly after the call to be checked.
//
// The local is named without a double underscore because such identifiers are
// reserved for the implementation; `msg` is parenthesized so any expression
// can be passed safely.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

/**
 * Unary predicate: returns true exactly when its argument compares equal to 1.
 * Callable from both host and device code; used as the stencil predicate for
 * copy_if in this example.
 *
 * operator() is const because the functor is stateless, which lets it be
 * invoked through const objects and const references.
 */
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x) const
    {
        return (x==1);
    }
};

/**
 * Demonstrates running thrust::copy_if on mapped (zero-copy) pinned host
 * memory that has been wrapped in thrust::device_ptr.
 *
 * Steps: enable host-memory mapping, allocate two mapped buffers (values and
 * stencil), fetch their device-side pointers, fill them on the host, run
 * copy_if on the device into a device_vector, copy the selected values back
 * to the host and print up to the first 10 of them.
 *
 * Returns 0 on success; any CUDA runtime error aborts via cudaCheckErrors.
 */
int main(){

  const int size = 1024;
  const int max_print = 10;   // upper bound on how many results are printed

  int* v_locked = 0;          // host view of the mapped value buffer
  int* v_device = 0;          // device view of the same buffer
  int* stencil_locked = 0;    // host view of the mapped stencil buffer
  int* stencil_device = 0;    // device view of the same buffer

  // Mapping must be enabled before the mapped allocations are made.
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaCheckErrors("cudaSetDeviceFlags");
  cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 1");
  // The second argument is the host pointer itself, not its address.
  cudaHostGetDevicePointer(&v_device, v_locked, 0);
  cudaCheckErrors("cudaHostGetDevicePointer 1");
  cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 2");
  cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
  cudaCheckErrors("cudaHostGetDevicePointer 2");

  // Fill through the host pointers: values 0..size-1, stencil 0,1,0,1,...
  for (int i = 0; i < size; i++){
    v_locked[i] = i;
    stencil_locked[i] = i%2;
  }

  thrust::device_ptr<int> v_wrapper = thrust::device_pointer_cast(v_device);
  thrust::device_ptr<int> stencil_wrapper = thrust::device_pointer_cast(stencil_device);
  thrust::device_ptr<int> v_wrapper_end = v_wrapper + size;

  // Keep the values whose stencil entry equals 1.
  thrust::device_vector<int> result(size);
  thrust::device_vector<int>::iterator result_end =
      thrust::copy_if(v_wrapper, v_wrapper_end, stencil_wrapper, result.begin(), is_one<int>());
  const int result_size = static_cast<int>(result_end - result.begin());

  thrust::host_vector<int> h_result(result_size);
  thrust::copy_n(result.begin(), result_size, h_result.begin());

  // Never print more elements than were actually selected.
  const int print_count = (result_size < max_print) ? result_size : max_print;
  thrust::copy_n(h_result.begin(), print_count, std::ostream_iterator<int>(std::cout, " "));
  std::cout << std::endl;

  // Release the pinned mapped allocations; all device work on them is done.
  cudaFreeHost(v_locked);
  cudaCheckErrors("cudaFreeHost 1");
  cudaFreeHost(stencil_locked);
  cudaCheckErrors("cudaFreeHost 2");
  return 0;

}
$ nvcc -arch=sm_20 -o t281 t281.cu
$ ./t281
1 3 5 7 9 11 13 15 17 19
$

$ cat t281.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
template<typename T>
struct is_one : thrust::unary_function<T, bool>
{
    __host__ __device__
    bool operator()(const T &x)
    {
        return (x==1);
    }
};
int main(){
  int size=1024;
  int* v_locked;
  int* v_device;
  int* stencil_locked;
  int* stencil_device;
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaCheckErrors("cudaSetDeviceFlags");
  cudaHostAlloc((void**)&v_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 1");
  cudaHostGetDevicePointer(&v_device, v_locked, 0);
  cudaCheckErrors("cudaHostGetDevicePointer 1");
  cudaHostAlloc((void**)&stencil_locked, size*sizeof(int), cudaHostAllocMapped);
  cudaCheckErrors("cudaHostAlloc 2");
  cudaHostGetDevicePointer(&stencil_device, stencil_locked, 0);
CUDACHECKERS（“cud