如何使用CUDA C快速压缩稀疏阵列？总结_Cuda_Gpgpu_Sparse Array

如何使用CUDA C快速压缩稀疏阵列？总结

cuda

如何使用CUDA C快速压缩稀疏阵列？总结,cuda,gpgpu,sparse-array,Cuda,Gpgpu,Sparse Array,阵列[A-B----C]在设备内存中，但需要[A-B-C]-使用CUDA C最快的方法是什么上下文我在设备（GPU）内存中有一个整数数组。在每次迭代中，我随机选择几个大于0的元素并从中减去1。我维护那些等于0的元素的排序查找数组L： Array A: @ iteration i: [0 1 0 3 3 2 0 1 2 3] @ iteration i + 1: [0 0 0 3 2 2 0 1 2 3] Lookup for 0-elements L: @

阵列

[A-B----C]

在设备内存中，但需要

[A-B-C]

-使用CUDA C最快的方法是什么

上下文：我在设备（GPU）内存中有一个整数数组 A。在每次迭代中，我随机选择几个大于0的元素并从中减去1。我维护一个排序的查找数组 L，记录那些等于0的元素的下标：

Array A:
       @ iteration i: [0 1 0 3 3 2 0 1 2 3]
   @ iteration i + 1: [0 0 0 3 2 2 0 1 2 3]

Lookup for 0-elements L:
       @ iteration i: [0 - 2 - - - 6 - - -]  ->  want compacted form: [0 2 6]
   @ iteration i + 1: [0 1 2 - - - 6 - - -]  ->  want compacted form: [0 1 2 6]

（在这里，我随机选择元素 A[1] 和 A[4] 来减去1。在我的CUDA C实现中，每个线程映射到 A 中的一个元素上，因此查找数组是稀疏的，以防止数据争用并保持排序顺序（例如 [0 1 2 6] 而不是 [0 2 6 1]）。）

稍后，我将只对那些等于0的元素执行一些操作。因此，我需要压缩稀疏查找数组 L，以便将线程映射到0元素。

因此，使用CUDA C压缩设备内存上的稀疏阵列最有效的方法是什么？

非常感谢。

假设我有：

int V[] = {1, 2, 0, 0, 5};

我期望的结果是：

int R[] = {1, 2, 5}

实际上，我们正在删除零元素，或者仅在非零的情况下复制元素

#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <stdio.h>
#define SIZE 5

// Error-check helper: fetches (and clears) the last CUDA runtime error via
// cudaGetLastError(). If it is not cudaSuccess, prints `msg`, the CUDA error
// string and the file/line of the macro invocation to stderr, then exits
// the process with status 1. Wrapped in do/while(0) so it acts as a single
// statement.
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t status_ = cudaGetLastError(); \
        if (cudaSuccess != status_) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, \
                cudaGetErrorString(status_), \
                __FILE__, \
                __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

  // Unary predicate used with thrust::copy_if to keep only non-zero values.
  struct is_not_zero
  {
    // Returns true when x is different from zero.
    // Const-qualified so the functor can be invoked through a const object
    // or const reference; it holds no state.
    __host__ __device__
    bool operator()(const int x) const
    {
      return (x != 0);
    }
  };



// Demo: compacts the non-zero elements of a small host array on the GPU
// using thrust::copy_if on raw device pointers, then prints the result.
// Returns 0 on success; any CUDA runtime error aborts via cudaCheckErrors.
int main(){

  int V[] = {1, 2, 0, 0, 5};
  int R[] = {0, 0, 0, 0, 0};
  int *d_V = NULL, *d_R = NULL;

  cudaMalloc((void **)&d_V, SIZE*sizeof(int));
  cudaCheckErrors("cudaMalloc1 fail");
  cudaMalloc((void **)&d_R, SIZE*sizeof(int));
  cudaCheckErrors("cudaMalloc2 fail");

  cudaMemcpy(d_V, V, SIZE*sizeof(int), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy1 fail");

  // Wrap the raw device pointers so Thrust dispatches to the device backend.
  thrust::device_ptr<int> dp_V(d_V);
  thrust::device_ptr<int> dp_R(d_R);

  // copy_if returns an iterator one past the last element written; use it
  // to get the compacted length instead of hard-coding it.
  thrust::device_ptr<int> dp_R_end =
      thrust::copy_if(dp_V, dp_V + SIZE, dp_R, is_not_zero());
  const int count = (int)(dp_R_end - dp_R);

  // Only the first `count` entries of d_R hold valid output.
  cudaMemcpy(R, d_R, count*sizeof(int), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy2 fail");

  for (int i = 0; i < count; i++)
    printf("R[%d]: %d\n", i, R[i]);

  // Release the device buffers.
  cudaFree(d_V);
  cudaCheckErrors("cudaFree1 fail");
  cudaFree(d_R);
  cudaCheckErrors("cudaFree2 fail");

  return 0;
}

#包括
#包括
#包括
#定义尺寸5
#定义cudaCheckErrors（msg）\
做{\
cudaError\u t\u err=cudaGetLastError（）\
如果（_err！=cudaSuccess）{\
fprintf（标准，“致命错误：%s（%s位于%s:%d）\n”\
msg，cudaGetErrorString（_err）\
__文件（行）\
fprintf（stderr，“***失败-中止\n”）\
出口（1）\
} \
}而（0）
结构不是零
{
__主机设备__
布尔运算符（）（常量int x）
{
返回（x！=0）；
}
};
int main（）{
int V[]={1,2,0,0,5}；
int R[]={0,0,0,0,0}；
int*d_V，*d_R；
Cudamaloc（（空心**）和d_V，尺寸*尺寸（内部））；
CUDACHECKERRS（“cudaMalloc1失败”）；
Cudamaloc（（空心**）和d_R，尺寸*尺寸（内部））；
CUDACHECKERRS（“cudaMalloc2失败”）；
cudaMemcpy（d_V，V，SIZE*sizeof（int），cudamemcpyhostodevice）；
cudaCheckErrors（“cudaMemcpy1失败”）；
推力：装置推力dp_V（d_V）；
推力：装置压头压头（压头）；
推力：如果（dp_V，dp_V+大小，dp_R，不是零（））复制_；
cudaMemcpy（R，d_R，SIZE*sizeof（int），cudaMemcpyDeviceToHost）；
cudaCheckErrors（“cudaMemcpy2失败”）；
对于It i＝0；谢谢。-推力是否与标准的CUDA安装一起出现？因为我不是系统管理员，如果库是可用的，我如何检查UNIX机器？谢谢。是的，假设最近版本的CUDA。如果你有一个目录，比如“代码> /Ur/Posi/CUDA/包含/推力< /代码>，那么你有THR。推力是完全模板化的/包含代码，所以没有普通的库需要担心。谢谢RobertCrovella。但是我看不到C用户的任何示例用法。只有C++，我不熟悉。例如，你怎么称呼<代码>推力：：

在CUDA C中？cuSPARSE库提供了可作用于设备内存中数组的

cusparseSdense2csr()

来将矩阵从密集格式转换为稀疏格式。它应该非常有效，但效率可能低于

thrust::copy_if

谢谢。我尝试了您的解决方案，但出现编译错误：

[…]/cuda/4.0.17/cuda/bin//包括/推力/细节/设备/cuda/副本‌f、 inl（71）：错误：重载函数“min”的多个实例与参数列表匹配：函数“min（int，int）”函数“min（unsigned int，unsigned int）”[…]参数类型是：（long，const long）在“void-推力：：细节：：设备：：cuda：：减少间隔”实例化期间检测到的（输入迭代器、索引类型、索引类型、输出迭代器、二进制函数）[…]

其中，

[…]

表示被截断的内容。你是完全按照我发布的代码编译的吗？还是做了任何更改或添加？看起来你在使用CUDA 4.0。我已经在CUDA 4.2和CUDA 5.0上测试过，但没有在4.0上测试。谢谢Robert，我是完全按照你发布的代码编译的。你知道CUDA 4.0为什么会报错吗？CUDA 4很旧了，到现在已经超过2年了。请尝试将-m32添加到nvcc编译命令行。谢谢。现在我得到了错误：

在/usr/include/features.h:371，from[…]/cuda/4.0.17/cuda/bin/。/include/host_config.h:114，from[…]/cuda/4.0.17/cuda/bin/./include/cuda_runtime.h:59，from:0:/usr/include/gnu/stubs.h:7:27:error:gnu/stubs-32.h:没有这样的文件或目录

。感谢您的耐心等待。

#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <stdio.h>
#define SIZE 5

  // Unary predicate used with thrust::copy_if to keep only non-zero values.
  struct is_not_zero
  {
    // Returns true when x is different from zero.
    // Const-qualified so the functor can be invoked through a const object
    // or const reference; it holds no state.
    __host__ __device__
    bool operator()(const int x) const
    {
      return (x != 0);
    }
  };



// Demo: compacts the non-zero elements of a small host array on the GPU
// using thrust::copy_if on Thrust containers, then prints the result.
// Thrust vectors own their storage, so no explicit allocation or free is
// needed here.
int main(){

  int V[] = {1, 2, 0, 0, 5};
  int R[] = {0, 0, 0, 0, 0};

  thrust::host_vector<int> h_V(V, V+SIZE);
  thrust::device_vector<int> d_V = h_V;
  thrust::device_vector<int> d_R(SIZE, 0);

  // copy_if returns an iterator one past the last element written; use it
  // to get the compacted length instead of hard-coding it.
  thrust::device_vector<int>::iterator d_R_end =
      thrust::copy_if(d_V.begin(), d_V.end(), d_R.begin(), is_not_zero());
  const int count = (int)(d_R_end - d_R.begin());

  // Bring back only the valid (compacted) prefix of the output.
  thrust::host_vector<int> h_R(d_R.begin(), d_R_end);
  thrust::copy(h_R.begin(), h_R.end(), R);

  for (int i = 0; i < count; i++)
    printf("R[%d]: %d\n", i, R[i]);

  return 0;
}