CUDA和libc中对象的共享内存++；abi.dylib错误_Cuda_Shared Memory_Thrust

CUDA和libc中对象的共享内存++；abi.dylib错误

cuda

CUDA和libc中对象的共享内存++；abi.dylib错误,cuda,shared-memory,thrust,Cuda,Shared Memory,Thrust,我有以下问题（请记住，我对使用CUDA编程相当陌生）我有一个名为vec3f的类，它与float3数据类型类似，但具有重载运算符和其他向量函数。这些函数的前缀是_uuu设备_uuu主机uuu（我添加了空格，因为它将这些单词加粗）。然后，在我的内核中，我在block_x和block_y上做了一个嵌套for循环，并做了如下操作 //set up shared memory block extern __shared__ vec3f share[]; vec3f *sh_pos = share; ve

我有以下问题（请记住，我对使用CUDA编程相当陌生）

我有一个名为vec3f的类，它与float3数据类型类似，但具有重载运算符和其他向量函数。这些函数的前缀是_uuu设备_uuu主机uuu（我添加了空格，因为它将这些单词加粗）。然后，在我的内核中，我在block_x和block_y上做了一个嵌套for循环，并做了如下操作

//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();

在上面的代码中，oldParticles是一个指向一个名为particles的类的指针，该类被传递到内核。OldParticles实际上是一个推力：：设备_向量的底层指针（我不确定这是否与此有关）。一切都可以编译，但当我运行时，我得到了错误

libc++abi.dylib: terminate called throwing an exception
Abort trap: 6

谢谢你的回复。我认为这个错误与我没有为传递给内核的参数分配空间有关。在主机代码中执行以下操作修复了此错误

particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );

内核调用是

force_kernel<<< grid,block,sharedMemSize  >>>(particle_ptrs[0],particle_ptrs[1],time_step);

可能是什么问题？谢谢。

您发布的代码存在各种问题

您的
```
块
```
和
```
网格
```
变量在内核调用中被反转<代码>网格排在第一位
您应该在内核和运行时API调用上进行操作

在从空设备向量原始转换的指针上使用

cudamaloc

分配存储的方法是不明智的。向量容器不知道您是在“幕后”这样做的。相反，您可以在实例化设备向量时直接为其分配存储，如：

thrust::device_vector<particle> old_parts(max_particles), new_parts(max_particles);

libc++异常正在发生。frpm。您的主机代码，而不是gpu上的代码。您可能在错误的位置查找问题。您是否使用

gdb

跟踪错误的来源？您应该添加解决方案作为此问题的答案。然后，您可以接受该答案，将其标记为已回答，以供所有人查看。是的，您可以修改内核中推力设备向量中包含的数据，将这些数据项复制/保存到已分配到适当大小的另一个推力设备向量，然后将结果复制回主机。我不清楚你是否还有问题，或者你的问题现在是否已经解决了。@RobertCrovella我想我不知道你是怎么做到的。很明显（当我试图像上面的例子那样做的时候）有点不对劲。谢谢你非常有用的回答。我真的很感激。我想知道您是否可以评论一下为什么我不能全局声明设备和主机向量（作为全局变量）。更具体地说，为什么我不能放置

推力：：设备向量新的部分（最大粒子），旧的部分（最大粒子）

以及

particles*particle\u ptrs[2]

在main（）之外，然后在main（）中，只需执行

old\u parts=h\u parts

以及相同的指针铸件。当我这样做的时候，它会出错。我不是推力在引擎盖下的专家。当您将

device\u vector

放置在main之外时，会出现各种问题，其中一些问题已经讨论过。实际上，您已经涵盖了这些，所以您只需要在main之外声明设备向量（没有大小），然后在main的开头

.resize（max\u particles）

它们，它应该可以工作，其他一切都与我发布的代码相同。或者发布一个新问题，我无法在评论中涵盖。

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;

using namespace std;
//particle class
class particle {
public:
  particle() : 
    _velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
  };
  particle(const vec3f& pos, const vec3f& vel) :
    _position(pos), _velocity(vel), _density(0.0) {
  };

  vec3f _velocity;
  vec3f _position;
  float _density;
};

//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);

//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];

int main() {
  //load host vector
  for (int i =0; i<max_particles; i++) {
    h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
  }

  particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
  particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
  cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
  cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
  //copy host particles to old device particles...
  old_parts = h_parts;
  //kernel block and grid dimensions
  dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
  dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
  kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
  //copy new device particles back to host particles
  h_parts = new_parts;
  for (int i =0; i<max_particles; i++) {
    particle temp1 = h_parts[i];
    cout << temp1._position << endl;
  }  
  //delete thrust device vectors
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();
  return 0;
}

//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  //get array position for 2d grid...
  unsigned int arr_pos = y*blockDim.x*gridDim.x + x;

  new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
  new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
  new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}

  //copy new device particles back to host particles
  h_parts = new_parts;

thrust::device_vector<particle> old_parts(max_particles), new_parts(max_particles);

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <vector_functions.h>

const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;

using namespace std;
//particle class
class particle {
public:
  particle() :
    _velocity(make_float3(0,0,0)), _position(make_float3(0,0,0)), _density(0.0)
 {
  };
  particle(const float3& pos, const float3& vel) :
    _position(pos), _velocity(vel), _density(0.0)
 {
  };

  float3 _velocity;
  float3 _position;
  float _density;
};

//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);


int main() {
  //global thrust vectors
  thrust::host_vector<particle> h_parts;
  particle* particle_ptrs[2];
  //load host vector
  for (int i =0; i<max_particles; i++) {
    h_parts.push_back(particle(make_float3(0.5,0.5,0.5),make_float3(10,10,10)));
  }

  //copy host particles to old device particles...
  thrust::device_vector<particle> old_parts = h_parts;
  thrust::device_vector<particle> new_parts(max_particles);
  particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
  particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
  //kernel block and grid dimensions
  dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
  dim3 grid((int)ceil(sqrt(float(max_particles)) / (float(block.x))), (int)ceil(sqrt(float(max_particles)) / (float(block.y))), 1);
  cout << "grid x: " << grid.x << "  grid y: "  << grid.y << endl;
  kernel_func<<<grid,block>>>(particle_ptrs[0],particle_ptrs[1],dt);
  //copy new device particles back to host particles
  cudaDeviceSynchronize();
  h_parts = new_parts;
  for (int i =0; i<max_particles; i++) {
    particle temp1 = h_parts[i];
    cout << temp1._position.x << "," << temp1._position.y << "," << temp1._position.z << endl;
  }
  //delete thrust device vectors
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();

  return 0;
}

//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  //get array position for 2d grid...
  unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
  if (arr_pos < max_particles) {

    new_parts[arr_pos]._velocity.x = old_parts[arr_pos]._velocity.x * 10.0 * dt;
    new_parts[arr_pos]._velocity.y = old_parts[arr_pos]._velocity.y * 10.0 * dt;
    new_parts[arr_pos]._velocity.z = old_parts[arr_pos]._velocity.z * 10.0 * dt;
    new_parts[arr_pos]._position.x = old_parts[arr_pos]._position.x * 10.0 * dt;
    new_parts[arr_pos]._position.y = old_parts[arr_pos]._position.y * 10.0 * dt;
    new_parts[arr_pos]._position.z = old_parts[arr_pos]._position.z * 10.0 * dt;
    new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
  }
}