异步工作组拷贝(async_work_group_copy)比 OpenCL 内核中的手动拷贝慢

异步工作组拷贝比opencl内核中的手动拷贝慢,opencl,Opencl,在我的opencl内核中,我试图将比其他部分更容易访问的部分数据复制到本地内存中。要做到这一点,一开始我只是手动分配这些值。复印件如下: __global LinearBVHInteriorNode* subtree_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes[subtree_interior_idx]); __global LinearBVHInteriorNo

在我的 OpenCL 内核中,我试图把访问频率高于其他部分的那部分数据复制到本地内存(local memory)中。为此,起初我只是手动逐个赋值。拷贝代码如下:

    // Cooperative copy of one BVH subtree from global memory into the local
    // `subtree` buffers. thread_local_idx / local_dim are presumably the
    // work-item's local id and the work-group size -- confirm against the
    // enclosing kernel, which is not part of this snippet.

    // Base pointers to the subtree's first interior node and its info record.
    // Every loop index below is RELATIVE to these base pointers.
    __global LinearBVHInteriorNode* subtree_globalptr           =  &(((__global BVHTree*)accelstruct)->interior_nodes[subtree_interior_idx]);
    __global LinearBVHInteriorNodeInfo* subtree_info_globalptr  =  &(((__global BVHTree*)accelstruct)->interior_nodes_info[subtree_interior_idx]);

    // last_interior is an index into the whole tree, hence the subtraction
    // of the subtree's first index to obtain the node count.
    int last_subtree_interior_idx  = subtree_info_globalptr->last_interior;

    int n_subtree_interior_nodes = last_subtree_interior_idx - subtree_interior_idx + 1;

    // Each work-item copies the half-open chunk [beg, end). The end is clamped
    // to the node COUNT: clamping to the absolute index last_subtree_interior_idx
    // would let the relative index run past the subtree whenever
    // subtree_interior_idx > 0.
    int interior_nodes_beg_idx = thread_local_idx * ((n_subtree_interior_nodes / local_dim) + 1);
    int interior_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_interior_nodes / local_dim) + 1), n_subtree_interior_nodes);

    // Only one work-item fills in the bookkeeping fields of the local subtree.
    if(thread_local_idx == 0){
      subtree.interior_base_idx = subtree_info_globalptr[1].parent;
      subtree.leaf_base_idx     = subtree_info_globalptr[0].leaf_lowidx;
      subtree.tree              = accelstruct;
    }

    // Copy the data for interior nodes from the global memory to local memory.
    // The bound is exclusive so neighbouring chunks never copy the same node
    // twice. The `pad` member is intentionally not copied.
    for(int i = interior_nodes_beg_idx; i < interior_nodes_end_idx; i++){

      subtree.interior_nodes[ i ].left_bound.min  = subtree_globalptr[ i ].left_bound.min;
      subtree.interior_nodes[ i ].left_bound.max  = subtree_globalptr[ i ].left_bound.max;
      subtree.interior_nodes[ i ].right_bound.min = subtree_globalptr[ i ].right_bound.min;
      subtree.interior_nodes[ i ].right_bound.max = subtree_globalptr[ i ].right_bound.max;
      subtree.interior_nodes[ i ].children[0]     = subtree_globalptr[ i ].children[0];
      subtree.interior_nodes[ i ].children[1]     = subtree_globalptr[ i ].children[1];
      subtree.interior_nodes[ i ].splitAxis       = subtree_globalptr[ i ].splitAxis;
    }

    // Leaf range of the subtree. The count is computed as high - low, i.e.
    // leaf_highidx is treated as exclusive here.
    // NOTE(review): confirm that leaf_highidx really is one-past-the-end.
    int leafnodes_lowidx                                = subtree_info_globalptr->leaf_lowidx;
    int leafnodes_highidx                               = subtree_info_globalptr->leaf_highidx;
    int n_subtree_leaf_nodes                            = leafnodes_highidx - leafnodes_lowidx;
    __global LinearBVHLeafNode* subtree_leaf_globalptr =  &(((__global BVHTree*)accelstruct)->leaf_nodes[leafnodes_lowidx]);

    // Same chunking as above; clamp to the leaf COUNT because the index is
    // relative to subtree_leaf_globalptr, not to the start of the whole tree.
    int leaf_nodes_beg_idx = thread_local_idx * ((n_subtree_leaf_nodes / local_dim) + 1);
    int leaf_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_leaf_nodes / local_dim) + 1), n_subtree_leaf_nodes);

    // Copy the data for leaf nodes from the global memory to local memory
    for(int i = leaf_nodes_beg_idx; i < leaf_nodes_end_idx; i++){

      subtree.leaf_nodes[ i ].lowIdx  = subtree_leaf_globalptr[ i ].lowIdx;
      subtree.leaf_nodes[ i ].highIdx = subtree_leaf_globalptr[ i ].highIdx;
    }

  // Wait for all work-items to finish copying before the local data is read.
  barrier(CLK_LOCAL_MEM_FENCE);
//! Interior node of the linearized BVH as stored in the node arrays.
typedef struct {
  AABB      left_bound;     //!< bounds associated with the left side (presumably the left child's box)
  AABB      right_bound;    //!< bounds associated with the right side (presumably the right child's box)
  int       children[2];    //!< two child links; the encoding is not visible in this snippet
  ushort    splitAxis;      //!< split axis of this node
  int       pad;            //!< presumably explicit padding; the manual copy loop does not copy it
} LinearBVHInteriorNode;

//! Per-interior-node bookkeeping stored in a separate array, parallel to the nodes.
typedef struct {
  int       parent;         //!< presumably the index of the parent interior node -- confirm
  int       leaf_lowidx;    //!< index of the first leaf node belonging to this subtree
  int       leaf_highidx;   //!< upper leaf index; the copy code computes the count as high - low
  int       last_interior;  //!< index (into the whole tree) of the subtree's last interior node
} LinearBVHInteriorNodeInfo;

//! Leaf node of the linearized BVH: an index range [lowIdx, highIdx].
//! NOTE(review): what the range indexes (presumably primitives) and whether
//! highIdx is inclusive cannot be determined from this snippet.
typedef struct {
  int  lowIdx;   //!< lower end of the range
  int  highIdx;  //!< upper end of the range
} LinearBVHLeafNode;

//! Per-leaf bookkeeping stored in a separate array, parallel to the leaf nodes.
typedef struct {
  int  parent;  //!< presumably the index of the interior node owning this leaf -- confirm
} LinearBVHLeafNodeInfo;

//! Whole BVH as laid out in global memory: four parallel arrays plus their sizes.
typedef struct {
  __global LinearBVHInteriorNode     *interior_nodes;       //!< a pointer to interior nodes
  __global LinearBVHInteriorNodeInfo *interior_nodes_info;  //!< a pointer to information of interior nodes
  __global LinearBVHLeafNode         *leaf_nodes;           //!< a pointer to leaf nodes
  __global LinearBVHLeafNodeInfo     *leaf_nodes_info;      //!< a pointer to information of leaf nodes
  uint n_interior_nodes;                                    //!< Number of interior nodes
  uint n_leaf_nodes;                                        //!< Number of leaf nodes
} BVHTree;


//! Work-group-local copy of one subtree of the global BVH.
typedef struct {
  __local LinearBVHInteriorNode *interior_nodes;  //!< interior nodes copied into local memory
  __local LinearBVHLeafNode     *leaf_nodes;      //!< leaf nodes copied into local memory

  int interior_base_idx;                          //!< set from the info record following the subtree root (its `parent`)
  int leaf_base_idx;                              //!< set from the subtree root's leaf_lowidx

  __global BVHTree *tree;                         //!< the global tree this subtree was copied from
} local_BVHTree;
但是它(即改用 `async_work_group_copy` 的版本)执行这次复制需要
1.0s
,这非常奇怪。我使用的结构体定义如下:

    // Cooperative copy of one BVH subtree from global memory into the local
    // `subtree` buffers. thread_local_idx / local_dim are presumably the
    // work-item's local id and the work-group size -- confirm against the
    // enclosing kernel, which is not part of this snippet.

    // Base pointers to the subtree's first interior node and its info record.
    // Every loop index below is RELATIVE to these base pointers.
    __global LinearBVHInteriorNode* subtree_globalptr           =  &(((__global BVHTree*)accelstruct)->interior_nodes[subtree_interior_idx]);
    __global LinearBVHInteriorNodeInfo* subtree_info_globalptr  =  &(((__global BVHTree*)accelstruct)->interior_nodes_info[subtree_interior_idx]);

    // last_interior is an index into the whole tree, hence the subtraction
    // of the subtree's first index to obtain the node count.
    int last_subtree_interior_idx  = subtree_info_globalptr->last_interior;

    int n_subtree_interior_nodes = last_subtree_interior_idx - subtree_interior_idx + 1;

    // Each work-item copies the half-open chunk [beg, end). The end is clamped
    // to the node COUNT: clamping to the absolute index last_subtree_interior_idx
    // would let the relative index run past the subtree whenever
    // subtree_interior_idx > 0.
    int interior_nodes_beg_idx = thread_local_idx * ((n_subtree_interior_nodes / local_dim) + 1);
    int interior_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_interior_nodes / local_dim) + 1), n_subtree_interior_nodes);

    // Only one work-item fills in the bookkeeping fields of the local subtree.
    if(thread_local_idx == 0){
      subtree.interior_base_idx = subtree_info_globalptr[1].parent;
      subtree.leaf_base_idx     = subtree_info_globalptr[0].leaf_lowidx;
      subtree.tree              = accelstruct;
    }

    // Copy the data for interior nodes from the global memory to local memory.
    // The bound is exclusive so neighbouring chunks never copy the same node
    // twice. The `pad` member is intentionally not copied.
    for(int i = interior_nodes_beg_idx; i < interior_nodes_end_idx; i++){

      subtree.interior_nodes[ i ].left_bound.min  = subtree_globalptr[ i ].left_bound.min;
      subtree.interior_nodes[ i ].left_bound.max  = subtree_globalptr[ i ].left_bound.max;
      subtree.interior_nodes[ i ].right_bound.min = subtree_globalptr[ i ].right_bound.min;
      subtree.interior_nodes[ i ].right_bound.max = subtree_globalptr[ i ].right_bound.max;
      subtree.interior_nodes[ i ].children[0]     = subtree_globalptr[ i ].children[0];
      subtree.interior_nodes[ i ].children[1]     = subtree_globalptr[ i ].children[1];
      subtree.interior_nodes[ i ].splitAxis       = subtree_globalptr[ i ].splitAxis;
    }

    // Leaf range of the subtree. The count is computed as high - low, i.e.
    // leaf_highidx is treated as exclusive here.
    // NOTE(review): confirm that leaf_highidx really is one-past-the-end.
    int leafnodes_lowidx                                = subtree_info_globalptr->leaf_lowidx;
    int leafnodes_highidx                               = subtree_info_globalptr->leaf_highidx;
    int n_subtree_leaf_nodes                            = leafnodes_highidx - leafnodes_lowidx;
    __global LinearBVHLeafNode* subtree_leaf_globalptr =  &(((__global BVHTree*)accelstruct)->leaf_nodes[leafnodes_lowidx]);

    // Same chunking as above; clamp to the leaf COUNT because the index is
    // relative to subtree_leaf_globalptr, not to the start of the whole tree.
    int leaf_nodes_beg_idx = thread_local_idx * ((n_subtree_leaf_nodes / local_dim) + 1);
    int leaf_nodes_end_idx = min((thread_local_idx + 1) * ((n_subtree_leaf_nodes / local_dim) + 1), n_subtree_leaf_nodes);

    // Copy the data for leaf nodes from the global memory to local memory
    for(int i = leaf_nodes_beg_idx; i < leaf_nodes_end_idx; i++){

      subtree.leaf_nodes[ i ].lowIdx  = subtree_leaf_globalptr[ i ].lowIdx;
      subtree.leaf_nodes[ i ].highIdx = subtree_leaf_globalptr[ i ].highIdx;
    }

  // Wait for all work-items to finish copying before the local data is read.
  barrier(CLK_LOCAL_MEM_FENCE);
//! Interior node of the linearized BVH as stored in the node arrays.
typedef struct {
  AABB      left_bound;     //!< bounds associated with the left side (presumably the left child's box)
  AABB      right_bound;    //!< bounds associated with the right side (presumably the right child's box)
  int       children[2];    //!< two child links; the encoding is not visible in this snippet
  ushort    splitAxis;      //!< split axis of this node
  int       pad;            //!< presumably explicit padding; the manual copy loop does not copy it
} LinearBVHInteriorNode;

//! Per-interior-node bookkeeping stored in a separate array, parallel to the nodes.
typedef struct {
  int       parent;         //!< presumably the index of the parent interior node -- confirm
  int       leaf_lowidx;    //!< index of the first leaf node belonging to this subtree
  int       leaf_highidx;   //!< upper leaf index; the copy code computes the count as high - low
  int       last_interior;  //!< index (into the whole tree) of the subtree's last interior node
} LinearBVHInteriorNodeInfo;

//! Leaf node of the linearized BVH: an index range [lowIdx, highIdx].
//! NOTE(review): what the range indexes (presumably primitives) and whether
//! highIdx is inclusive cannot be determined from this snippet.
typedef struct {
  int  lowIdx;   //!< lower end of the range
  int  highIdx;  //!< upper end of the range
} LinearBVHLeafNode;

//! Per-leaf bookkeeping stored in a separate array, parallel to the leaf nodes.
typedef struct {
  int  parent;  //!< presumably the index of the interior node owning this leaf -- confirm
} LinearBVHLeafNodeInfo;

//! Whole BVH as laid out in global memory: four parallel arrays plus their sizes.
typedef struct {
  __global LinearBVHInteriorNode     *interior_nodes;       //!< a pointer to interior nodes
  __global LinearBVHInteriorNodeInfo *interior_nodes_info;  //!< a pointer to information of interior nodes
  __global LinearBVHLeafNode         *leaf_nodes;           //!< a pointer to leaf nodes
  __global LinearBVHLeafNodeInfo     *leaf_nodes_info;      //!< a pointer to information of leaf nodes
  uint n_interior_nodes;                                    //!< Number of interior nodes
  uint n_leaf_nodes;                                        //!< Number of leaf nodes
} BVHTree;


//! Work-group-local copy of one subtree of the global BVH.
typedef struct {
  __local LinearBVHInteriorNode *interior_nodes;  //!< interior nodes copied into local memory
  __local LinearBVHLeafNode     *leaf_nodes;      //!< leaf nodes copied into local memory

  int interior_base_idx;                          //!< set from the info record following the subtree root (its `parent`)
  int leaf_base_idx;                              //!< set from the subtree root's leaf_lowidx

  __global BVHTree *tree;                         //!< the global tree this subtree was copied from
} local_BVHTree;
typedef struct {
  AABB      left_bound, right_bound;
  int       children[2];
  ushort    splitAxis;
  int       pad;
} LinearBVHInteriorNode;
typedef struct {
  int       parent;
  int       leaf_lowidx, leaf_highidx;
  int       last_interior;
} LinearBVHInteriorNodeInfo;
typedef struct {
  int  lowIdx, highIdx;
} LinearBVHLeafNode;
typedef struct {
  int  parent;
} LinearBVHLeafNodeInfo;
typedef struct {
  __global  LinearBVHInteriorNode*      interior_nodes;         //!< 指向内部节点的指针
  __global  LinearBVHInteriorNodeInfo*  interior_nodes_info;    //!< 指向内部节点信息的指针
  __global  LinearBVHLeafNode*          leaf_nodes;             //!< 指向叶节点的指针
  __global  LinearBVHLeafNodeInfo*      leaf_nodes_info;        //!< 指向叶节点信息的指针
  uint      n_interior_nodes;                                   //!< 内部节点数
  uint      n_leaf_nodes;                                       //!< 叶节点数
} BVHTree;
typedef struct {
  __local LinearBVHInteriorNode* interior_nodes;
  __local LinearBVHLeafNode*     leaf_nodes;
  int     interior_base_idx, leaf_base_idx;
  __global BVHTree*              tree;
} local_BVHTree;

我想在 OpenCL 内核中找一个类似
memcpy
的函数,因为这两个数组的结构完全相同,应当可以用这样一条指令直接复制。我原以为
async_work_group_copy
做的就是类似的事情。有人遇到过同样的问题吗?

本地内存中的数据是用来做什么的?你真的需要完整复制吗?我猜编译器可能发现你并没有真正用到所有被复制的数据,于是省掉了这次复制;而在另一种方法中,由于"块复制"的限制,异步拷贝没有被优化掉。@DarkZeros 我要复制的是包围体层次结构(BVH)的一部分,而且其中大部分都会用到。此外,编译器无法猜出树的哪一部分会被用到、哪一部分不会,因此也就无从判断哪些数据不必复制。@DarkZeros 另外,你认为 GPU 复制 32 KB 需要 100 毫秒吗?我的代码里应该有错误。在我的例子中,
async_work_group_copy
会使内核在 CPU 上运行时比手动复制慢好几倍。我在 NVIDIA GPU 上没有注意到这个问题。@doqtor 你在 AMD GPU 上试过吗?我用的是 AMD R9 280X。在 CPU 上这说得通,因为 CPU 根本没有本地内存,但在 GPU 上它应该更快才对。本地内存中的数据是用来做什么的?你真的需要完整复制吗?我猜编译器可能发现你并没有真正用到所有被复制的数据,于是省掉了这次复制;而在另一种方法中,由于"块复制"的限制,异步拷贝没有被优化掉。@DarkZeros 我要复制的是包围体层次结构(BVH)的一部分,而且其中大部分都会用到。此外,编译器无法猜出树的哪一部分会被用到、哪一部分不会,因此也就无从判断哪些数据不必复制。@DarkZeros 另外,你认为 GPU 复制 32 KB 需要 100 毫秒吗?我的代码里应该有错误。在我的例子中,
async_work_group_copy
会使内核在 CPU 上运行时比手动复制慢好几倍。我在 NVIDIA GPU 上没有注意到这个问题。@doqtor 你在 AMD GPU 上试过吗?我用的是 AMD R9 280X。在 CPU 上这说得通,因为 CPU 根本没有本地内存,但在 GPU 上它应该更快才对。