async_work_group_copy 比 OpenCL 内核中的手动拷贝慢
在我的 OpenCL 内核中,我试图把访问比其他部分更频繁的那部分数据复制到本地内存(local memory)中。为此,一开始我只是手动逐个成员赋值。复制代码如下:
/*
 * Cooperative copy of one BVH subtree (interior nodes first, then leaf nodes)
 * from the global tree behind `accelstruct` into `subtree`. Every work-item
 * copies its own contiguous chunk of nodes; the barrier at the end keeps any
 * work-item from continuing before the whole work-group has finished copying.
 *
 * NOTE(review): `accelstruct`, `subtree_interior_idx`, `thread_local_idx`,
 * `local_dim` and `subtree` are defined outside this fragment. Presumably
 * thread_local_idx / local_dim are the work-item's local id and the
 * work-group size, and `subtree` is a local_BVHTree -- confirm against the
 * enclosing kernel.
 */
__global LinearBVHInteriorNode* subtree_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes[subtree_interior_idx]);
__global LinearBVHInteriorNodeInfo* subtree_info_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes_info[subtree_interior_idx]);
int last_subtree_interior_idx = subtree_info_globalptr->last_interior;
int n_subtree_interior_nodes = last_subtree_interior_idx - subtree_interior_idx + 1;

/* Nodes handled per work-item; the "+ 1" rounds up so local_dim chunks cover every node. */
int interior_nodes_per_thread = (n_subtree_interior_nodes / local_dim) + 1;
int interior_nodes_beg_idx = thread_local_idx * interior_nodes_per_thread;
/*
 * Exclusive end of this work-item's chunk. Indices are relative to
 * subtree_globalptr (already offset by subtree_interior_idx), so the clamp
 * must be the node COUNT, not the absolute index last_subtree_interior_idx;
 * clamping to the absolute index overruns the subtree whenever
 * subtree_interior_idx > 0.
 */
int interior_nodes_end_idx = min((thread_local_idx + 1) * interior_nodes_per_thread, n_subtree_interior_nodes);

/* Only one work-item fills in the bookkeeping fields of the subtree descriptor. */
if(thread_local_idx == 0){
    subtree.interior_base_idx = subtree_info_globalptr[1].parent;
    subtree.leaf_base_idx = subtree_info_globalptr[0].leaf_lowidx;
    subtree.tree = accelstruct;
}

/*
 * Copy the interior nodes from global memory to local memory.
 * The range is half-open [beg, end): an inclusive end would make adjacent
 * work-items copy the same boundary node twice.
 */
for(int i = interior_nodes_beg_idx; i < interior_nodes_end_idx; i++){
    subtree.interior_nodes[ i ].left_bound.min = subtree_globalptr[ i ].left_bound.min;
    subtree.interior_nodes[ i ].left_bound.max = subtree_globalptr[ i ].left_bound.max;
    subtree.interior_nodes[ i ].right_bound.min = subtree_globalptr[ i ].right_bound.min;
    subtree.interior_nodes[ i ].right_bound.max = subtree_globalptr[ i ].right_bound.max;
    subtree.interior_nodes[ i ].children[0] = subtree_globalptr[ i ].children[0];
    subtree.interior_nodes[ i ].children[1] = subtree_globalptr[ i ].children[1];
    subtree.interior_nodes[ i ].splitAxis = subtree_globalptr[ i ].splitAxis;
}

int leafnodes_lowidx = subtree_info_globalptr->leaf_lowidx;
int leafnodes_highidx = subtree_info_globalptr->leaf_highidx;
int n_subtree_leaf_nodes = leafnodes_highidx - leafnodes_lowidx;
__global LinearBVHLeafNode* subtree_leaf_globalptr = &(((__global BVHTree*)accelstruct)->leaf_nodes[leafnodes_lowidx]);

int leaf_nodes_per_thread = (n_subtree_leaf_nodes / local_dim) + 1;
int leaf_nodes_beg_idx = thread_local_idx * leaf_nodes_per_thread;
/*
 * Exclusive end, clamped to the leaf COUNT: subtree_leaf_globalptr is already
 * offset by leafnodes_lowidx, so the absolute leafnodes_highidx is the wrong
 * bound whenever leafnodes_lowidx > 0.
 */
int leaf_nodes_end_idx = min((thread_local_idx + 1) * leaf_nodes_per_thread, n_subtree_leaf_nodes);

/* Copy the leaf nodes from global memory to local memory. */
for(int i = leaf_nodes_beg_idx; i < leaf_nodes_end_idx; i++){
    subtree.leaf_nodes[ i ].lowIdx = subtree_leaf_globalptr[ i ].lowIdx;
    subtree.leaf_nodes[ i ].highIdx = subtree_leaf_globalptr[ i ].highIdx;
}

/* Wait until every work-item of the group has finished its part of the copy. */
barrier(CLK_LOCAL_MEM_FENCE);
/**
 * One interior node of the linearized BVH.
 * The subtree copy code transfers the two bounds, both child entries and
 * splitAxis member by member; `pad` is not copied there.
 */
typedef struct {
    AABB left_bound;    /* presumably the bounds of the left child -- confirm */
    AABB right_bound;   /* presumably the bounds of the right child -- confirm */
    int children[2];    /* two child entries; index semantics not shown here */
    ushort splitAxis;
    int pad;            /* presumably padding only -- confirm */
} LinearBVHInteriorNode;
/**
 * Bookkeeping stored alongside each interior node.
 * The subtree copy code derives the interior-node count of a subtree from
 * last_interior and the leaf count from leaf_highidx - leaf_lowidx.
 */
typedef struct {
    int parent;         /* presumably the index of the parent interior node -- confirm */
    int leaf_lowidx;    /* first leaf index belonging to this subtree */
    int leaf_highidx;   /* upper leaf index of this subtree */
    int last_interior;  /* index of the last interior node of this subtree */
} LinearBVHInteriorNodeInfo;
/**
 * One leaf node of the linearized BVH: a pair of indices.
 * NOTE(review): presumably a range into a primitive array -- the consumer of
 * lowIdx/highIdx is not visible here.
 */
typedef struct {
    int lowIdx;
    int highIdx;
} LinearBVHLeafNode;
/* Bookkeeping stored alongside each leaf node. */
typedef struct {
int parent; /* presumably the index of the owning interior node -- confirm */
} LinearBVHLeafNodeInfo;
/**
 * Whole BVH as stored in global memory: four parallel arrays (nodes and
 * their info records) plus the element counts.
 */
typedef struct {
    /** Pointer to the interior nodes. */
    __global LinearBVHInteriorNode *interior_nodes;
    /** Pointer to the information records of the interior nodes. */
    __global LinearBVHInteriorNodeInfo *interior_nodes_info;
    /** Pointer to the leaf nodes. */
    __global LinearBVHLeafNode *leaf_nodes;
    /** Pointer to the information records of the leaf nodes. */
    __global LinearBVHLeafNodeInfo *leaf_nodes_info;
    /** Number of interior nodes. */
    uint n_interior_nodes;
    /** Number of leaf nodes. */
    uint n_leaf_nodes;
} BVHTree;
/**
 * Descriptor of a subtree that has been staged in local memory.
 * The subtree copy code fills the two node arrays cooperatively and lets a
 * single work-item set the base indices and the back-pointer.
 */
typedef struct {
    __local LinearBVHInteriorNode *interior_nodes;  /* staged interior nodes */
    __local LinearBVHLeafNode *leaf_nodes;          /* staged leaf nodes */
    int interior_base_idx;  /* presumably the global index of the first staged interior node -- confirm */
    int leaf_base_idx;      /* set from leaf_lowidx of the subtree root's info record */
    __global BVHTree *tree; /* the global tree the subtree was copied from */
} local_BVHTree;
但是它执行这次复制需要 1.0 s,这非常奇怪。我使用的结构体定义如下:
/*
 * Cooperative copy of one BVH subtree (interior nodes first, then leaf nodes)
 * from the global tree behind `accelstruct` into `subtree`. Every work-item
 * copies its own contiguous chunk of nodes; the barrier at the end keeps any
 * work-item from continuing before the whole work-group has finished copying.
 *
 * NOTE(review): `accelstruct`, `subtree_interior_idx`, `thread_local_idx`,
 * `local_dim` and `subtree` are defined outside this fragment. Presumably
 * thread_local_idx / local_dim are the work-item's local id and the
 * work-group size, and `subtree` is a local_BVHTree -- confirm against the
 * enclosing kernel.
 */
__global LinearBVHInteriorNode* subtree_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes[subtree_interior_idx]);
__global LinearBVHInteriorNodeInfo* subtree_info_globalptr = &(((__global BVHTree*)accelstruct)->interior_nodes_info[subtree_interior_idx]);
int last_subtree_interior_idx = subtree_info_globalptr->last_interior;
int n_subtree_interior_nodes = last_subtree_interior_idx - subtree_interior_idx + 1;

/* Nodes handled per work-item; the "+ 1" rounds up so local_dim chunks cover every node. */
int interior_nodes_per_thread = (n_subtree_interior_nodes / local_dim) + 1;
int interior_nodes_beg_idx = thread_local_idx * interior_nodes_per_thread;
/*
 * Exclusive end of this work-item's chunk. Indices are relative to
 * subtree_globalptr (already offset by subtree_interior_idx), so the clamp
 * must be the node COUNT, not the absolute index last_subtree_interior_idx;
 * clamping to the absolute index overruns the subtree whenever
 * subtree_interior_idx > 0.
 */
int interior_nodes_end_idx = min((thread_local_idx + 1) * interior_nodes_per_thread, n_subtree_interior_nodes);

/* Only one work-item fills in the bookkeeping fields of the subtree descriptor. */
if(thread_local_idx == 0){
    subtree.interior_base_idx = subtree_info_globalptr[1].parent;
    subtree.leaf_base_idx = subtree_info_globalptr[0].leaf_lowidx;
    subtree.tree = accelstruct;
}

/*
 * Copy the interior nodes from global memory to local memory.
 * The range is half-open [beg, end): an inclusive end would make adjacent
 * work-items copy the same boundary node twice.
 */
for(int i = interior_nodes_beg_idx; i < interior_nodes_end_idx; i++){
    subtree.interior_nodes[ i ].left_bound.min = subtree_globalptr[ i ].left_bound.min;
    subtree.interior_nodes[ i ].left_bound.max = subtree_globalptr[ i ].left_bound.max;
    subtree.interior_nodes[ i ].right_bound.min = subtree_globalptr[ i ].right_bound.min;
    subtree.interior_nodes[ i ].right_bound.max = subtree_globalptr[ i ].right_bound.max;
    subtree.interior_nodes[ i ].children[0] = subtree_globalptr[ i ].children[0];
    subtree.interior_nodes[ i ].children[1] = subtree_globalptr[ i ].children[1];
    subtree.interior_nodes[ i ].splitAxis = subtree_globalptr[ i ].splitAxis;
}

int leafnodes_lowidx = subtree_info_globalptr->leaf_lowidx;
int leafnodes_highidx = subtree_info_globalptr->leaf_highidx;
int n_subtree_leaf_nodes = leafnodes_highidx - leafnodes_lowidx;
__global LinearBVHLeafNode* subtree_leaf_globalptr = &(((__global BVHTree*)accelstruct)->leaf_nodes[leafnodes_lowidx]);

int leaf_nodes_per_thread = (n_subtree_leaf_nodes / local_dim) + 1;
int leaf_nodes_beg_idx = thread_local_idx * leaf_nodes_per_thread;
/*
 * Exclusive end, clamped to the leaf COUNT: subtree_leaf_globalptr is already
 * offset by leafnodes_lowidx, so the absolute leafnodes_highidx is the wrong
 * bound whenever leafnodes_lowidx > 0.
 */
int leaf_nodes_end_idx = min((thread_local_idx + 1) * leaf_nodes_per_thread, n_subtree_leaf_nodes);

/* Copy the leaf nodes from global memory to local memory. */
for(int i = leaf_nodes_beg_idx; i < leaf_nodes_end_idx; i++){
    subtree.leaf_nodes[ i ].lowIdx = subtree_leaf_globalptr[ i ].lowIdx;
    subtree.leaf_nodes[ i ].highIdx = subtree_leaf_globalptr[ i ].highIdx;
}

/* Wait until every work-item of the group has finished its part of the copy. */
barrier(CLK_LOCAL_MEM_FENCE);
/**
 * One interior node of the linearized BVH.
 * The subtree copy code transfers the two bounds, both child entries and
 * splitAxis member by member; `pad` is not copied there.
 */
typedef struct {
    AABB left_bound;    /* presumably the bounds of the left child -- confirm */
    AABB right_bound;   /* presumably the bounds of the right child -- confirm */
    int children[2];    /* two child entries; index semantics not shown here */
    ushort splitAxis;
    int pad;            /* presumably padding only -- confirm */
} LinearBVHInteriorNode;
/**
 * Bookkeeping stored alongside each interior node.
 * The subtree copy code derives the interior-node count of a subtree from
 * last_interior and the leaf count from leaf_highidx - leaf_lowidx.
 */
typedef struct {
    int parent;         /* presumably the index of the parent interior node -- confirm */
    int leaf_lowidx;    /* first leaf index belonging to this subtree */
    int leaf_highidx;   /* upper leaf index of this subtree */
    int last_interior;  /* index of the last interior node of this subtree */
} LinearBVHInteriorNodeInfo;
/**
 * One leaf node of the linearized BVH: a pair of indices.
 * NOTE(review): presumably a range into a primitive array -- the consumer of
 * lowIdx/highIdx is not visible here.
 */
typedef struct {
    int lowIdx;
    int highIdx;
} LinearBVHLeafNode;
/* Bookkeeping stored alongside each leaf node. */
typedef struct {
int parent; /* presumably the index of the owning interior node -- confirm */
} LinearBVHLeafNodeInfo;
/**
 * Whole BVH as stored in global memory: four parallel arrays (nodes and
 * their info records) plus the element counts.
 */
typedef struct {
    /** Pointer to the interior nodes. */
    __global LinearBVHInteriorNode *interior_nodes;
    /** Pointer to the information records of the interior nodes. */
    __global LinearBVHInteriorNodeInfo *interior_nodes_info;
    /** Pointer to the leaf nodes. */
    __global LinearBVHLeafNode *leaf_nodes;
    /** Pointer to the information records of the leaf nodes. */
    __global LinearBVHLeafNodeInfo *leaf_nodes_info;
    /** Number of interior nodes. */
    uint n_interior_nodes;
    /** Number of leaf nodes. */
    uint n_leaf_nodes;
} BVHTree;
/**
 * Descriptor of a subtree that has been staged in local memory.
 * The subtree copy code fills the two node arrays cooperatively and lets a
 * single work-item set the base indices and the back-pointer.
 */
typedef struct {
    __local LinearBVHInteriorNode *interior_nodes;  /* staged interior nodes */
    __local LinearBVHLeafNode *leaf_nodes;          /* staged leaf nodes */
    int interior_base_idx;  /* presumably the global index of the first staged interior node -- confirm */
    int leaf_base_idx;      /* set from leaf_lowidx of the subtree root's info record */
    __global BVHTree *tree; /* the global tree the subtree was copied from */
} local_BVHTree;
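typedef struct {
AABB left_bound, right_bound;
int children[2];
ushort splitAxis;
int pad;
} LinearBVHInteriorNode;
typedef struct {
int parent;
int leaf_lowidx, leaf_highidx;
int last_interior;
} LinearBVHInteriorNodeInfo;
typedef struct {
int lowIdx, highIdx;
} LinearBVHLeafNode;
typedef struct {
int parent;
} LinearBVHLeafNodeInfo;
typedef struct {
__global LinearBVHInteriorNode* interior_nodes; //!< a pointer to interior nodes
__global LinearBVHInteriorNodeInfo* interior_nodes_info; //!< a pointer to information of interior nodes
__global LinearBVHLeafNode* leaf_nodes; //!< a pointer to leaf nodes
__global LinearBVHLeafNodeInfo* leaf_nodes_info; //!< a pointer to information of leaf nodes
uint n_interior_nodes; //!< Number of interior nodes
uint n_leaf_nodes; //!< Number of leaf nodes
} BVHTree;
typedef struct {
__local LinearBVHInteriorNode* interior_nodes;
__local LinearBVHLeafNode* leaf_nodes;
int interior_base_idx, leaf_base_idx;
__global BVHTree* tree;
} local_BVHTree;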
我想在 OpenCL 内核中找一个类似
memcpy
的函数,因为两个数组具有相同的结构,可以用这样的指令直接复制。我认为 async_work_group_copy
也在做类似的事情。有人遇到过同样的问题吗?你在本地内存中存放的是什么数据?你真的需要执行完整复制吗?我猜编译器可能发现你并没有真正用到所有被复制的数据,于是省掉了一部分复制;而在另一种方式中,由于“块复制”的约束,异步拷贝无法做这样的优化。@DarkZeros 我复制的是包围体层次结构(BVH)的一部分,并且其中大部分都会被用到。此外,编译器无法猜到树的哪些部分会被用到、哪些不会,因此也无从判断哪些数据不必复制。@DarkZeros 另外,你认为 GPU 复制 32 KB 数据需要 100 毫秒合理吗?我的代码中应该有错误。在我的例子中,async_work_group_copy
使内核在 CPU 上运行时比手动复制慢好几倍。我在 NVIDIA GPU 上没有注意到这个问题。@doqtor 你试过 AMD GPU 吗?我用的是 AMD R9 280X。在 CPU 上这说得通,因为 CPU 根本没有本地内存;但在 GPU 上它应该更快。你在本地内存中存放的是什么数据?你真的需要执行完整复制吗?我猜编译器可能发现你并没有真正用到所有被复制的数据,于是省掉了一部分复制;而在另一种方式中,由于“块复制”的约束,异步拷贝无法做这样的优化。@DarkZeros 我复制的是包围体层次结构(BVH)的一部分,并且其中大部分都会被用到。此外,编译器无法猜到树的哪些部分会被用到、哪些不会,因此也无从判断哪些数据不必复制。@DarkZeros 另外,你认为 GPU 复制 32 KB 数据需要 100 毫秒合理吗?我的代码中应该有错误。在我的例子中,async_work_group_copy
使内核在 CPU 上运行时比手动复制慢好几倍。我在 NVIDIA GPU 上没有注意到这个问题。@doqtor 你试过 AMD GPU 吗?我用的是 AMD R9 280X。在 CPU 上这说得通,因为 CPU 根本没有本地内存;但在 GPU 上它应该更快。