分区子群的Cuda实现_Cuda - Fatal编程技术网

分区子群的Cuda实现

cuda

分区子群的Cuda实现,cuda,Cuda,有没有更有效的方法来实现Vulkan/OpenGL的“分区子组”功能，这些功能不必循环子组中的所有元素？我当前的实现只是使用一个从0到扭曲大小的循环参考资料：（幻灯片37+38）简单实现： __device__ uint32_t subgroupPartitionNV(ivec2 p) { uint32_t result = 0; for (int i = 0; i < 32; ++i) { int x = __shfl_sync(0xFFF

有没有更高效的方法来实现 Vulkan/OpenGL 的“分区子组”（partitioned subgroup）功能，而不必遍历子组中的所有元素？我当前的实现只是使用一个从 0 到 warp 大小（32）的循环。

参考资料：

（幻灯片37+38）

简单实现：

/**
 * Emulates GLSL subgroupPartitionNV for a 2-component integer key.
 *
 * For the calling thread, builds a 32-bit warp ballot in which bit i is set
 * when lane i holds the same key (both p(0) and p(1) equal) as this thread.
 *
 * All warp intrinsics below use the full mask 0xFFFFFFFF, so all 32 lanes of
 * the warp must call this function together.
 *
 * NOTE(review): the lane index is derived from threadIdx.x alone, which
 * presumably assumes a 1D block (or blockDim.x being a multiple of 32) —
 * confirm against the launch configuration.
 *
 * @param p  Key of this thread; components are read as p(0) and p(1).
 * @return   Ballot mask of the lanes whose key equals this thread's key.
 */
__device__ uint32_t subgroupPartitionNV(ivec2 p)
{
    // Lane index within the warp. Computed once and explicitly parenthesised:
    // the expression `i == threadIdx.x & 31` parses as `(i == threadIdx.x) & 31`,
    // which never matches for threads with threadIdx.x >= 32.
    const int lane = threadIdx.x & 31;

    uint32_t result = 0;
    for (int i = 0; i < 32; ++i)
    {
        // Broadcast lane i's key to the whole warp.
        int x = __shfl_sync(0xFFFFFFFF, p(0), i);
        int y = __shfl_sync(0xFFFFFFFF, p(1), i);

        // Every lane votes on whether its own key equals lane i's key.
        uint32_t b = __ballot_sync(0xFFFFFFFF, p(0) == x && p(1) == y);

        // Only lane i keeps this ballot: it is the partition of lane i's key.
        if (i == lane) result = b;
    }
    return result;
}

/**
 * Sums `value` over the warp lanes selected by `ballot`.
 *
 * Each of the 32 lanes' values is broadcast in turn with __shfl_sync and
 * added to the result when the corresponding bit of `ballot` is set. Because
 * the shuffles use the full mask 0xFFFFFFFF, all 32 lanes of the warp must
 * call this function together.
 *
 * The return type is float (it was previously declared uint32_t, which
 * silently truncated the floating-point sum on return).
 *
 * @param value   This lane's contribution.
 * @param ballot  Bit mask of lanes to include; bit i selects lane i.
 * @return        Sum of `value` over all lanes whose bit is set in `ballot`.
 */
__device__ float subgroupPartitionedAddNV(float value, uint32_t ballot)
{
    float result = 0.0f;
    for (unsigned int i = 0; i < 32; ++i)
    {
        // The shuffle is executed unconditionally so every lane participates,
        // even when lane i is not part of this thread's partition.
        float other_value = __shfl_sync(0xFFFFFFFF, value, i);
        if ((1U << i) & ballot) result += other_value;
    }
    return result;
}

\uuuuuuuuuuuuuuuuuuuuuuuuuuuuuu32\uuuu t子分区NV（ivec2 p）
{
uint32_t结果=0；
对于（int i=0；i<32；++i）
{
int x=uuu shfl_sync（0xFFFFFFFF，p（0），i）；
int y=uu shfl_sync（0xFFFFFFFF，p（1），i）；
uint32\u t b=\u选票\u同步（0xFFFFFFFF，p（0）=x和&p（1）=y）；
如果（i==threadIdx.x&31）结果=b；
}
返回结果；
}
__设备uint32子分区ADDNV（浮动值，uint32投票）
{
浮动结果=0；
for（无符号整数i=0；i<32；++i）
{
浮动其他值=\uuuushfl\usync（0xFFFFFFFF，值，i）；
if（（1U
多亏了 Abator 的提示，我想出了一个更高效的解决方案。这有点难看，因为 labeled_partition 只针对 int 类型的标签实现，但效果相当好：
/**
 * Builds a coalesced group containing the threads of the current tile whose
 * key `p` matches this thread's key in both components.
 *
 * The tile of GROUP_SIZE threads is partitioned twice with labeled_partition,
 * once by p(0) and once by p(1); the lane masks of the two resulting groups
 * are intersected and a new coalesced_group is constructed from that mask.
 *
 * NOTE(review): this relies on cooperative_groups::details::
 * _coalesced_group_data_access, which lives in the library's `details`
 * namespace and is presumably not a stable public API — confirm against the
 * CUDA toolkit version in use.
 *
 * @tparam GROUP_SIZE  Tile size passed to tiled_partition (default 32).
 * @param  p           Key of this thread; components are read as p(0), p(1).
 * @return             Group built from the intersection of both partitions.
 */
template <int GROUP_SIZE = 32>
__device__ cooperative_groups::coalesced_group subgroupPartitionNV(ivec2 p)
{
    namespace cg = cooperative_groups;

    cg::thread_block this_block           = cg::this_thread_block();
    cg::thread_block_tile<GROUP_SIZE> tile = cg::tiled_partition<GROUP_SIZE>(this_block);

    // One partition per key component.
    cg::coalesced_group same_first  = cg::labeled_partition(tile, p(0));
    cg::coalesced_group same_second = cg::labeled_partition(tile, p(1));

    // Threads equal in both components are in the intersection of the masks.
    cg::details::_coalesced_group_data_access access;
    const auto first_mask  = access.get_mask(same_first);
    const auto second_mask = access.get_mask(same_second);
    return access.construct_from_mask<cg::coalesced_group>(first_mask & second_mask);
}


/**
 * Sums `value` over all threads of `group` and returns the total to every
 * thread of the group.
 *
 * The sum is built with a shfl_down reduction: at each step a thread adds the
 * value held `offset` ranks above it, provided that rank exists in the group.
 * After the loop only rank 0 is guaranteed to hold the complete sum, so the
 * result is broadcast from rank 0 to make every thread return the same total
 * (matching the loop-based overload that takes a ballot mask).
 *
 * All threads of `group` must call this function together.
 *
 * @tparam T           Value type; must support `+=` and the group's shuffles.
 * @tparam GROUP_SIZE  Upper bound on the group size; the first shuffle
 *                     offset is GROUP_SIZE / 2 (default 32).
 * @param  value       This thread's contribution.
 * @param  group       Group of threads to reduce over.
 * @return             Sum of `value` over all threads in `group`.
 */
template <typename T, int GROUP_SIZE = 32>
__device__ T subgroupPartitionedAddNV(T value, cooperative_groups::coalesced_group group)
{
    const int s = group.size();
    const int r = group.thread_rank();

    for (int offset = GROUP_SIZE / 2; offset > 0; offset /= 2)
    {
        // The shuffle is executed by every thread; the guard below discards
        // the result when rank r + offset lies outside the group.
        auto v = group.shfl_down(value, offset);
        if (r + offset < s) value += v;
    }

    // Ranks other than 0 only hold partial sums at this point; hand the
    // complete sum from rank 0 to the whole group.
    return group.shfl(value, 0);
}

模板
__设备协作组：：合并组子分区NV（ivec2 p）
{
使用命名空间协作组；
thread_block block=此_thread_block（）；
线程\u块\u平铺32=平铺分区（块）；
聚结_群g1=标记的_划分（tile32，p（0））；
聚结_群g2=标记的_划分（tile32，p（1））；
详细信息：：_合并_组_数据_访问acc；
从_掩码（acc.get_掩码（g1）和acc.get_掩码（g2））返回acc.construct_；
}
样板
__设备分区ADDNV（T值，协作组：：合并组）
{
int s=group.size（）；
int r=group.thread_rank（）；
对于（整数偏移=组大小/2；偏移>0；偏移/=2）
{
自动v=组模板shfl_向下（值，偏移）；
如果（r+偏移
您是否尝试过 cooperative_groups::labeled_partition？