Is CUDA __syncthreads() working correctly in this code?


Edit:

Added more logging information, as well as the push- and pop-related functions.



Setup: GeForce GT520, Windows 64-bit (compiling for 32 bits), CUDA 4.2

In my code snippet I have some work deques that provide work for the threads. Each block has its own deque, at whose bottom it can push or pop dynamically generated work items (the popWork and pushWork functions). popWork() can also steal work from other deques when the number of work items in its own deque falls below a threshold.

template <class TreeNode, class BV , int iDequeSize , int iFrontSize>
__global__ void traverseTree(const TreeNode* tree_object1, const GPUVertex* vertex_object1, const uint3* tri_object1,
int2* aBvttDeques, int* aiBvttBottoms, unsigned int* auiBvttAges, int *aiBvttDequeFlags , int *piOverflowFlag , 
int2* outputList, unsigned int* outputListIdx , int2* aFrontDeques , int *auiFrontBottoms)
{   
int iTid = threadIdx.x;
int iBid = blockIdx.x;

__shared__ int2 aLocalBvtt[BVTT_DEQUE_SHARED_SIZE];
__shared__ int2 aLocalFront[FRONT_DEQUE_SHARED_SIZE];
__shared__ int iLocalBvttCounter[WORK_STEALING_THREADS];
__shared__ int iLocalFrontCounter[WORK_STEALING_THREADS];
__shared__ unsigned int uiPopDequeIdx;
__shared__ int2 pushOrPopStartIdxAndSize;
__shared__ bool bPopOrPushFlag;
__shared__ unsigned int uiActiveDeques[NDEQUES]; //Contains indices for deques with useful work that can be stolen
__shared__ unsigned int uiActiveDequesIdx;

//Debug
int iRun = 0;
//

while(/*true*/ /*Debug*/iRun++ < 10) //Work loop continues until we can neither pop from the bottom nor steal work from other deques
{
    //Debug
    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) before sync0\n" , iBid , iTid);
    }

    __syncthreads();

    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) after sync0\n" , iBid , iTid);
    }
    //

    int2 aWork_items[3];
    int2 aFront_item;
    int iBvttCount = 0;
    int iFrontCount = 0;

    if(!popWork<int2 , iDequeSize>(aiBvttDequeFlags , aiBvttBottoms , auiBvttAges , aBvttDeques , iTid , iBid ,
        uiPopDequeIdx , pushOrPopStartIdxAndSize , /*iLocalBvttCounter , iLocalFrontCounter ,*/ bPopOrPushFlag ,
        uiActiveDeques , uiActiveDequesIdx , aWork_items[0]))
    {   //No more work

        //Debug
        if(iTid == 0)
        {
            printf("(%d,%d)no work\n" , iBid , iTid);
        }
        //

        return; 
    }

    //Debug
    if(iTid == 0 && iBid == 0)
    {
        printf("(%d,%d) run=%d work=(%d,%d)\n" , iBid , iTid , iRun , aWork_items[0].x , aWork_items[0].y);
    }
    //

    if(iTid < pushOrPopStartIdxAndSize.y)
    {
        TreeNode node1      = tree_object1[aWork_items[0].x];
        TreeNode node2      = tree_object1[aWork_items[0].y];

        if(aWork_items[0].x == aWork_items[0].y)
        {   //intra-collision test (self collision)
            intraCollision<TreeNode , BV>(iBvttCount , aWork_items , aWork_items[0] , node1 , tree_object1);
        }
        else
        {   //inter-collision test
            //interCollision<TreeNode , BV>(iLocalBvttCounter , aLocalBvtt , outputList , outputListIdx , work_item ,
            //  node1 , node2 , tree_object1 , tri_object1 , aFrontDeques , auiFrontBottoms[iBid]);

            interCollision<TreeNode , BV>(iBvttCount , aWork_items , outputList , outputListIdx , aWork_items[0] ,
                node1 , node2 , tree_object1 , tri_object1 , aFront_item , iFrontCount);
        }
        //__syncthreads();
    }
    iLocalBvttCounter[iTid] = iBvttCount;
    iLocalFrontCounter[iTid] = iFrontCount;

    //Debug
    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) before sync1\n" , iBid , iTid);
    }
    //

    __syncthreads();

    //Debug
    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) after sync1\n" , iBid , iTid);
    }
    //

    {
        int iPrefixSum = prefixSum<WORK_STEALING_THREADS>(iLocalFrontCounter , iTid , iFrontCount);
        if(iFrontCount)
        {
            aLocalFront[iPrefixSum] = aFront_item;
        }
        if(iTid == WORK_STEALING_THREADS - 1)
        {
            iLocalFrontCounter[iTid] = iPrefixSum + iFrontCount;
        }

        iPrefixSum = prefixSum<WORK_STEALING_THREADS>(iLocalBvttCounter , iTid , iBvttCount);

        //Debug
        if(iTid == 0 && iBid == 0)
        {
            printf("(%d,%d) nChildren=%d prefixSum=%d\n" , iBid , iTid , iBvttCount , iPrefixSum);
            for(int i = 0 ; i < iBvttCount ; ++i)
            {
                printf("(%d,%d) children %d=(%d,%d)\n" , iBid , iTid , i , aWork_items[i].x , aWork_items[i].y);
            }
        }
        if(iTid == 1 && iBid == 0)
        {
            printf("(%d,%d) nprefixSumt2=%d\n" , iBid , iTid , iPrefixSum);
        }
        //

        for(int i = 0 ; i < iBvttCount ; ++i)
        {
            aLocalBvtt[iPrefixSum + i] = aWork_items[i];
        }
        if(iTid == WORK_STEALING_THREADS - 1)
        {
            iLocalBvttCounter[iTid] = iPrefixSum + iBvttCount;

            //Debug
            if(iBid == 0)
            {
                printf("(%d,%d) totalWork=%d, prefix=%d + count=%d\n" , iBid , iTid , iLocalBvttCounter[iTid] , iPrefixSum , iBvttCount);
            }
            //
        }
    }

    //Debug
    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) before sync2\n" , iBid , iTid);
    }
    //

    __syncthreads();

    //Debug
    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) after sync2\n" , iBid , iTid);
    }
    //

    //TODO: push work back only when the memory size is good, for better coalescence
    //Push back front to global mem
    if(!pushWork<int2 , iFrontSize>(auiFrontBottoms , aFrontDeques , iTid , iBid ,
        iLocalFrontCounter[WORK_STEALING_THREADS - 1] , aLocalFront , bPopOrPushFlag , pushOrPopStartIdxAndSize))
    {   //overflow
        if(iTid == 0)
        {
            //Debug
            printf("(%d,%d) front overflow\n" , iBid , iTid);
            //
            *piOverflowFlag = 1;
            atomicExch(&aiBvttDequeFlags[iBid] , 0);
        }
        return;
    }

    //Debug
    if(iTid == 0 && iBid == 0)
    {
        printf("(%d,%d) localnWork=%d\n" , iBid , iTid , iLocalBvttCounter[WORK_STEALING_THREADS - 1]);
    }
    //

    //Push back BVTT nodes to global mem
    if(!pushWork<int2 , iDequeSize>(aiBvttBottoms , aBvttDeques , iTid , iBid , iLocalBvttCounter[WORK_STEALING_THREADS - 1] ,
        aLocalBvtt , bPopOrPushFlag , pushOrPopStartIdxAndSize))
    {   //overflow
        if(iTid == 0)
        {
            //Debug
            printf("(%d,%d) bvtt overflow\n" , iBid , iTid);
            //

            *piOverflowFlag = 1;
            atomicExch(&aiBvttDequeFlags[iBid] , 0);
        }
        return;
    }

    //Debug
    if(iTid == 0 && iBid == 0)
    {
        printf("(%d,%d) bot=%d\n\n" , iBid , iTid , aiBvttBottoms[iBid]);
    }
    //

    //Debug
    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) before sync4\n" , iBid , iTid);
    }

    __syncthreads();

    if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
    {
        printf("(%d,%d) after sync4\n" , iBid , iTid);
    }
    //
}

//Debug
if(iTid == 0)
{
    printf("(%d,%d) max iter\n" , iBid , iTid);
}
//
}

Thanks for your attention.

The early return breaks the uniformity here. Technically, from that point on all of the __syncthreads() are in conditional code and therefore have undefined behavior. Note that this is true even if the return is never executed, because it still affects the compiler's choice of reconvergence points.

If some threads run out of work, you still have to keep them alive so they can cooperate in the __syncthreads().

You can test this theory by eliminating the return and setting a per-thread "skip code" flag instead. All code other than the __syncthreads() would then live inside if(!skip_flag){…} conditionals. I believe this would fix the problem (barring unrelated issues).
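To illustrate, here is a minimal sketch of that skip-flag pattern (hypothetical code, not the poster's kernel; workLoopSketch, piWorkSource and iActiveThreads are names made up for this sketch): no thread returns from inside the loop, so every __syncthreads() is reached by the whole block, and the exit decision is made uniformly after a barrier.

__global__ void workLoopSketch(int *piWorkSource)
{
    __shared__ int iActiveThreads;
    bool bSkip = false; //per-thread "skip code" flag replacing the early return

    for(int iRun = 0; iRun < 10; ++iRun)
    {
        if(threadIdx.x == 0)
        {
            iActiveThreads = 0;
        }
        __syncthreads();

        if(!bSkip)
        {   //popWork(), processing and pushWork() would go here; on failure,
            //set the flag instead of returning (piWorkSource stands in for
            //the real deque state)
            if(atomicSub(piWorkSource , 1) <= 0)
            {
                bSkip = true;
            }
            else
            {
                atomicAdd(&iActiveThreads , 1);
            }
        }

        __syncthreads(); //unconditional: reached by active and idle threads alike

        bool bAllIdle = (iActiveThreads == 0); //read before anyone can reset it
        __syncthreads();

        if(bAllIdle)
        {   //this condition is uniform across the block, so all threads return together
            return;
        }
    }
}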


Alternatively, you can dig into the object code with cuobjdump -sass and check where the reconvergence points are placed by looking at the SSY instructions. I believe you will find that the reconvergence point lies beyond the next __syncthreads(), possibly all the way at the end of the kernel.


Just an observation: you have multiple synchronization points in your kernel.


Could it be that, because there are multiple __syncthreads() inside the loop, threads executing at different speeds end up synchronizing at different sync points? Some wait at the first sync point, some at the next one, and everything falls apart. Synchronizing once at the end of the loop might fix this.
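A hypothetical sketch of that failure mode (made-up code, not the poster's kernel). As far as I understand, on hardware of this generation every __syncthreads() maps to the same hardware barrier, so waiting warps are released as soon as the arrival count is reached, regardless of which __syncthreads() the other warps are sitting at; combined with an early return, one warp's first barrier can be matched against another warp's second barrier, much like the sync0/sync4 mismatch in the log at the end of the question.

__global__ void mismatchedBarriers(const int *piHasWork)
{
    for(int i = 0; i < 10; ++i)
    {
        if(piHasWork[blockDim.x * i + threadIdx.x] == 0)
        {
            return; //some warps may leave the loop early here
        }
        __syncthreads(); //barrier A: slow warps may still be waiting here...
        //...per-thread work of varying cost...
        __syncthreads(); //barrier B: ...while fast warps already wait here
    }
}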


GeForce GT520, Windows 64-bit (compiling for 32 bits), CUDA 4.2. What about another __syncthreads() at the beginning of the while loop? Already tried that, but it had no effect.

Is that still the case even though popWork() flags the out-of-work condition per block rather than per thread? I will post the popWork() code so you can see how the pop is done. I have also added more detailed logs; now I can see that thread (0,95) is bypassing a synchronization point, but I don't know why yet.

Yes, I believe it is still the case if the conditional return is executed uniformly across the whole block, or even if it is never executed at all, because it changes the compiler's choice of reconvergence points. The conditional sections of the object code can therefore extend further than the conditional sections of the source code (on compute capability 2.0 or later, possibly all the way to the end of the kernel), and thus enclose __syncthreads() that were believed to be unconditional. I admit this would probably violate the documented behavior, but I believe it is what happens. It would be nice to hear a word from an Nvidia employee with a bit more inside knowledge. I believe this has always been a murky area; I wanted to dig up the relevant threads from Nvidia's CUDA forums, but unfortunately they are still down.
bool __inline__ __device__ popTop(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int index;
unsigned int oldAge = auiDequesAges[iBid]; //age packs the top index (high bits) and an ABA tag (low bits)
int localBot = aiDequesBottoms[iBid];
index = oldAge >> WORK_STEALING_TAG_NBITS;
if(localBot < index + WORK_STEALING_POP_SIZE + WORK_STEALING_PUSH_SIZE)
{   //Not enough items left in this deque to steal from
    return false;
}

int localTag = oldAge & WORK_STEALING_TAG_MASK;
int size = min(WORK_STEALING_POP_SIZE , localBot - index);
unsigned int newAge = ((index + size) << WORK_STEALING_TAG_NBITS) | localTag;

if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
{   //No other block stole these items in the meantime; the steal succeeded
    popStartIdxAndSize.x = index;
    popStartIdxAndSize.y = size;
    return true;
}
else
{
    return false;
}
}

bool __inline__ __device__ popBottom(int *aiDequesBottoms , unsigned int *auiDequesAges , const int &iBid ,
int2 &popStartIdxAndSize)
{
int localBot = aiDequesBottoms[iBid];
if(localBot == 0)
{   //Deque is empty
    return false;
}

int index = localBot;
localBot = localBot - WORK_STEALING_POP_SIZE;
atomicExch(&aiDequesBottoms[iBid] , localBot);

unsigned int oldAge = auiDequesAges[iBid];
int oldAgeTop = int(oldAge >> WORK_STEALING_TAG_NBITS);
if(localBot > oldAgeTop)
{   //Bottom is still clearly above the top: a full pop with no conflict
    popStartIdxAndSize.y = WORK_STEALING_POP_SIZE;
    popStartIdxAndSize.x = index - WORK_STEALING_POP_SIZE;
    return true;
}

//Possible conflict with a concurrent steal: empty the deque and bump the tag
atomicExch(&aiDequesBottoms[iBid] , 0);
unsigned int newAge = ((oldAge & WORK_STEALING_TAG_MASK) + 1) % (WORK_STEALING_TAG_MASK + 1);
if(index > oldAgeTop)
{   //Some items remain between the old top and the old bottom; try to claim them
    if(oldAge == atomicCAS(&auiDequesAges[iBid] , oldAge , newAge))
    {
        popStartIdxAndSize.y = index - oldAgeTop;
        popStartIdxAndSize.x = index - popStartIdxAndSize.y;
        return true;
    }
}

atomicExch(&auiDequesAges[iBid] , newAge);
return false;
}

template <typename Work , int iDequeSize>
bool __inline__ __device__ popWork(int *aiDequeFlags , int *aiDequesBottoms , unsigned int *auiDequesAges ,
const Work *aDeques , const int &iTid , const int &iBid , unsigned int &uiPopDequeIdx , int2 &popStartIdxAndSize ,
/*int &iLocalDequeCounter , int &iLocalFrontCounter ,*/ bool &bPopFlag , unsigned int *uiActiveDeques ,
unsigned int &uiActiveDequesIdx , Work &work)
{
if(iTid == 0)
{   //Try to pop from block deque
    //iLocalDequeCounter = 0;
    //iLocalFrontCounter = 0;

    bPopFlag = popBottom(aiDequesBottoms , auiDequesAges , iBid , popStartIdxAndSize);

    if(bPopFlag)
    {
        uiPopDequeIdx = iBid;
    }
    else
    {
        atomicExch(&aiDequeFlags[iBid] , 0);
    }
}
__syncthreads();

while(!bPopFlag)
{   //No more work, try to steal some (Help, police! We have a burglar here)!
    if(iTid == 0)
    {
        uiActiveDequesIdx = 0;

        //Debug
        /*if(iBid == 6 || iBid == 1)
        {
            printf("bid=%d dequeFlags:[%d,%d,%d,%d,%d,%d,%d,%d]\n" , iBid , aiDequeFlags[0] , aiDequeFlags[1] ,
                aiDequeFlags[2] , aiDequeFlags[3] , aiDequeFlags[4] , aiDequeFlags[5] , aiDequeFlags[6] ,
                aiDequeFlags[7]);

            printf("bId=%d dequesCounts:[%d,%d,%d,%d,%d,%d,%d,%d]\n" , iBid , aiDequesBottoms[0] ,
                aiDequesBottoms[1] , aiDequesBottoms[2] , aiDequesBottoms[3] , aiDequesBottoms[4] ,
                aiDequesBottoms[5] , aiDequesBottoms[6] , aiDequesBottoms[7]);
        }*/
        //
    }
    __syncthreads();

    if(iTid < NDEQUES)
    {
        if(aiDequeFlags[iTid] == 1) //assuming blockDim.x >= NDEQUES
        {   //Set this deque up for a work-stealing attempt.
            unsigned int uiIdx = atomicAdd(&uiActiveDequesIdx,1);
            uiActiveDeques[uiIdx] = iTid;
        }
    }
    __syncthreads();

    if(iTid == 0)
    {   //Try to steal until succeeds or there are no more deques left to search
        bPopFlag = false;
        for(uiPopDequeIdx = 0 ; uiPopDequeIdx < uiActiveDequesIdx; ++uiPopDequeIdx)
        {
            bPopFlag = popTop(aiDequesBottoms , auiDequesAges , uiPopDequeIdx , popStartIdxAndSize);
            if(bPopFlag)
            {
                atomicExch(&aiDequeFlags[iBid] , 1);
                break;
            }
        }
    }
    __syncthreads();

    if(uiActiveDequesIdx == 0)
    { //No more work to steal. End.
        return false;
    }
    __syncthreads();
}

//Get popped data
if(iTid < popStartIdxAndSize.y) //assuming number of threads >= WORK_SIZE
{
    work = aDeques[uiPopDequeIdx*iDequeSize + popStartIdxAndSize.x + iTid];

    //Debug
    /*if(iTid == 20 && iBid == 0)
    {
        printf("work=(%d,%d) deque=%d dSize=%d start=%d final=%d\n" , work.x , work.y , uiPopDequeIdx , iDequeSize ,
            popStartIdxAndSize.x , uiPopDequeIdx*iDequeSize + popStartIdxAndSize.x + iTid);
    }*/
    //
}

return true;
}
template<int iDequeSize>
bool __inline__ __device__ pushBottom(int *aiDequeBottoms , const int &iBid , int2 &pushStartIdxAndSize)
{
int iOldBot = aiDequeBottoms[iBid];
pushStartIdxAndSize.x = iOldBot;
iOldBot += pushStartIdxAndSize.y;

if(iOldBot < iDequeSize)
{   //There is room: publish the new bottom
    atomicExch(&aiDequeBottoms[iBid] , iOldBot);
    return true;
}

return false; //Deque overflow
}


template <typename Work , int iDequeSize>
bool __inline__ __device__ pushWork(int *aiDequesBottoms , Work *aDeques , const int &iTid , const int &iBid ,
const unsigned int &uiDequeOutputCounter , Work *aOutputLocalWork , bool &bPushFlag , int2 &pushStartIdxAndSize)
{
if(uiDequeOutputCounter == 0)
{
    return true;
}

pushStartIdxAndSize.y = uiDequeOutputCounter;

if(iTid == 0)
{
    bPushFlag = pushBottom<iDequeSize>(aiDequesBottoms , iBid , pushStartIdxAndSize);
}

//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
    printf("(%d,%d) before push sync\n" , iBid , iTid);
}
//
__syncthreads();

//Debug
if(iBid == 0 && (iTid == 0 || iTid == WORK_STEALING_THREADS - 1))
{
    printf("(%d,%d) after push sync\n" , iBid , iTid);
}
//

if(!bPushFlag)
{
    return false;
}

//Transfer to global mem.
//unsigned int uiWorkLeft = uiDequeOutputCounter;
unsigned int uiThreadOffset = iTid;

//Debug
//int iRun = 0;
//

while(uiThreadOffset < uiDequeOutputCounter)
{
    //Debug
    /*if(iTid == 0 && iBid == 0)
    {
        printf("workLeft=%d bot=%d\n" , uiWorkLeft , aiDequesBottoms[iBid]);
    }*/
    //
        //Debug
        /*if(iBid == 0)
        {
            printf("tid=%d run=%d pushStartIdx.x=%d final=%d\n" , iTid , iRun , pushStartIdxAndSize.x ,
                iDequeSize*iBid + pushStartIdxAndSize.x +  iTid);
        }*/
        //

        //aDeques[iDequeSize*iBid + pushStartIdxAndSize.x +  iTid] = aOutputLocalWork[uiThreadOffset];

    atomicExch(&(aDeques[iDequeSize*iBid + pushStartIdxAndSize.x + uiThreadOffset].x) , aOutputLocalWork[uiThreadOffset].x);
    atomicExch(&(aDeques[iDequeSize*iBid + pushStartIdxAndSize.x + uiThreadOffset].y) , aOutputLocalWork[uiThreadOffset].y);

    uiThreadOffset += blockDim.x;

    //Debug
    //++iRun;
    //

    //__threadfence();
}

return true;
}
(0,0) before sync0
(0,95) before sync0
(0,0) after sync0
(0,95) after sync0
(0,95) before sync1
(0,0) run=1 work=(0,0)
(0,0) before sync1
(0,0) after sync1
(0,95) after sync1
(0,0) nChildren=3 prefixSum=0
(0,95) totalWork=3, prefix=3 + count=0
(0,0) children 0=(1,2)
(0,95) before sync2
(0,0) children 1=(1,1)
(0,0) children 2=(2,2)
(0,1) nprefixSumt2=3
(0,0) before sync2
(0,0) after sync2
(0,95) after sync2
(0,0) localnWork=3
(0,95) before push sync
(0,0) before push sync
(0,0) after push sync
(0,95) after push sync
(0,95) before sync4
(0,95) after sync4 //PROBLEM HERE! (0,95) SHOULD STAY WAITING AT SYNC4.
(0,0) bot=3

(0,0) before sync4
(0,95) before sync0 //NOW THREADS ARE SYNCING AT DIFFERENT POINTS.
(0,0) after sync4
(0,95) after sync0