CUDA kernel does not launch before cudaDeviceSynchronize

I am having a bit of a problem with CUDA. Please see the attached picture. The kernel is launched at the 0.395 second mark. Then there is some green CpuWork, and finally a call to cudaDeviceSynchronize. The kernel, although launched before CpuWork, does not start until the synchronize call. Ideally it would run in parallel with the CPU work.

void KdTreeGpu::traceRaysOnGpuAsync(int firstRayIndex, int numRays, int rank, int buffer)
{
    int per_block = 128;
    int num_blocks = numRays/per_block + (numRays%per_block==0?0:1);

    Ray* rays = &this->deviceRayPtr[firstRayIndex];
    int* outputHitPanelIds = &this->deviceHitPanelIdPtr[firstRayIndex];

    kdTreeTraversal<<<num_blocks, per_block, 0>>>(sceneBoundingBox, rays, deviceNodesPtr, deviceTrianglesListPtr, 
                                                firstRayIndex, numRays, rank, rootNodeIndex, 
                                                deviceTHitPtr, outputHitPanelIds, deviceReflectionPtr);

    CUDA_VALIDATE(cudaMemcpyAsync(resultHitDistances[buffer], deviceTHitPtr, numRays*sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_VALIDATE(cudaMemcpyAsync(resultHitPanelIds[buffer], outputHitPanelIds, numRays*sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_VALIDATE(cudaMemcpyAsync(resultReflections[buffer], deviceReflectionPtr, numRays*sizeof(Vector3), cudaMemcpyDeviceToHost));
}
I am hoping for a solution to this. I have already tried many things, including not running in the default stream. When I added cudaHostAlloc I realized that the asynchronous calls return control to the CPU; but that does not help when the kernel still does not launch until the later cudaDeviceSynchronize call.

resultThitInstances[2]
contains two allocated memory areas, so that while the CPU is reading from buffer 0, the GPU should put its results into buffer 1.
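(For reference, a minimal sketch of how the two pinned buffers might be allocated with cudaHostAlloc so that the async copies can actually overlap with CPU work. The buffer names and MAX_RAYS_PER_ITERATION are taken from the code in this question; the allocation itself is assumed, not the actual code.)

// Sketch: double-buffered, pinned host memory. Pinned memory is required
// for cudaMemcpyAsync to be truly asynchronous with respect to the host.
for (int b = 0; b < 2; b++)
{
    CUDA_VALIDATE(cudaHostAlloc((void**)&resultHitDistances[b], MAX_RAYS_PER_ITERATION*sizeof(double),  cudaHostAllocDefault));
    CUDA_VALIDATE(cudaHostAlloc((void**)&resultHitPanelIds[b],  MAX_RAYS_PER_ITERATION*sizeof(int),     cudaHostAllocDefault));
    CUDA_VALIDATE(cudaHostAlloc((void**)&resultReflections[b],  MAX_RAYS_PER_ITERATION*sizeof(Vector3), cudaHostAllocDefault));
}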

Thanks

EDIT: Here is the code that calls traceRaysAsync:

int numIterations = ceil(float(this->numPrimaryRays) / MAX_RAYS_PER_ITERATION);
int numRaysPrevious = min(MAX_RAYS_PER_ITERATION, this->numPrimaryRays);
nvtxRangePushA("traceRaysOnGpuAsync First");
traceRaysOnGpuAsync(0, numRaysPrevious, rank, 0);
nvtxRangePop();

for(int iteration = 0; iteration < numIterations; iteration++)
{

    int rayFrom = (iteration+1)*MAX_RAYS_PER_ITERATION;
    int rayTo = min((iteration+2)*MAX_RAYS_PER_ITERATION, this->numPrimaryRays) - 1;
    int numRaysIteration = rayTo-rayFrom+1;

    // Wait for results to finish and get them

    waitForGpu();
    // Trace the next iteration asynchronously. This will have data prepared for next iteration

    if(numRaysIteration > 0)
    {
        int nextBuffer = (iteration+1) % 2;
        nvtxRangePushA("traceRaysOnGpuAsync Interior");
        traceRaysOnGpuAsync(rayFrom, numRaysIteration, rank, nextBuffer);
        nvtxRangePop();
    }
    nvtxRangePushA("CpuWork");

    // Store results for current iteration

    int rayOffset = iteration*MAX_RAYS_PER_ITERATION;
    int buffer = iteration % 2;

    for(int i = 0; i < numRaysPrevious; i++)
    {
        if(this->activeRays[rayOffset+i] && resultHitPanelIds[buffer][i] >= 0)
        {
            this->activeRays[rayOffset+i] = false;
            const TrianglePanelPair & t = this->getTriangle(resultHitPanelIds[buffer][i]);
            double hitT = resultHitDistances[buffer][i];

            Vector3 reflectedDirection = resultReflections[buffer][i];

            Result res = Result(rays[rayOffset+i], hitT, t.panel);
            results[rank].push_back(res);
            t.panel->incrementIntensity(1.0);

            if (t.panel->getParent().absorbtion < 1)
            {
                numberOfRaysGenerated++;

                Ray reflected (res.endPoint() + 0.00001*reflectedDirection, reflectedDirection);

                this->newRays[rayOffset+i] = reflected;
                this->activeRays[rayOffset+i] = true;
                numNewRays++;

            }
        }
    }

    numRaysPrevious = numRaysIteration;

    nvtxRangePop();

}
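(waitForGpu() is not shown in the question; presumably it is a thin wrapper around cudaDeviceSynchronize(), something like the following sketch.)

// Assumed implementation: block the host until all queued GPU work
// (the kernel plus the three async copies) has completed.
void KdTreeGpu::waitForGpu()
{
    CUDA_VALIDATE(cudaDeviceSynchronize());
}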

This is expected behavior on Windows with the WDDM driver model, where the driver tries to mitigate kernel launch overhead by batching kernel launches. Try inserting
cudaStreamQuery(0)
directly after the kernel invocation to trigger an early launch of the kernel, before the batch is full.
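Applied to traceRaysOnGpuAsync from the question, the change would look like this (a sketch; only the kernel launch is shown):

kdTreeTraversal<<<num_blocks, per_block, 0>>>(sceneBoundingBox, rays, deviceNodesPtr, deviceTrianglesListPtr, 
                                            firstRayIndex, numRays, rank, rootNodeIndex, 
                                            deviceTHitPtr, outputHitPanelIds, deviceReflectionPtr);

// Flush the WDDM launch batch so the kernel is submitted to the GPU now,
// instead of when the batch fills up or the next synchronize forces it.
// The return value (cudaSuccess or cudaErrorNotReady) can be ignored here.
cudaStreamQuery(0);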

You are not showing the code after the KdTreeGpu::traceRaysOnGpuAsync call, but that could be useful, e.g. to see where and why the cudaDeviceSynchronize() call is used. I think you are issuing the device synchronize right after the call to KdTreeGpu::traceRaysOnGpuAsync, but that would eliminate the overlap. That is exactly the region where you want overlap: assuming the second green CpuWork bar does not depend on the results of kdTreeTraversal, you want to move or eliminate the device synchronize that occurs right after the kernel call, or move some of the CpuWork to before it.

I have added more code above, cleaned of some timers, so it should be easier to understand. The two buffers are supposed to make the CPU independent of the kernel launches.

To avoid the performance issues of the WDDM driver model altogether, consider switching to the TCC driver.

Inserted one cudaStreamQuery(0) after the kernel and one after the memcpys. With only the one after the kernel, the memcpys were delayed until the synchronize. Now it runs properly in parallel. Thanks!
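(For completeness, a sketch of the fixed traceRaysOnGpuAsync as described in that last comment, with one cudaStreamQuery(0) after the kernel and one after the async copies; everything else is unchanged from the question:)

void KdTreeGpu::traceRaysOnGpuAsync(int firstRayIndex, int numRays, int rank, int buffer)
{
    int per_block = 128;
    int num_blocks = numRays/per_block + (numRays%per_block==0?0:1);

    Ray* rays = &this->deviceRayPtr[firstRayIndex];
    int* outputHitPanelIds = &this->deviceHitPanelIdPtr[firstRayIndex];

    kdTreeTraversal<<<num_blocks, per_block, 0>>>(sceneBoundingBox, rays, deviceNodesPtr, deviceTrianglesListPtr, 
                                                firstRayIndex, numRays, rank, rootNodeIndex, 
                                                deviceTHitPtr, outputHitPanelIds, deviceReflectionPtr);
    cudaStreamQuery(0); // flush: launch the kernel immediately

    CUDA_VALIDATE(cudaMemcpyAsync(resultHitDistances[buffer], deviceTHitPtr, numRays*sizeof(double), cudaMemcpyDeviceToHost));
    CUDA_VALIDATE(cudaMemcpyAsync(resultHitPanelIds[buffer], outputHitPanelIds, numRays*sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_VALIDATE(cudaMemcpyAsync(resultReflections[buffer], deviceReflectionPtr, numRays*sizeof(Vector3), cudaMemcpyDeviceToHost));
    cudaStreamQuery(0); // flush: submit the copies before CpuWork starts on the host
}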