CUDA 归约(reduction)求最小值及其索引

标签:cuda, nvidia, reduction

我使用 CUDA 8,按照一篇很好的讲解文章实现并修改了一个最小值归约(min reduce):

// Warp-level min reduction: after the loop, lane 0 of the calling warp
// holds the minimum of `val` across all 32 lanes (other lanes hold
// partial results). Assumes the full warp is active.
__inline__ __device__ int warpReduceMin(int val) 
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        // Use the synchronizing shuffle with an explicit full-warp mask:
        // the legacy mask-less __shfl_down was deprecated in CUDA 9 and is
        // removed on Volta+ (independent thread scheduling breaks the old
        // implicit-lockstep assumption).
        int tmpVal = __shfl_down_sync(0xffffffff, val, offset);
        if (tmpVal < val)
        {
            val = tmpVal;
        }
    }
    return val;
}

// Block-level min reduction: after the call, thread 0 of the block holds
// the minimum of `val` across all threads of the block. Must be called by
// every thread of the block (contains __syncthreads()).
__inline__ __device__ int blockReduceMin(int val) 
{
    static __shared__ int shared[32]; // one partial min per warp (<=1024/32 warps)
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    val = warpReduceMin(val);     // each warp reduces its own 32 values

    if (lane == 0)
    {
        shared[wid] = val;        // lane 0 publishes its warp's partial min
    }

    __syncthreads();              // all partials must be visible before reading

    // Count warps rounded UP. The original floor division
    // (blockDim.x / warpSize) silently dropped the partial min of the last,
    // incomplete warp whenever blockDim.x was not a multiple of warpSize.
    int nWarps = (blockDim.x + warpSize - 1) / warpSize;

    // Only lanes that correspond to an existing warp read a partial;
    // the rest feed the identity element (INT_MAX) into the final pass.
    val = (threadIdx.x < nWarps) ? shared[lane] : INT_MAX;

    if (wid == 0)
    {
        val = warpReduceMin(val); // final reduction within the first warp
    }

    return val;
}

// Grid-stride min-reduction kernel: each thread scans its strided slice of
// `in`, the block reduces the per-thread minima via blockReduceMin, and
// thread 0 of every block folds the block's result into *out with a single
// atomicMin. *out must be initialized to INT_MAX before launch.
__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    const int stride = blockDim.x * gridDim.x;
    int localMin = INT_MAX;

    // Grid-stride loop: correct for any grid size relative to N.
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride)
    {
        const int candidate = in[idx];
        if (candidate < localMin)
        {
            localMin = candidate;
        }
    }

    localMin = blockReduceMin(localMin);

    // One atomic per block keeps contention on *out negligible.
    if (threadIdx.x == 0)
    {
        atomicMin(out, localMin);
    }
}
(此处原为上文 warpReduceMin 代码的乱码重复,系网页抽取产生的残留,已清理。)
它工作得很好,我能正确得到最小值。但是,我需要的不是最小值本身,而是它在原始输入数组中的索引。

我试着修改一下我的代码

// NOTE(review): this is the asker's broken attempt, kept as-is to match the
// discussion below. The flaw: the index is never shuffled between lanes.
// When lane L receives a smaller value from lane L+offset, that value may
// itself have been propagated into lane L+offset during an EARLIER iteration
// from some other lane — so reconstructing the source as
// (global thread id + offset) only names the immediate neighbor, not the
// element that actually produced the minimum. The index must travel with the
// value via a second shuffle (see the corrected version further down).
__inline__ __device__ int warpReduceMin(int val, int* idx) // Adding output idx
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
    {
        // Legacy mask-less shuffle (CUDA 8 era); removed on Volta+.
        int tmpVal = __shfl_down(val, offset);
        if (tmpVal < val)
        {
            // BUG: guesses the winning element's position instead of
            // shuffling the true index alongside the value.
            *idx = blockIdx.x * blockDim.x + threadIdx.x + offset; // I guess I'm missing something here
            val = tmpVal;
        }
    }
    return val;
}

...
blockReduceMin stayed the same only adding idx to function calls
...

// NOTE(review): asker's broken attempt, kept as-is for the discussion below.
// Besides the index-propagation bug in warpReduceMin, the publication step
// here is racy: atomicMin(out, …) and atomicExch(out + 1, …) are two
// independent atomics with no mutual synchronization. Between a block's
// atomicMin and its atomicExch, another block may update *out, leaving
// out[0] (the min) and out[1] (the index) from DIFFERENT blocks. The
// `old != minVal` test is also wrong: atomicMin returns the PREVIOUS value,
// so this condition does not tell whether this block's write won.
__global__ void deviceReduceBlockAtomicKernel(int *in, int* out, int N) {
    int minVal = INT_MAX;
    int minIdx = 0; // Added this
    // Grid-stride scan tracking both the minimum and its array index.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
        i < N;
        i += blockDim.x * gridDim.x) 
    {
        if (in[i] < minVal)
        {
            minVal = in[i];
            minIdx = i; // Added this
        }
    }
    minVal = blockReduceMin(minVal, &minIdx);
    if (threadIdx.x == 0)
    {
        // RACE: the value/index pair is not published atomically as a unit.
        int old = atomicMin(out, minVal);
        if (old != minVal) // value was updated
        {
            atomicExch(out + 1, minIdx);
        }
    }
}
(此处原为上文代码的乱码重复,系网页抽取产生的残留,已清理。)

但它不起作用。我觉得我遗漏了一些重要的东西,而且这不是解决问题的方法,但我的搜索没有结果。

这里有几个问题。您需要修改 warpReduceMin(warp 归约)和 blockReduceMin(块归约)函数,使它们在每次找到新的局部最小值时同时传播最小值及其索引。也许是这样的:

// Warp-level argmin: reduces (val, idx) pairs across the 32 lanes of the
// calling warp. After the call, lane 0 holds the minimum value and the
// original index it came from. Assumes the full warp is active.
__inline__ __device__ void warpReduceMin(int& val, int& idx)
{
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        // Shuffle the value AND its index with matching offsets so the index
        // always travels with the value it belongs to. _sync variants with an
        // explicit full mask are required on Volta+ (legacy __shfl_down is
        // removed there).
        int tmpVal = __shfl_down_sync(0xffffffff, val, offset);
        int tmpIdx = __shfl_down_sync(0xffffffff, idx, offset);
        if (tmpVal < val) {
            val = tmpVal;
            idx = tmpIdx;
        }
    }
}

// Block-level argmin: after the call, thread 0 of the block holds the block
// minimum in `val` and its original index in `idx`. Must be called by every
// thread of the block (contains __syncthreads()).
__inline__ __device__  void blockReduceMin(int& val, int& idx) 
{
    static __shared__ int values[32], indices[32]; // one (min, idx) pair per warp
    int lane = threadIdx.x % warpSize;
    int wid = threadIdx.x / warpSize;

    warpReduceMin(val, idx);     // each warp performs its partial argmin

    if (lane == 0) {
        values[wid] = val;   // lane 0 publishes its warp's partial min...
        indices[wid] = idx;  // ...and the index that produced it
    }

    __syncthreads();             // all partials must be visible before reading

    // Count warps rounded UP: floor division (blockDim.x / warpSize) would
    // drop the partial result of the last, incomplete warp whenever
    // blockDim.x is not a multiple of warpSize.
    int nWarps = (blockDim.x + warpSize - 1) / warpSize;
    if (threadIdx.x < nWarps) {
        val = values[lane];
        idx = indices[lane];
    } else {
        val = INT_MAX;  // identity element for min
        idx = 0;
    }

    if (wid == 0) {
         warpReduceMin(val, idx); // final argmin within the first warp
    }
}
其次,原子操作那部分也是错误的:无法保证这段代码能同时正确地写入最小值及其索引。因为两个原子操作之间没有任何同步保证,存在一个潜在的竞争:一个块可能正确地用更小的值覆盖了另一个块写入的最小值,但它对应的索引随后又被落败的那个块覆盖。这里唯一的解决方案是某种临界区(互斥),或者对每个块的部分结果再运行第二个归约内核。

出问题的正是你代码中的这一段:
int old = atomicMin(out, minVal);
if (old != minVal) // value was updated
{
    atomicExch(out + 1, minIdx);
}