Latency of shuffle instructions in CUDA


Regarding the latency of the __shfl() instruction: does the following instruction

c=__shfl(c, indi);

/*
where indi is any integer (it may be random, < 32),
and is different for different LaneIDs
*/

have the same properties as all the other warp shuffle instructions? In other words, do all warp shuffle instructions have the same latency?
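
For context, these are the warp shuffle variants the question refers to (pre-CUDA 9 signatures, matching the non-_sync intrinsics used in the code further down; the float overloads have the same shape). This is only a reference sketch of the documented declarations, not code from the question:

// Pre-CUDA 9 warp shuffle intrinsics (int overloads shown):
int __shfl     (int var, int srcLane,        int width = warpSize);  // read var from an arbitrary source lane
int __shfl_up  (int var, unsigned int delta, int width = warpSize);  // read var from laneId - delta
int __shfl_down(int var, unsigned int delta, int width = warpSize);  // read var from laneId + delta
int __shfl_xor (int var, int laneMask,       int width = warpSize);  // read var from laneId ^ laneMask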

To provide a "quantitative" follow-up to Robert's answer, let us consider the reduction approach based on CUDA shuffle operations detailed by M. Harris in his post on faster parallel reductions on Kepler. In this approach, the warp reduction is performed with __shfl_down. An alternative warp reduction uses __shfl_xor instead. Below, I report the full code implementing both approaches. When tested on a Kepler K20c, both take 0.044 ms to reduce an array of N = 200000 float elements. Accordingly, both approaches outperform Thrust's reduce by more than an order of magnitude, since the execution time for the Thrust case is 1.06 ms for the same test.
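
As a side note (my own sketch, not part of the original code), the __shfl_xor alternative corresponds to the commented-out line inside warpReduceSum below; written out as its own function it would look like this, the difference being that after the butterfly exchange every lane, not just lane 0, holds the full warp sum:

// Hypothetical standalone version of the __shfl_xor (butterfly) warp reduction.
__forceinline__ __device__ float warpReduceSumXor(float val) {
    // At each step, lane i exchanges with lane i ^ mask; after the loop
    // every lane of the warp holds the complete warp sum.
    for (int mask = 1; mask < warpSize; mask *= 2) val += __shfl_xor(val, mask);
    return val;
}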

Here is the full code:

#include <cstdio>

#include <thrust/device_vector.h>

#define warpSize 32

/***********************************************/
/* warpReduceSum PERFORMING REDUCTION PER WARP */
/***********************************************/
__forceinline__ __device__ float warpReduceSum(float val) {

    for (int offset = warpSize/2; offset > 0; offset /= 2) val += __shfl_down(val, offset);
    //for (int i=1; i<warpSize; i*=2) val += __shfl_xor(val, i);    
    return val;

}

/*************************************************/
/* blockReduceSum PERFORMING REDUCTION PER BLOCK */
/*************************************************/
__forceinline__ __device__ float blockReduceSum(float val) {

    // --- Shared memory holds the per-warp reduction results. Since the maximum number of threads per block is 1024,
    //     there are at most 32 warps per block.
    static __shared__ float shared[32]; 

    int lane    = threadIdx.x % warpSize;   // Thread index within the warp
    int wid     = threadIdx.x / warpSize;   // Warp ID

    // --- Perform the warp reduction. Afterwards, only lane 0 of each warp holds the warp reduction result in "val"
    val = warpReduceSum(val);     

    // --- Only the threads with 0 index within the warp write the warp result to shared memory
    if (lane==0) shared[wid]=val;   // Write reduced value to shared memory

    // --- Wait for all warp reductions
    __syncthreads();              

    // --- There will be at most 1024 threads within a block and at most 1024 blocks within a grid. The partial sum is read from shared memory only
    //     if the corresponding warp exists; otherwise the partial sum is set to zero.
    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;

    // --- The first warp performs the final partial warp summation. 
    if (wid==0) val = warpReduceSum(val); 

    return val;
}

/********************/
/* REDUCTION KERNEL */
/********************/
__global__ void deviceReduceKernel(float *in, float* out, int N) {

    float sum = 0.f;

    // --- Reduce multiple elements per thread.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) sum += in[i];

    sum = blockReduceSum(sum);

    if (threadIdx.x==0) out[blockIdx.x]=sum;
}

/********/
/* MAIN */
/********/
int main() {

    const int N = 200000;

    thrust::host_vector<float> h_out(N,0.f);

    thrust::device_vector<float> d_in(N,3.f);
    thrust::device_vector<float> d_out(N);

    int threads = 512;
    int blocks = min((N + threads - 1) / threads, 1024);

    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // --- Performs the block reduction. It returns an output vector containing the block reductions as elements
    cudaEventRecord(start, 0);
    deviceReduceKernel<<<blocks, threads>>>(thrust::raw_pointer_cast(d_in.data()), thrust::raw_pointer_cast(d_out.data()), N);
    // --- Performs a second block reduction with only one block. The input is an array of all 0's, except for the first "blocks" elements, which are
    //     the block reduction results of the previous step.
    deviceReduceKernel<<<1, 1024>>>(thrust::raw_pointer_cast(d_out.data()), thrust::raw_pointer_cast(d_out.data()), blocks);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Shuffle - elapsed time:  %3.5f ms \n", time);      
    h_out = d_out;

    cudaEventRecord(start, 0);
    float sum = thrust::reduce(d_in.begin(),d_in.end(),0.f,thrust::plus<float>());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("CUDA Thrust - elapsed time:  %3.5f ms \n", time);       

    printf("Shuffle result = %f\n",h_out[0]);
    printf("Thrust result = %f\n",sum);

    getchar();

}
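
Side note, not part of the original answer: on CUDA 9.0 and later the non-_sync shuffle intrinsics used above are deprecated. A minimal sketch of the same warp reduction with the _sync variant, assuming all 32 lanes of the warp participate, would be:

__forceinline__ __device__ float warpReduceSumSync(float val) {
    // 0xffffffff: full-warp mask, i.e. all 32 lanes take part in the shuffle.
    for (int offset = warpSize/2; offset > 0; offset /= 2) val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}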