C++ 大步与洗牌减少_C++_Cuda - Fatal编程技术网

C++ 跨步（strided）归约与洗牌（shuffle）归约

c++ cuda

C++ 跨步（strided）归约与洗牌（shuffle）归约,c++,cuda,C++,Cuda,我最近观看了CppCon上关于使用Clang编译CUDA代码的演讲，演讲者在简单介绍了架构之后实现了一个求和归约。我对他采用的方法很感兴趣，这种方法通过shfl指令对块中的元素进行归约。由于没有现成的可运行示例，我对他的代码稍作修改，实现了求最大值的归约。问题是，在2^22个元素中查找最大值时，这个最大值归约比CPU实现慢得多：我测得的时间约为90毫秒，而CPU约为20毫秒。以下是shfl归约的代码 #include <vector> #include <cuda.

我最近观看了CppCon上关于使用Clang编译CUDA代码的演讲，演讲者在简单介绍了架构之后实现了一个求和归约。我对他采用的方法很感兴趣，这种方法通过shfl指令对块中的元素进行归约。由于没有现成的可运行示例，我对他的代码稍作修改，实现了求最大值的归约。

问题是，在2^22个元素中查找最大值时，这个最大值归约比CPU实现慢得多：我测得的时间约为90毫秒，而CPU约为20毫秒。以下是shfl归约的代码：

#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

using namespace std;

// Max reduction over `in[0..N)` using warp shuffles, accumulated into `*out`.
//
// Each thread scans up to 4 consecutive elements starting at
// (global thread index * 4). The 32 lanes of each warp then combine their
// partial maxima with __shfl_down_sync, lane 0 of every warp folds its result
// into a block-shared value with atomicMax, and thread 0 folds that value
// into `*out` with atomicMax.
//
// Parameters:
//   in  - device pointer to N ints (read through __ldg).
//   out - device pointer to a single int; it is only ever updated with
//         atomicMax, so the caller must initialize it before the launch.
//   N   - number of elements in `in`.
//
// Launch requirements:
//   - Only the .x components of the thread/block indices are used.
//   - gridDim.x * blockDim.x * 4 must be >= N, otherwise the tail is skipped.
//   - blockDim.x should be a multiple of 32: the shuffles use a full-warp
//     mask, so every lane of every warp has to be present.
//
// Limitation: 0 is used as the identity for both the per-thread and the
// per-block accumulator, so the result is only meaningful when the true
// maximum is >= 0.
__global__ void d_max_reduce(const int *in, int *out, size_t N) {
    int local_max = 0;

    // Widen to size_t *before* multiplying so the element index cannot wrap
    // around in 32-bit arithmetic for large grids.
    const size_t start =
        (threadIdx.x + static_cast<size_t>(blockIdx.x) * blockDim.x) * 4;
    for (size_t i = start; i < start + 4 && i < N; i++) {
        local_max = max(__ldg(in + i), local_max);
    }

    // Warp-level tree reduction. The *_sync variant with an explicit mask is
    // required on Volta and newer; the legacy mask-less __shfl_down is gone.
    for (int offset = 16; offset; offset >>= 1) {
        local_max = max(__shfl_down_sync(0xffffffffu, local_max, offset),
                        local_max);
    }

    // One thread initializes the block accumulator; the barrier publishes it
    // before any warp leader touches it.
    __shared__ int shared_max;
    if (threadIdx.x == 0) {
        shared_max = 0;
    }
    __syncthreads();

    // Lane 0 of each warp holds that warp's maximum after the shuffle loop.
    if ((threadIdx.x % 32) == 0) {
        atomicMax(&shared_max, local_max);
    }

    __syncthreads();

    // A single atomic per block into the global result.
    if (threadIdx.x == 0) {
        atomicMax(out, shared_max);
    }
}

// Host driver for d_max_reduce: copies `v` to the device, launches the
// shuffle-based reduction with 256 threads per block (4 elements per thread),
// and returns the value left in the device accumulator.
//
// Parameters:
//   v - input values; not modified.
// Returns:
//   The reduced maximum. The device accumulator is zero-initialized, so the
//   returned value is never below 0. Returns 0 for an empty vector.
//
// Note: CUDA API return codes are not checked here; if a call fails the
// function still returns (0 when the final copy back did not happen).
int test_max_reduce(std::vector<int> &v) {
    // A zero-block launch is an invalid configuration; there is nothing to
    // reduce, so report the accumulator's initial value directly.
    if (v.empty()) {
        return 0;
    }

    const size_t count = v.size();
    const size_t bytes = count * sizeof(int);

    int *in, *out;

    cudaMalloc(&in, bytes);
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), bytes, cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));

    const unsigned int threads = 256;
    // Integer ceil-division: going through float loses precision once the
    // element count exceeds 2^24 and could launch too few blocks.
    const size_t per_block = static_cast<size_t>(threads) * 4;
    const unsigned int blocks =
        static_cast<unsigned int>((count + per_block - 1) / per_block);
    d_max_reduce<<<blocks, threads>>>(in, out, count);

    // Blocking copy: also waits for the kernel to finish.
    int res = 0;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);
    return res;
}

#include <vector>

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_functions.h>
#include <cuda_runtime_api.h>

// Max reduction over `in[0..N)` using a shared-memory tree, accumulated into
// `*out`.
//
// Each thread loads one element (or 0 when its index is past N) into dynamic
// shared memory, the block folds the array in halves until one value remains,
// and thread 0 merges that value into `*out` with atomicMax.
//
// Parameters:
//   in  - device pointer to N ints.
//   out - device pointer to a single int; it is only ever updated with
//         atomicMax, so the caller must initialize it before the launch.
//   N   - number of elements in `in`.
//
// Launch requirements:
//   - Only the .x components of the thread/block indices are used.
//   - gridDim.x * blockDim.x must be >= N, otherwise the tail is skipped.
//   - Dynamic shared memory: blockDim.x * sizeof(int) bytes.
//
// Limitation: out-of-range slots are padded with 0, so the result is only
// meaningful when the true maximum is >= 0.
__global__ void d_max_reduction(const int *in, int *out, size_t N) {
    extern __shared__ int s_data[];

    const unsigned int tid = threadIdx.x;
    // Widen before multiplying so the global index cannot wrap in 32 bits.
    const size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + tid;
    s_data[tid] = (i < N) ? in[i] : 0;
    __syncthreads();

    // Fold the upper part of the live range onto the lower part. Rounding the
    // half up keeps the middle element alive when the live range is odd, so
    // block sizes that are not powers of two lose no elements; for powers of
    // two this pairs exactly tid with tid + active / 2.
    for (unsigned int active = blockDim.x; active > 1;) {
        const unsigned int half = (active + 1) / 2;
        if (tid + half < active)
            s_data[tid] = max(s_data[tid], s_data[tid + half]);
        // Loop bounds depend only on blockDim.x, so every thread reaches
        // this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    if (tid == 0)
        atomicMax(out, s_data[0]);
}

// Host driver for d_max_reduction: copies `v` to the device, launches the
// shared-memory reduction with 128 threads per block (one element per
// thread), and returns the value left in the device accumulator.
//
// Parameters:
//   v - input values; not modified.
// Returns:
//   The reduced maximum. The device accumulator is zero-initialized, so the
//   returned value is never below 0. Returns 0 for an empty vector.
//
// Note: CUDA API return codes are not checked here; if a call fails the
// function still returns (0 when the final copy back did not happen).
int test_max_reduction(std::vector<int> &v) {
    // A zero-block launch is an invalid configuration; there is nothing to
    // reduce, so report the accumulator's initial value directly.
    if (v.empty()) {
        return 0;
    }

    const size_t count = v.size();
    const size_t bytes = count * sizeof(int);

    int *in;
    int *out;

    cudaMalloc(&in, bytes);
    cudaMalloc(&out, sizeof(int));
    cudaMemcpy(in, v.data(), bytes, cudaMemcpyHostToDevice);
    cudaMemset(out, 0, sizeof(int));

    const unsigned int threads = 128;
    // Integer ceil-division: going through float loses precision once the
    // element count exceeds 2^24 and could launch too few blocks.
    const unsigned int blocks =
        static_cast<unsigned int>((count + threads - 1) / threads);

    // Third launch argument sizes the kernel's dynamic shared array:
    // one int per thread.
    d_max_reduction<<<blocks,
                      threads,
                      threads * sizeof(int)>>>(in, out, count);

    // Blocking copy: also waits for the kernel to finish.
    int res = 0;
    cudaMemcpy(&res, out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(in);
    cudaFree(out);

    return res;
}

编辑：预热之后的新计时结果

GPU (shfl) Test 
 Max Value : 16000000
 Time      : 4
GPU strided Test 
 Max Value : 16000000
 Time      : 6
CPU Test 
 Max Value : 16000000
 Time      : 23

因此，我更一般的问题是：为什么shfl版本比跨步（strided）版本慢？这个问题可以分为以下两点：

我是否在内核启动参数中遗漏了什么、做错了什么，或者做出了错误的假设？

什么时候建议使用shfl内建函数而不是跨步循环，反之又是什么时候？

评论：我认为您的第一次计时测量中包含了CUDA的启动（初始化）开销，您没有遵循良好的基准测试方法。请先对全部3种方法进行预热，然后重新运行每种方法并在那时计时。

评论：@RobertCrovella 我测试过了，您完全正确，谢谢！

评论：如果你愿意，可以自己回答这个问题。合理的答案我会投赞成票。

评论：@RobertCrovella 我更愿意接受你的答案 :)

评论：请注意，这种归约的良好实现在性能上纯粹受内存带宽限制（除了开始和结束时的一些固定开销）。因此，应当使用所支持的最大宽度（int4）的合并内存读取来最大化内存吞吐量，并把归约循环扩展到这些读取之上，这样对于大型数组而言，所有线程之间的最终归约开销可以忽略不计。

#ifndef TIMER_HPP
#define TIMER_HPP

#include <chrono>
#include <string>
#include <iostream>

// Runs `func(args...)` once, timing the call with a steady clock, and prints
// a three-line report to std::cout: the label followed by " Test ", the
// integer result, and the elapsed wall time in whole milliseconds.
//
// Parameters:
//   msg  - label printed on the first report line.
//   func - callable whose result is convertible to int.
//   args - arguments perfectly forwarded to `func`.
template <typename F, typename ...Args>
void measure(std::string msg, F func, Args&&... args) {
    using clock = std::chrono::steady_clock;

    const clock::time_point begin = clock::now();
    int val = func(std::forward<Args>(args)...);
    const clock::time_point finish = clock::now();

    // Truncate the elapsed interval to whole milliseconds for the report.
    const auto elapsed_ms =
        std::chrono::duration_cast<std::chrono::milliseconds>(finish - begin)
            .count();

    std::cout << msg << " Test " << std::endl;
    std::cout << " Max Value : " << val << std::endl;
    std::cout << " Time      : " << elapsed_ms << std::endl;
}

#endif // TIMER_HPP

GPU (shfl) Test 
 Max Value : 15999999
 Time      : 86
GPU strided Test 
 Max Value : 15999999
 Time      : 7
CPU Test 
 Max Value : 15999999
 Time      : 23

GPU (shfl) Test 
 Max Value : 16000000
 Time      : 4
GPU strided Test 
 Max Value : 16000000
 Time      : 6
CPU Test 
 Max Value : 16000000
 Time      : 23