C++ 在CUDA 8+；中使用默认推力创建自展开CUDA n dim同类型元组；？_C++_Templates_Cuda_Metaprogramming_Thrust

C++ 在 CUDA 8+ 中使用默认的 Thrust 创建自展开的 CUDA n 维同类型元组？

c++ templates cuda

C++ 在CUDA 8+；中使用默认推力创建自展开CUDA n dim同类型元组；？,c++,templates,cuda,metaprogramming,thrust,C++,Templates,Cuda,Metaprogramming,Thrust,我试图展开一个函数的实现，以便在cuda中执行优化。基本上，我有一块共享内存，这本来会减慢我的代码速度，通过“展开”我的实现（减少总线程数，每个线程做两倍的工作），我能够获得可观的性能提升。我想看看我是否可以通过更多的展开来管理更多的性能提升，但是为了实现这一点，我大量使用了元组。我发现在这个过程中有很多代码重复，我想减少重复下面是我的代码中经常发生的一个例子： __device__ thrust::tuple<T,T,T,...> foo(thrust::tuple<G,G

我试图展开一个函数的实现，以便在cuda中执行优化。基本上，我有一块共享内存，这本来会减慢我的代码速度，通过“展开”我的实现（减少总线程数，每个线程做两倍的工作），我能够获得可观的性能提升。我想看看我是否可以通过更多的展开来管理更多的性能提升，但是为了实现这一点，我大量使用了元组。我发现在这个过程中有很多代码重复，我想减少重复

下面是我的代码中经常发生的一个例子：

// Illustrative sketch of the repetitive pattern (not compilable as written;
// the "..." stand for elided types and arguments): someoperation is applied
// to each element of choice_arg in turn and the results are packed back into
// a tuple that is returned.
__device__
thrust::tuple<T,T,T,...> foo(thrust::tuple<G,G,G..> choice_arg...){
    // Every statement below performs the same call; only the index passed to
    // thrust::get changes (the remaining, elided arguments are very similar).
    T value1 = someoperation(thrust::get<0>(choice_arg),...);
    T value2 = someoperation(thrust::get<1>(choice_arg),...);
    T value3 = someoperation(thrust::get<2>(choice_arg),...);
    ...
    return thrust::make_tuple(value1, value2, value3,...);
}

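__device__
thrust::tuple<T,T,T,...> foo(thrust::tuple<G,G,G..> choice_arg...){
    //all do the same thing, with very similar args as well.
    T value1 = someoperation(thrust::get<0>(choice_arg),...);
    T value2 = someoperation(thrust::get<1>(choice_arg),...);
    T value3 = someoperation(thrust::get<2>(choice_arg),...);
    ...
    return thrust::make_tuple(value1, value2, value3,...);
}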

我不想到处都写这些样板代码，只想要一个像这样的函数：

// Desired form (sketch, not compilable as written): one templated call to
// someoperation replaces the per-element statements.
// NOTE(review): CHOICE_ARG_LENGTH is presumably the number of elements in
// choice_arg - confirm against the real code.
__device__
thrust::tuple<T,T,T,...> foo(thrust::tuple<G,G,G..> choice_arg, ...){
    return someoperation<CHOICE_ARG_LENGTH>(choice_arg,...);
}

__device__
thrust::tuple<T,T,T,...> foo(thrust::tuple<G,G,G..> choice_arg, ...){
    return someoperation<CHOICE_ARG_LENGTH>(choice_arg,...);
}

我见过类似的做法是如何实现的，但是如果我需要返回

thrust::tuple

，那么普通的模板循环就行不通了。如果 Thrust 提供了

thrust::tuple_cat

的话，这个方案就可行了；但是他们还没有合并可变参数模板元组——尽管相关工作早在 2014 年就已完成，我甚至找不到任何关于合并 tuple_cat 实现的讨论！那么，有没有可能在不借助 thrust::tuple_cat 实现的情况下，在 GPU 上得到我想要的行为呢？

请注意，我不能用数组来实现这一点：我最初用的就是数组，改用元组之后，我发现白白获得了 15% 的速度提升，这在 Visual Profiler 和我的算法的实际应用中都能看到。这段代码对性能要求极高。

如果您可以使用CUDA 9和c++14，您可以执行以下操作，有关详细信息，请参见例如

这对于 C++14 来说是有意义的，但我目前还在使用 CUDA 8，在我能够安全地迁移到 CUDA 9 之前还需要一段时间。我甚至没有意识到尾随返回类型可以在 lambda 之外使用，我不确定它在这里该如何应用。另外，我看了其他人是如何实现的，但那看起来简直不像是同一种语言，我不知道他们是如何做到的，例如，为什么他们到处都在使用 sizeof，或者为什么定义了 size() 却从未使用过。好的，没问题。我将添加一个 C++11 版本，并附上一些解释。我升级到了 CUDA 9.1，使用了您的第一个版本，它工作正常。您提到使用数组速度较慢。我想您说的是在数组上进行运行时循环？因为在您的示例中，用数组替换元组在性能上应该是相同的。不是的，为我的应用程序静态分配的数组生成了不同的代码，而且更慢。这对我来说毫无道理，但基准测试不会说谎，我用我的实际应用程序和 Visual Profiler 确认过。是在展开的版本中吗？@havget 我先做了展开，获得了性能提升，然后把展开改成基于元组的形式，在没有任何其他更改的情况下，又得到了一次性能提升。我明白了……对于展开的元组，您试过 C++11 版本吗？

#include <iostream>
#include <utility>
#include <thrust/tuple.h>

// Placeholder for the real per-element work: yields the argument plus one,
// converted back to the argument's own type T.
template <typename T>
__device__ T some_operation(T a) {
  T incremented = a + 1;
  return incremented;
}

// Expands the index pack so that some_operation is invoked once per tuple
// element (thrust::get<0>(t), thrust::get<1>(t), ...) and gathers the results
// into a new tuple via thrust::make_tuple. The index_sequence argument carries
// no data; it exists only so the indices can be deduced.
template <typename Tuple, std::size_t... Is>
__device__ auto foo_impl(const Tuple& t, std::index_sequence<Is...>) {
  auto transformed =
      thrust::make_tuple(some_operation(thrust::get<Is>(t))...);
  return transformed;
}

// Entry point: applies some_operation to every element of the tuple `t` by
// building the index sequence 0 .. tuple_size-1 and forwarding to foo_impl.
template <typename Tuple>
__device__ auto foo(const Tuple& t) {
  using Indices =
      std::make_index_sequence<thrust::tuple_size<Tuple>::value>;
  return foo_impl(t, Indices());
}

// Demo kernel: runs foo on the tuple (3., 2, 7) and prints the three resulting
// elements with device-side printf. Takes no arguments and does not read any
// thread or block index, so every launched thread prints the same line.
__global__ void test_kernel() {
  const auto input = thrust::make_tuple(3., 2, 7);
  const auto output = foo(input);
  printf("%f, %d, %d\n",
         thrust::get<0>(output),
         thrust::get<1>(output),
         thrust::get<2>(output));
}

// Host entry point: launches test_kernel with one block of one thread and
// waits for it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution reported a
// CUDA error (the error string is written to std::cerr).
int main() {
  test_kernel<<<1, 1>>>();

  // A kernel launch has no return value; launch failures are only visible
  // through cudaGetLastError().
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    // Blocks until the kernel has finished; errors raised while the kernel was
    // running are reported here.
    status = cudaDeviceSynchronize();
  }

  if (status != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
    return 1;
  }
  return 0;
}

#include <iostream>
#include <thrust/tuple.h>

// Minimal C++11 stand-in for std::index_sequence / std::make_index_sequence,
// which the standard library only provides from C++14 onwards.
namespace compat {
// Empty tag type whose template arguments are a compile-time list of indices.
template <std::size_t... Indices>
struct index_sequence {};

namespace detail {
// Builds index_sequence<0, 1, ..., N-1>. The second parameter accumulates the
// indices produced so far and starts out empty.
template <std::size_t N, typename Seq = index_sequence<>>
struct make_index_sequence_impl;

// Recursive step: prepend N - 1 to the accumulated indices and continue with
// N - 1.
template <std::size_t N, std::size_t... Indices>
struct make_index_sequence_impl<N, index_sequence<Indices...>> {
  using type = typename make_index_sequence_impl<
      N - 1, index_sequence<N - 1, Indices...>>::type;
};

// Base case at N == 0: the accumulated indices are the result. Terminating at
// 0 (rather than 1) makes make_index_sequence<0> yield the empty sequence
// instead of recursing forever through unsigned wrap-around of N - 1.
template <std::size_t... Indices>
struct make_index_sequence_impl<0, index_sequence<Indices...>> {
  using type = index_sequence<Indices...>;
};
}  // namespace detail

// Alias for index_sequence<0, 1, ..., N-1>; index_sequence<> when N == 0.
template <std::size_t N>
using make_index_sequence = typename detail::make_index_sequence_impl<N>::type;
}  // namespace compat

// Placeholder for the real per-element work: yields the argument plus one,
// converted back to the argument's own type T.
template <typename T>
__device__ T some_operation(T a) {
  T incremented = a + 1;
  return incremented;
}

// C++11 variant: expands the index pack so that some_operation is invoked once
// per tuple element and gathers the results with thrust::make_tuple. The
// trailing return type repeats the body's expression because return type
// deduction for ordinary functions is not available before C++14.
template <typename Tuple, std::size_t... Is>
__device__ auto foo_impl(const Tuple& t, compat::index_sequence<Is...>)
    -> decltype(thrust::make_tuple(some_operation(thrust::get<Is>(t))...)) {
  auto transformed =
      thrust::make_tuple(some_operation(thrust::get<Is>(t))...);
  return transformed;
}

// C++11 entry point: applies some_operation to every element of the tuple `t`
// by building the index sequence 0 .. tuple_size-1 with the compat helpers and
// forwarding to foo_impl. The return type is spelled out with decltype since
// the function body's alias is not visible in the declaration.
template <typename Tuple>
__device__ auto foo(const Tuple& t) -> decltype(foo_impl(
    t, compat::make_index_sequence<thrust::tuple_size<Tuple>::value>())) {
  using Indices =
      compat::make_index_sequence<thrust::tuple_size<Tuple>::value>;
  return foo_impl(t, Indices());
}

// Demo kernel: runs foo on the tuple (3., 2, 7) and prints the three resulting
// elements with device-side printf. Takes no arguments and does not read any
// thread or block index, so every launched thread prints the same line.
__global__ void test_kernel() {
  const auto input = thrust::make_tuple(3., 2, 7);
  const auto output = foo(input);
  printf("%f, %d, %d\n",
         thrust::get<0>(output),
         thrust::get<1>(output),
         thrust::get<2>(output));
}

// Host entry point: launches test_kernel with one block of one thread and
// waits for it to finish.
// Returns 0 on success, 1 if the launch or the kernel execution reported a
// CUDA error (the error string is written to std::cerr).
int main() {
  test_kernel<<<1, 1>>>();

  // A kernel launch has no return value; launch failures are only visible
  // through cudaGetLastError().
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess) {
    // Blocks until the kernel has finished; errors raised while the kernel was
    // running are reported here.
    status = cudaDeviceSynchronize();
  }

  if (status != cudaSuccess) {
    std::cerr << "CUDA error: " << cudaGetErrorString(status) << std::endl;
    return 1;
  }
  return 0;
}