CUDA cub::DeviceScan 和 temp_storage_bytes 参数_Cuda_Cub

CUDA cub::DeviceScan 和 temp_storage_bytes 参数

cuda

CUDA cub::DeviceScan 和 temp_storage_bytes 参数,cuda,cub,Cuda,Cub,我使用的是 cub::DeviceScan 函数，它有一个参数 temp_storage_bytes，它使用该参数分配内存（顺便说一句，代码段永远不会释放该内存）。代码段先使用指向 NULL 内存的指针调用 cub::DeviceScan 函数，这会触发它计算函数所需的临时设备内存量，然后返回。接着使用 cudaMalloc 分配必要的临时内存，并指向该内存重复函数调用。然后使用 cudaFree 释放临时内存（或者说本应释放）。我在不同的浮点数组上重复了很多次设备扫描，但每个浮点数组的长度都是相同的。我的问题是

我使用的是

cub::DeviceScan

函数，它有一个参数

temp_storage_bytes

，它使用该参数分配内存（顺便说一句，代码段永远不会释放该内存）。

代码段先使用指向

NULL

内存的指针调用

cub::DeviceScan

函数，这会触发它计算函数所需的临时设备内存量，然后返回。接着使用

cudaMalloc

分配必要的临时内存，并指向该内存重复函数调用。然后使用

cudaFree

释放临时内存（或者说本应释放）。

我在不同的浮点数组上重复了很多次设备扫描，但每个浮点数组的长度都是相同的

我的问题是，我可以假设

temp_storage_bytes

总是相同的值吗？如果是这样，我就可以对许多函数调用只执行一次

cudaMalloc

和一次

cudaFree

该示例不清楚所需内存是如何确定的，以及对于给定长度的给定数组，它是否可以更改。

您可以假设只需调用一次

cub::DeviceScan::InclusiveScan

即可确定所需的临时存储字节数（temp_storage_bytes），前提是重复调用

cub::DeviceScan::InclusiveScan

时扫描的是相同长度的不同数组。在下面的示例中，我在相同长度的不同数组上多次调用

cub::DeviceScan::InclusiveScan

，并且只使用一次对

cub::DeviceScan::InclusiveScan

的调用来确定临时存储的大小：

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <algorithm> // std::generate
#include <cstdlib>   // rand, abs, exit, EXIT_FAILURE

#include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
#include <thrust\device_vector.h>
#include <thrust\host_vector.h>

// Abort with a readable message when a CUDA runtime or CUB call fails.
// cudaMalloc, cudaFree and cub::DeviceScan::InclusiveSum all return
// cudaError_t, so a single macro covers every call made below.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t status_ = (call);                                      \
        if (status_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(status_));                          \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Returns a pseudo-random value in [0, 1000). Bounding the value keeps the
// running sum of a handful of elements far away from signed-int overflow,
// which raw rand() (up to RAND_MAX) does not guarantee.
static int boundedRand(void)
{
    return rand() % 1000;
}

// Demonstrates that the CUB temporary-storage size can be queried once and the
// same scratch buffer reused for repeated inclusive prefix sums, as long as
// every call scans the same number of items of the same type.
//
// Runs 10 rounds: fills the input with random values, scans it on the device
// with cub::DeviceScan::InclusiveSum, recomputes the prefix sum on the host,
// and prints a per-element comparison followed by a pass/fail line.
//
// Returns 0 on completion; terminates with EXIT_FAILURE if a CUDA/CUB call
// reports an error.
int main(void)
{
    // Number of elements in every scanned array; identical for all rounds.
    const int num_items = 7;

    thrust::device_vector<int> d_in(num_items);
    thrust::device_vector<int> d_out(num_items);

    // Size query: with a NULL temp-storage pointer the call only writes the
    // required byte count into temp_storage_bytes.
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes,
                                             thrust::raw_pointer_cast(d_in.data()),
                                             thrust::raw_pointer_cast(d_out.data()),
                                             num_items));

    // Single allocation of the scratch buffer, reused by every round below.
    CUDA_CHECK(cudaMalloc(&d_temp_storage, temp_storage_bytes));

    // Host-side buffers are allocated once and overwritten each round.
    thrust::host_vector<int> h_in(num_items);
    thrust::host_vector<int> h_out(num_items, 0);
    thrust::host_vector<int> h_result(num_items);

    for (int k = 0; k < 10; k++) {

        std::generate(h_in.begin(), h_in.end(), boundedRand);
        d_in = h_in;

        // Run inclusive prefix sum with the pre-allocated scratch buffer.
        // Raw pointers are taken after the assignment above so they always
        // refer to the vectors' current storage.
        CUDA_CHECK(cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes,
                                                 thrust::raw_pointer_cast(d_in.data()),
                                                 thrust::raw_pointer_cast(d_out.data()),
                                                 num_items));

        // Bring the whole result back in one copy instead of reading d_out[i]
        // element by element; this also gives printf plain ints to print.
        h_result = d_out;

        int difference = 0;   // accumulated |host - device| over the array
        int prev = 0;         // running host-side prefix sum
        for (int i = 0; i < num_items; i++) {
            h_out[i] = prev + h_in[i];
            prev = h_out[i];
            printf("%i %i %i %i\n", i, difference, h_out[i], h_result[i]);
            difference = difference + abs(h_out[i] - h_result[i]);
        }

        if (difference == 0) printf("Test passed!\n");
        else printf("A problem occurred!\n");

    }

    // Release the scratch buffer exactly once, after the last scan.
    CUDA_CHECK(cudaFree(d_temp_storage));

    // Wait for a character on stdin before exiting (kept from the original).
    getchar();

    return 0;
}

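// 确保将 CUDA 运行时错误打印到控制台
#define CUB_STDERR
#include <stdio.h>
#include <algorithm> // std::generate
#include <cub/cub.cuh> // 或等价地使用 <cub/device/device_scan.cuh>
#include <thrust\device_vector.h>
#include <thrust\host_vector.h>
void main(void)
{
// 为输入和输出声明、分配并初始化设备指针
int num_items = 7;
thrust::device_vector<int> d_in(num_items);
thrust::device_vector<int> d_out(num_items);
// 确定 inclusive 前缀和所需的临时设备存储大小
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in.data(), d_out.data(), num_items);
// 为 inclusive 前缀和分配临时存储
cudaMalloc(&d_temp_storage, temp_storage_bytes);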
for (int k=0; k<…（此处代码清单被截断）

我会说是的。否则这就意味着 temp_storage_bytes
取决于数组元素的值？另外，为什么每次循环都要释放 d_temp_storage？