CUDA中的动态分配数组拷贝

CUDA中的动态分配数组拷贝,cuda,Cuda,我们有什么方法可以在CUDA内核中复制数组吗？例如： __device__ int numElements; __device__ void copyData(float* input, float* output) {} 我想把输入数组中满足某些条件的数据复制到输出数组中，并把复制的元素个数写入 numElements。谢谢。是的，你可以自己写一个。例如，您可以按照此答案中的方式执行，只需跳过smem部分 //assumes sizeof(T) is multi

我们有什么方法可以在CUDA内核中复制数组吗？

例如:

__device__ int numElements;
__device__ void copyData(float* input, float* output) {}

我想把输入数组中满足某些条件的数据复制到输出数组中，并把复制的元素个数写入 numElements。


谢谢。

是的，你可以自己写一个这样的函数。

例如,您可以按照此答案中的方式执行,只需跳过smem部分

// Block-cooperative raw copy: all threads of one block stripe over the data.
// Preconditions:
//   - sizeof(T) is a multiple of sizeof(int) and T is aligned to at least alignof(int)
//   - the kernel is launched with a one-dimensional block
//   - every thread of the block reaches this call (it ends in __syncthreads)
template <typename T>
__device__ void memCopy(T* dest, T* src, size_t size) {
    // Reinterpret both buffers as int words so each thread moves one word at a time.
    int* dstWords = reinterpret_cast<int*>(dest);
    const int* srcWords = reinterpret_cast<int*>(src);
    const size_t wordCount = size * sizeof(T) / sizeof(int);
    // Each thread starts at its own lane and strides by the block width.
    for (size_t idx = threadIdx.x; idx < wordCount; idx += blockDim.x) {
        dstWords[idx] = srcWords[idx];
    }
    // Barrier so the copied data is visible to every thread in the block.
    __syncthreads();
}

您真正描述的是流压缩（stream compaction）。Thrust 库有一系列内置的流压缩函数，可以从内核中调用这些函数。举个简单的例子：

#include <iostream>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>

// Selection predicate for the compaction below: keeps multiples of 3.
// Callable from both host and device code.
struct op
{
  __host__ __device__
  bool operator()(const int x)
  {
    // true exactly when x is evenly divisible by 3
    return x % 3 == 0;
  }
};

// Device-side stream compaction: copies each element of input[0..Nin) that
// satisfies op() into output (preserving order) and writes the number of
// copied elements to *Nout.
// NOTE(review): thrust::copy_if with the thrust::device policy invoked from
// device code runs in the context of the calling thread — presumably this is
// intended for a <<<1,1>>> launch; confirm against the launch site.
__global__ void kernel(int* input, int* output, int Nin, int* Nout)
{
    auto output_end = thrust::copy_if(thrust::device, input, input + Nin, output, op());
    // copy_if returns the end of the written range; the distance is the count.
    *Nout = output_end - output;
}

// Host driver: fills a managed input buffer with 1..N, launches the
// compaction kernel, then prints the surviving elements.
// Improvements over the original: every CUDA API call is error-checked,
// launch errors are surfaced via cudaGetLastError(), and the managed
// allocations are freed before exit.
int main()
{
    const int N = 10;
    const size_t sz = sizeof(int) * size_t(N);

    // Managed memory so both host and device can touch the buffers directly.
    int* in = nullptr;
    int* out = nullptr;
    int* Nout = nullptr;
    cudaError_t err = cudaMallocManaged((void **)&in, sz);
    if (err == cudaSuccess) err = cudaMallocManaged((void **)&out, sz);
    if (err == cudaSuccess) err = cudaMallocManaged((void **)&Nout, sizeof(int));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    for(int i=0; i<N; i++) {
        in[i] = 1+i;     // input is 1..N
        out[i] = -1;     // sentinel so untouched slots are visible
    }

    kernel<<<1,1>>>(in, out, N, Nout);
    // cudaGetLastError catches launch-configuration errors; the synchronize
    // both waits for the kernel and surfaces asynchronous execution errors.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "kernel failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // *Nout is safe to read on the host after the synchronize above.
    for(int i=0; i < *Nout; i++) {
        std::cout << i << " " << out[i] << std::endl;
    }

    cudaFree(in);
    cudaFree(out);
    cudaFree(Nout);
    return 0;
}

这可能是在内核中对少量数据执行流压缩的一种快速而简单的方法。如果您有大量数据，那么从主机端使用 Thrust 并让 Thrust 代表您启动内核可能更有意义。

如果存在多个运行块，则该代码会被破坏。对于小于 sizeof(int) 的类型，由于潜在的对齐错误，该代码也会被破坏。另外，memcpy 在设备代码中始终可用。
#include <iostream>
#include <thrust/copy.h>
#include <thrust/execution_policy.h>

// Selection predicate used by the compaction kernel: keeps multiples of 3.
// Marked __host__ __device__ so it works on either side.
struct op
{
  __host__ __device__
  bool operator()(const int x)
  {
    // remainder of zero means x is a multiple of 3
    return x % 3 == 0;
  }
};

// Device-side stream compaction: copies each element of input[0..Nin) that
// satisfies op() into output (preserving order) and writes the number of
// copied elements to *Nout.
// NOTE(review): thrust::copy_if with the thrust::device policy invoked from
// device code runs in the context of the calling thread — presumably this is
// intended for a <<<1,1>>> launch; confirm against the launch site.
__global__ void kernel(int* input, int* output, int Nin, int* Nout)
{
    auto output_end = thrust::copy_if(thrust::device, input, input + Nin, output, op());
    // copy_if returns the end of the written range; the distance is the count.
    *Nout = output_end - output;
}

// Host driver: fills a managed input buffer with 1..N, launches the
// compaction kernel, then prints the surviving elements.
// Improvements over the original: every CUDA API call is error-checked,
// launch errors are surfaced via cudaGetLastError(), and the managed
// allocations are freed before exit.
int main()
{
    const int N = 10;
    const size_t sz = sizeof(int) * size_t(N);

    // Managed memory so both host and device can touch the buffers directly.
    int* in = nullptr;
    int* out = nullptr;
    int* Nout = nullptr;
    cudaError_t err = cudaMallocManaged((void **)&in, sz);
    if (err == cudaSuccess) err = cudaMallocManaged((void **)&out, sz);
    if (err == cudaSuccess) err = cudaMallocManaged((void **)&Nout, sizeof(int));
    if (err != cudaSuccess) {
        std::cerr << "cudaMallocManaged failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    for(int i=0; i<N; i++) {
        in[i] = 1+i;     // input is 1..N
        out[i] = -1;     // sentinel so untouched slots are visible
    }

    kernel<<<1,1>>>(in, out, N, Nout);
    // cudaGetLastError catches launch-configuration errors; the synchronize
    // both waits for the kernel and surfaces asynchronous execution errors.
    err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        std::cerr << "kernel failed: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }

    // *Nout is safe to read on the host after the synchronize above.
    for(int i=0; i < *Nout; i++) {
        std::cout << i << " " << out[i] << std::endl;
    }

    cudaFree(in);
    cudaFree(out);
    cudaFree(Nout);
    return 0;
}
$ nvcc -std=c++11 -arch=sm_52 thrust_device_compact.cu 
$ ./a.out 
0 3
1 6
2 9