C++ 将二维 thrust::device_vector 复数矩阵传递给 CUDA 核函数

C++ 将二维 thrust::device_vector 复数矩阵传递给 CUDA 核函数,c++,matrix,cuda,complex-numbers,thrust,C++,Matrix,Cuda,Complex Numbers,Thrust,我是 CUDA 的新手,我正在尝试使用 CUDA 将我现有的项目迁移到 GPU。 我的代码基于复数矩阵和复数缓冲区 在第一步中,我尝试将嵌套 for 循环代码移植到 CUDA(其余部分将类似): 我认为造成问题的参数是“d_tw”。 因此,我的问题是: 我所做的转换(从二维矩阵到一个展平的数组)有什么错 CUDA 中是否有更好的方法来处理二维复数 CUDA 中关于复数数组的文档非常少,我在哪里可以阅读到关于在 CUDA 中处理复数矩阵的资料 谢谢 有各种各样的问题。我会列出一些,可能会遗漏一些。因此,请参考我给出的示例代码以

我是Cuda的新手,我正在尝试使用Cuda将我现有的项目迁移到GPU。 我的代码基于复杂矩阵和复杂缓冲区

在第一步中,我尝试将嵌套For循环代码移动到Cuda(其余将类似):

我认为造成问题的参数是“d_tw”。 因此,我的问题是:

  • 我所做的转换(从二维矩阵到一个展平的数组)有什么错
  • CUDA 中是否有更好的方法来处理二维复数
  • CUDA 中关于复数数组的文档非常少,我在哪里可以阅读到关于在 CUDA 中处理复数矩阵的资料

  • 谢谢

    有各种各样的问题。我会列出一些,可能会遗漏一些。因此,请参考我给出的示例代码以了解其他差异

  • 最紧迫的问题是:

    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
    
  • 这就是导致您看到的“invalid argument”(无效参数)错误的原因。在底层,Thrust 会尝试为此使用
    cudaMemcpyAsync
    操作,因为这本质上是从主机到设备的拷贝。我们将用一个普通的
    cudaMemcpy
    操作替换它来解决这个问题,但是要理解如何构造它,必须理解第2项

  • 您似乎认为 vector 的 vector 意味着连续存储。事实并非如此,而且这一点并不只针对 Thrust。由于由 vector 组成的
    thrust::host_vector
    (甚至由 vector 组成的
    std::vector
    )并不意味着连续存储,因此我们无法轻松地用单个操作(例如
    cudaMemcpy
    或 thrust::copy
    )来复制这些数据。因此,有必要显式地将其展平

  • 您在
    cudaMemcpy
    操作上的复制方向全都弄反了。在本应使用
    cudaMemcpyHostToDevice
    的地方,您使用了
    cudaMemcpyDeviceToHost
    ,反之亦然

  • CUDA
    cuComplex.h
    头文件的出现早于 Thrust,是为了以一种快速的 C 风格方式处理复数而提供的。它没有相关文档——您必须阅读文件本身并弄清楚如何使用它,就像您已经做过的那样。然而,由于您使用的是
    thrust::complex
    ,因此直接沿用这种编码范式、编写与主机代码几乎完全相同的设备代码要简单得多

  • 您有多处传输大小错误。
    cudaMemcpy
    接受的是要传输的字节数

  • 下面是一个示例,由您展示的片段拼凑而成,带有各种“修复”。我并不是说它在任何方面都是完美或正确的,但它避免了我上面概述的问题。此外,根据编译时是否使用
    -DUSE_KERNEL
    宏定义,它将运行“原始”主机代码并显示输出,或者运行内核代码并显示输出。根据我的测试,两者的输出一致

    $ cat t1751.cu
    #include <thrust/complex.h>
    #include <thrust/copy.h>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <iostream>
    #include <cstdint>
    #include <cuComplex.h>
    
    typedef thrust::complex<double> smp_t;
    /*
     * Legacy accumulation kernel, kept for reference. The buffers use the
     * cuComplex.h representation while the coefficients are thrust::complex.
     *
     * Thread mapping: threadIdx.x selects the output row `ch`, blockIdx.x selects
     * the input row `k`. Each thread walks all `block_size` columns and adds
     * sgbuf[k][x] * tw[ch][k] into cnbuf[ch][x].
     *
     * Parameters:
     *   cnbuf      - accumulator, row-major with row stride block_size (read/write)
     *   sgbuf      - input samples, row-major with row stride block_size
     *   tw         - coefficient matrix, row-major with row stride block_size
     *                (the same layout kernel_func indexes)
     *   block_size - number of columns per row
     *
     * NOTE(review): threads in different blocks (different k) update the same
     * cnbuf element with a plain read-modify-write, so the result is racy
     * whenever gridDim.x > 1. kernel_func avoids this by giving every output
     * element to exactly one thread.
     */
    __global__ void kernel_func_old(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
        const size_t ch = threadIdx.x;
        const size_t k = blockIdx.x;

        // tw is a flattened 2D matrix: element (ch, k) lives at ch*stride + k.
        // The product ch*k does not address that element.
        const smp_t w = tw[ch * block_size + k];

        // The coefficient does not depend on x, so convert it once.
        const cuDoubleComplex coeff = make_cuDoubleComplex(w.real(), w.imag());

        for (size_t x = 0; x < block_size; ++x) {
            const size_t sig_index = k * block_size + x;
            const size_t cn_index = ch * block_size + x;

            const cuDoubleComplex temp = cuCmul(sgbuf[sig_index], coeff);
            cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
        }
    }
    /*
     * Computes cnbuf[row][col] += sum over k of sgbuf[k][col] * tw[row][k].
     *
     * Launch layout: one block per output row (blockIdx.x) and one thread per
     * output column (threadIdx.x). Each output element is owned by exactly one
     * thread, so no atomics are required.
     *
     * Parameters:
     *   cnbuf      - accumulator matrix, row-major, row stride block_size
     *   sgbuf      - input matrix, row-major, row stride block_size
     *   tw         - coefficient matrix, row-major, row stride block_size
     *   block_size - edge length used for indexing all three matrices
     *
     * Threads whose row or column falls outside block_size do nothing, so a
     * launch that is larger than the data is harmless.
     */
    __global__ void kernel_func(smp_t *cnbuf, smp_t *sgbuf, smp_t *tw, size_t block_size) {
        const size_t row = blockIdx.x;
        const size_t col = threadIdx.x;
        if (row >= block_size || col >= block_size)
            return;

        const size_t idx = row * block_size + col;

        // Accumulate in a register and write global memory once at the end.
        smp_t acc = cnbuf[idx];
        for (size_t k = 0; k < block_size; ++k)
            acc += sgbuf[k * block_size + col] * tw[row * block_size + k];
        cnbuf[idx] = acc;
    }
    
    /*
     * Runs kernel_func on the device for a buffer_size x buffer_size problem and
     * prints the resulting cnbuf, one "real,imag" pair per line.
     *
     * Parameters:
     *   cnbuf       - host accumulator with buffer_size*buffer_size elements;
     *                 copied to the device, updated by the kernel, copied back
     *   sgbuf       - host input with buffer_size*buffer_size elements
     *   tw          - coefficient matrix as a vector of rows; must hold at least
     *                 buffer_size rows of at least buffer_size elements each
     *   buffer_size - edge length of all three matrices
     *
     * A vector of vectors is not contiguous, so tw is flattened row by row into
     * a single host buffer before it is copied to the device.
     *
     * Any CUDA failure, or a tw that is too small, is reported on std::cout and
     * terminates the process with EXIT_FAILURE.
     */
    void kernel_wrap(
                smp_t *cnbuf,
                smp_t *sgbuf,
                thrust::host_vector<thrust::host_vector<smp_t>>tw,
                size_t buffer_size) {
        // Nothing to compute, and a zero-sized launch would be rejected.
        if (buffer_size == 0)
            return;

        // Report a failed CUDA call and stop; every API result goes through here.
        auto check = [](cudaError_t status, const char *what) {
            if (status != cudaSuccess)
            {
                    std::cout << what << " (error code: " << cudaGetErrorString(status) << ")!" << std::endl;
                    exit(EXIT_FAILURE);
            }
        };

        const size_t count = buffer_size * buffer_size;
        const size_t bytes = count * sizeof(smp_t);   // cudaMemcpy takes sizes in bytes

        if (tw.size() < buffer_size)
        {
                std::cout << "tw has fewer rows than buffer_size!" << std::endl;
                exit(EXIT_FAILURE);
        }

        // Flatten the row vectors into one contiguous row-major buffer.
        thrust::host_vector<smp_t> htw(count);
        for (size_t i = 0; i < buffer_size; i++) {
          if (tw[i].size() < buffer_size)
          {
                  std::cout << "tw has a row shorter than buffer_size!" << std::endl;
                  exit(EXIT_FAILURE);
          }
          for (size_t j = 0; j < buffer_size; j++)
            htw[i*buffer_size + j] = tw[i][j];
        }

        // Sized from buffer_size so it always matches the flattened host copy.
        thrust::device_vector<smp_t> d_tw(count);
        smp_t *d_tw_ptr = thrust::raw_pointer_cast(d_tw.data());
        check(cudaMemcpy(d_tw_ptr, thrust::raw_pointer_cast(htw.data()), bytes, cudaMemcpyHostToDevice),
              "Failed to copy tw to the device");

        smp_t *d_sgbuf = nullptr;
        smp_t *d_cnbuf = nullptr;
        check(cudaMalloc((void **)&d_sgbuf, bytes), "Failed to allocate device sgbuf");
        check(cudaMalloc((void **)&d_cnbuf, bytes), "Failed to allocate device cnbuf");

        check(cudaMemcpy(d_sgbuf, sgbuf, bytes, cudaMemcpyHostToDevice), "Failed to copy sgbuf to the device");
        check(cudaMemcpy(d_cnbuf, cnbuf, bytes, cudaMemcpyHostToDevice), "Failed to copy cnbuf to the device");

        // One block per output row, one thread per output column.
        const unsigned int dim = static_cast<unsigned int>(buffer_size);
        kernel_func<<<dim, dim>>>(d_cnbuf, d_sgbuf, d_tw_ptr, buffer_size);
        check(cudaGetLastError(), "Failed to launch subDelimiterExamine kernel");

        // The blocking copy also surfaces any error raised while the kernel ran.
        check(cudaMemcpy(cnbuf, d_cnbuf, bytes, cudaMemcpyDeviceToHost), "Failed to copy cnbuf back to the host");

        check(cudaFree(d_sgbuf), "Failed to free device sgbuf");
        check(cudaFree(d_cnbuf), "Failed to free device cnbuf");

        for (size_t i = 0; i < buffer_size; i++)
          for (size_t j = 0; j < buffer_size; j++)
            std::cout << cnbuf[i*buffer_size+j].real() << "," << cnbuf[i*buffer_size+j].imag() << std::endl;
    }
    
    /*
     * Demo driver. Builds a decfactor x decfactor complex coefficient matrix,
     * fills the input buffer with (1,1) and accumulates
     *     cnbuf[ch][i] += sgbuf[k][i] * tw[ch][k].
     * Without USE_KERNEL the accumulation runs on the host and the result is
     * printed here, one "real,imag" pair per line. With USE_KERNEL both the
     * computation and the printing are delegated to kernel_wrap.
     */
    int main(){
      const int bufsize = 8;     // columns per row
      const int decfactor = 8;   // rows, and edge length of the coefficient matrix
      const size_t total = static_cast<size_t>(decfactor) * bufsize;

      // Owning containers replace the raw malloc'd buffers, which were never
      // freed. cnbuf starts at zero and every sgbuf sample is (1,1).
      const smp_t test(1.0, 1.0);
      thrust::host_vector<smp_t> sgbuf(total, test);
      thrust::host_vector<smp_t> cnbuf(total, smp_t(0.0, 0.0));

      // Create matrix.
      const smp_t i_unit(0.0, 1.0);
    #ifndef USE_KERNEL
      std::vector<std::vector<smp_t> > tw(decfactor);
    #else
      thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
    #endif

      // Fill the Matrix: tw[row][col] = exp(-i * 2*pi * row*col / decfactor)
      for (int row = 0; row < decfactor; row++) {
        for (int col = 0; col < decfactor; col++) {
          const smp_t tmp = exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)decfactor);
          tw[row].push_back(tmp);
        }
      }

    #ifndef USE_KERNEL
      /* The Code To Move to the GPU processing */
      for (int i = 0; i < bufsize; i++) {
        for (int ch = 0; ch < decfactor; ch++)
          for (int k = 0; k < decfactor; k++)
            cnbuf[ch*bufsize + i] += sgbuf[k*bufsize + i] * tw[ch].at(k);
      }
      for (int ch = 0; ch < decfactor; ch++)
        for (int i = 0; i < bufsize; i++)
          std::cout << cnbuf[ch*bufsize + i].real() << "," << cnbuf[ch*bufsize + i].imag() << std::endl;
    #else

      kernel_wrap(thrust::raw_pointer_cast(cnbuf.data()), thrust::raw_pointer_cast(sgbuf.data()), tw, bufsize);
    #endif

      return 0;
    }
    $ nvcc -o t1751 t1751.cu -std=c++11
    $ ./t1751 >out_host.txt
    $ nvcc -o t1751 t1751.cu -std=c++11 -DUSE_KERNEL
    $ ./t1751 >out_device.txt
    $ diff out_host.txt out_device.txt
    $
    
    $cat t1751.cu
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    类型定义推力:复杂smp_t;
    __全局无效内核函数旧(cuDoubleComplex*cnbuf,cuDoubleComplex*sgbuf,smp\u t*tw,size\u t block\u size){
    无符号int ch=threadIdx.x;
    无符号整数k=blockIdx.x;
    对于(int x=0;xcout Thrust 容器仅适用于 POD 类型。不要尝试使用 vector 的 vector,它不会工作的!非常感谢!!你真的帮了我!!!:)
    
    Failed to launch subDelimiterExamine kernel (error code: invalid argument)!
    
    thrust::copy(&tw[0][0], &tw[7][7], d_tw.begin());
    
    $ cat t1751.cu
    #include <thrust/complex.h>
    #include <thrust/copy.h>
    #include <thrust/device_vector.h>
    #include <thrust/host_vector.h>
    #include <iostream>
    #include <cstdint>
    #include <cuComplex.h>
    
    typedef thrust::complex<double> smp_t;
    /*
     * Legacy accumulation kernel, kept for reference. The buffers use the
     * cuComplex.h representation while the coefficients are thrust::complex.
     *
     * Thread mapping: threadIdx.x selects the output row `ch`, blockIdx.x selects
     * the input row `k`. Each thread walks all `block_size` columns and adds
     * sgbuf[k][x] * tw[ch][k] into cnbuf[ch][x].
     *
     * Parameters:
     *   cnbuf      - accumulator, row-major with row stride block_size (read/write)
     *   sgbuf      - input samples, row-major with row stride block_size
     *   tw         - coefficient matrix, row-major with row stride block_size
     *                (the same layout kernel_func indexes)
     *   block_size - number of columns per row
     *
     * NOTE(review): threads in different blocks (different k) update the same
     * cnbuf element with a plain read-modify-write, so the result is racy
     * whenever gridDim.x > 1. kernel_func avoids this by giving every output
     * element to exactly one thread.
     */
    __global__ void kernel_func_old(cuDoubleComplex *cnbuf, cuDoubleComplex *sgbuf, smp_t *tw, size_t block_size) {
        const size_t ch = threadIdx.x;
        const size_t k = blockIdx.x;

        // tw is a flattened 2D matrix: element (ch, k) lives at ch*stride + k.
        // The product ch*k does not address that element.
        const smp_t w = tw[ch * block_size + k];

        // The coefficient does not depend on x, so convert it once.
        const cuDoubleComplex coeff = make_cuDoubleComplex(w.real(), w.imag());

        for (size_t x = 0; x < block_size; ++x) {
            const size_t sig_index = k * block_size + x;
            const size_t cn_index = ch * block_size + x;

            const cuDoubleComplex temp = cuCmul(sgbuf[sig_index], coeff);
            cnbuf[cn_index] = cuCadd(temp, cnbuf[cn_index]);
        }
    }
    /*
     * Computes cnbuf[row][col] += sum over k of sgbuf[k][col] * tw[row][k].
     *
     * Launch layout: one block per output row (blockIdx.x) and one thread per
     * output column (threadIdx.x). Each output element is owned by exactly one
     * thread, so no atomics are required.
     *
     * Parameters:
     *   cnbuf      - accumulator matrix, row-major, row stride block_size
     *   sgbuf      - input matrix, row-major, row stride block_size
     *   tw         - coefficient matrix, row-major, row stride block_size
     *   block_size - edge length used for indexing all three matrices
     *
     * Threads whose row or column falls outside block_size do nothing, so a
     * launch that is larger than the data is harmless.
     */
    __global__ void kernel_func(smp_t *cnbuf, smp_t *sgbuf, smp_t *tw, size_t block_size) {
        const size_t row = blockIdx.x;
        const size_t col = threadIdx.x;
        if (row >= block_size || col >= block_size)
            return;

        const size_t idx = row * block_size + col;

        // Accumulate in a register and write global memory once at the end.
        smp_t acc = cnbuf[idx];
        for (size_t k = 0; k < block_size; ++k)
            acc += sgbuf[k * block_size + col] * tw[row * block_size + k];
        cnbuf[idx] = acc;
    }
    
    /*
     * Runs kernel_func on the device for a buffer_size x buffer_size problem and
     * prints the resulting cnbuf, one "real,imag" pair per line.
     *
     * Parameters:
     *   cnbuf       - host accumulator with buffer_size*buffer_size elements;
     *                 copied to the device, updated by the kernel, copied back
     *   sgbuf       - host input with buffer_size*buffer_size elements
     *   tw          - coefficient matrix as a vector of rows; must hold at least
     *                 buffer_size rows of at least buffer_size elements each
     *   buffer_size - edge length of all three matrices
     *
     * A vector of vectors is not contiguous, so tw is flattened row by row into
     * a single host buffer before it is copied to the device.
     *
     * Any CUDA failure, or a tw that is too small, is reported on std::cout and
     * terminates the process with EXIT_FAILURE.
     */
    void kernel_wrap(
                smp_t *cnbuf,
                smp_t *sgbuf,
                thrust::host_vector<thrust::host_vector<smp_t>>tw,
                size_t buffer_size) {
        // Nothing to compute, and a zero-sized launch would be rejected.
        if (buffer_size == 0)
            return;

        // Report a failed CUDA call and stop; every API result goes through here.
        auto check = [](cudaError_t status, const char *what) {
            if (status != cudaSuccess)
            {
                    std::cout << what << " (error code: " << cudaGetErrorString(status) << ")!" << std::endl;
                    exit(EXIT_FAILURE);
            }
        };

        const size_t count = buffer_size * buffer_size;
        const size_t bytes = count * sizeof(smp_t);   // cudaMemcpy takes sizes in bytes

        if (tw.size() < buffer_size)
        {
                std::cout << "tw has fewer rows than buffer_size!" << std::endl;
                exit(EXIT_FAILURE);
        }

        // Flatten the row vectors into one contiguous row-major buffer.
        thrust::host_vector<smp_t> htw(count);
        for (size_t i = 0; i < buffer_size; i++) {
          if (tw[i].size() < buffer_size)
          {
                  std::cout << "tw has a row shorter than buffer_size!" << std::endl;
                  exit(EXIT_FAILURE);
          }
          for (size_t j = 0; j < buffer_size; j++)
            htw[i*buffer_size + j] = tw[i][j];
        }

        // Sized from buffer_size so it always matches the flattened host copy.
        thrust::device_vector<smp_t> d_tw(count);
        smp_t *d_tw_ptr = thrust::raw_pointer_cast(d_tw.data());
        check(cudaMemcpy(d_tw_ptr, thrust::raw_pointer_cast(htw.data()), bytes, cudaMemcpyHostToDevice),
              "Failed to copy tw to the device");

        smp_t *d_sgbuf = nullptr;
        smp_t *d_cnbuf = nullptr;
        check(cudaMalloc((void **)&d_sgbuf, bytes), "Failed to allocate device sgbuf");
        check(cudaMalloc((void **)&d_cnbuf, bytes), "Failed to allocate device cnbuf");

        check(cudaMemcpy(d_sgbuf, sgbuf, bytes, cudaMemcpyHostToDevice), "Failed to copy sgbuf to the device");
        check(cudaMemcpy(d_cnbuf, cnbuf, bytes, cudaMemcpyHostToDevice), "Failed to copy cnbuf to the device");

        // One block per output row, one thread per output column.
        const unsigned int dim = static_cast<unsigned int>(buffer_size);
        kernel_func<<<dim, dim>>>(d_cnbuf, d_sgbuf, d_tw_ptr, buffer_size);
        check(cudaGetLastError(), "Failed to launch subDelimiterExamine kernel");

        // The blocking copy also surfaces any error raised while the kernel ran.
        check(cudaMemcpy(cnbuf, d_cnbuf, bytes, cudaMemcpyDeviceToHost), "Failed to copy cnbuf back to the host");

        check(cudaFree(d_sgbuf), "Failed to free device sgbuf");
        check(cudaFree(d_cnbuf), "Failed to free device cnbuf");

        for (size_t i = 0; i < buffer_size; i++)
          for (size_t j = 0; j < buffer_size; j++)
            std::cout << cnbuf[i*buffer_size+j].real() << "," << cnbuf[i*buffer_size+j].imag() << std::endl;
    }
    
    /*
     * Demo driver. Builds a decfactor x decfactor complex coefficient matrix,
     * fills the input buffer with (1,1) and accumulates
     *     cnbuf[ch][i] += sgbuf[k][i] * tw[ch][k].
     * Without USE_KERNEL the accumulation runs on the host and the result is
     * printed here, one "real,imag" pair per line. With USE_KERNEL both the
     * computation and the printing are delegated to kernel_wrap.
     */
    int main(){
      const int bufsize = 8;     // columns per row
      const int decfactor = 8;   // rows, and edge length of the coefficient matrix
      const size_t total = static_cast<size_t>(decfactor) * bufsize;

      // Owning containers replace the raw malloc'd buffers, which were never
      // freed. cnbuf starts at zero and every sgbuf sample is (1,1).
      const smp_t test(1.0, 1.0);
      thrust::host_vector<smp_t> sgbuf(total, test);
      thrust::host_vector<smp_t> cnbuf(total, smp_t(0.0, 0.0));

      // Create matrix.
      const smp_t i_unit(0.0, 1.0);
    #ifndef USE_KERNEL
      std::vector<std::vector<smp_t> > tw(decfactor);
    #else
      thrust::host_vector<thrust::host_vector<smp_t>> tw(decfactor);
    #endif

      // Fill the Matrix: tw[row][col] = exp(-i * 2*pi * row*col / decfactor)
      for (int row = 0; row < decfactor; row++) {
        for (int col = 0; col < decfactor; col++) {
          const smp_t tmp = exp(-i_unit * 2.0*M_PI * ((double) col*row) / (double)decfactor);
          tw[row].push_back(tmp);
        }
      }

    #ifndef USE_KERNEL
      /* The Code To Move to the GPU processing */
      for (int i = 0; i < bufsize; i++) {
        for (int ch = 0; ch < decfactor; ch++)
          for (int k = 0; k < decfactor; k++)
            cnbuf[ch*bufsize + i] += sgbuf[k*bufsize + i] * tw[ch].at(k);
      }
      for (int ch = 0; ch < decfactor; ch++)
        for (int i = 0; i < bufsize; i++)
          std::cout << cnbuf[ch*bufsize + i].real() << "," << cnbuf[ch*bufsize + i].imag() << std::endl;
    #else

      kernel_wrap(thrust::raw_pointer_cast(cnbuf.data()), thrust::raw_pointer_cast(sgbuf.data()), tw, bufsize);
    #endif

      return 0;
    }
    $ nvcc -o t1751 t1751.cu -std=c++11
    $ ./t1751 >out_host.txt
    $ nvcc -o t1751 t1751.cu -std=c++11 -DUSE_KERNEL
    $ ./t1751 >out_device.txt
    $ diff out_host.txt out_device.txt
    $