C++ 我用cudaMemcpy错了吗？_C++_Cuda

C++ 我用cudaMemcpy错了吗？

c++ cuda

C++ 我用cudaMemcpy错了吗？,c++,cuda,C++,Cuda,我已经实现了这个CuArray，通过实现属性行和列，简化了数组的使用： #include <cuda_runtime_api.h> #include <cuda.h> template<class TType> class CuArray { public: int Rows; int Columns; int Elements; TType *ArrayPointer; CuArray<TType>(i

我已经实现了这个

CuArray

，通过实现属性

Rows

和

Columns

，简化了数组的使用：

#include <cuda_runtime_api.h>
#include <cuda.h>
/**
 * Descriptor for a Rows x Columns array of TType whose element storage is
 * allocated in device memory with cudaMalloc.
 *
 * The class declares no destructor, so the device buffer is never released
 * automatically: whoever owns a descriptor must eventually call
 * cudaFree(ArrayPointer).
 */
template<class TType>
class CuArray
{
public:

    int Rows;             // row count passed to the constructor
    int Columns;          // column count passed to the constructor (default 1)
    int Elements;         // Rows * Columns
    TType *ArrayPointer;  // device buffer for the elements; 0 if allocation failed

    /**
     * Records the dimensions and allocates sizeof(TType) * Elements bytes of
     * device memory.
     *
     * On allocation failure ArrayPointer is set to 0 instead of being left
     * uninitialized, so callers can detect the error.
     *
     * Declared with the injected class name (not CuArray<TType>) because a
     * template-id as a constructor name is rejected in C++20.
     */
    CuArray(int rows, int columns = 1)
    {
        this->Rows = rows;
        this->Columns = columns;
        this->Elements = this->Rows * this->Columns;

        this->ArrayPointer = 0;
        // Explicit void** cast: matches the C runtime API signature, so this
        // also compiles when only cuda_runtime_api.h is included.
        if (cudaMalloc(reinterpret_cast<void**>(&this->ArrayPointer),
                       sizeof(TType) * this->Elements) != cudaSuccess)
        {
            this->ArrayPointer = 0;
        }
    }

    /**
     * Builds a descriptor on the host, then copies it into device memory.
     *
     * @param rows     number of rows
     * @param columns  number of columns (default 1)
     * @return Address of the descriptor copy in DEVICE memory, or 0 if any
     *         allocation or the copy failed (nothing is leaked in that case).
     *
     * The returned address must not be dereferenced in host code. Pass it to
     * a kernel, or copy it back with cudaMemcpy(..., cudaMemcpyDeviceToHost)
     * to inspect its fields. The caller releases it with cudaFree, after
     * first retrieving and freeing the element buffer it points to.
     */
    static CuArray<TType>* GpuCreate(int rows, int columns = 1)
    {
        // Host-side staging copy on the stack: the original heap object was
        // never deleted. The class has no destructor, so the device element
        // buffer outlives this local.
        CuArray<TType> staging(rows, columns);
        if (staging.Elements != 0 && staging.ArrayPointer == 0)
        {
            return 0;  // element buffer allocation failed
        }

        CuArray<TType>* gpuCuArray = 0;
        const size_t size = sizeof(CuArray<TType>);
        if (cudaMalloc(reinterpret_cast<void**>(&gpuCuArray), size) != cudaSuccess)
        {
            cudaFree(staging.ArrayPointer);
            return 0;
        }

        if (cudaMemcpy(gpuCuArray, &staging, size, cudaMemcpyHostToDevice) != cudaSuccess)
        {
            cudaFree(gpuCuArray);
            cudaFree(staging.ArrayPointer);
            return 0;
        }

        return gpuCuArray;
    }
};

#包括
#包括
模板
类CU数组
{
公众：
int行；
int列；
int元素；
t类型*阵列指针；
CuArray（整数行，整数列=1）
{
这->行=行；
此->列=列；
元素=此->行*此->列；
cudamaloc（&this->ArrayPointer，sizeof（TType）*this->Elements）；
}
静态CuArray*GpuCreate（整数行，整数列=1）
{
CuArray*CuArray=新CuArray（行、列）；
CuArray*gpuCuArray；
size\u t size=sizeof（CuArray）；
Cudamaloc（&gpuCuArray，尺寸）；
cudaMemcpy（gpuCuArray、cuArray、size、cudamemcpyhostodevice）；
返回gpuCuArray；
}
};

然而，

cudaMemcpy

似乎没有按预期工作，我不知道我做错了什么

这是用于调用的变量的值（和指针位置），例如

CuArray::GpuCreate(11)：
使用Nsight Eclipse 7.5、Ubuntu 14.04 64位进行调试

cuArray = {0xb6e8b0, Rows=11, Columns=1, Elements=11}

大小=32

gpuCuArray = {0x7053e3600, Rows=0, Columns=0, Elements=0}
new
和cudaMalloc
之后的指针值在我看来是正常的，但是cudaMemcpy
似乎没有起作用
那么我做错了什么呢？
通常，下面的代码应该足以表示存储在GPU中的二维数组。您不需要将其Rows
、Columns等存储在设备内存中，这些信息通常只在主机端需要。但如果您的情况并非如此，您可能需要更详细地说明您的设计考虑；最好能提供一段演示您如何使用CuArray
对象的代码。
#include <cuda_runtime_api.h>
#include <cuda.h>
/**
 * Host-side descriptor for a Rows x Columns array of TType whose element
 * storage is allocated in device memory with cudaMalloc.
 *
 * Only the element buffer is in device memory; the descriptor itself stays
 * on the host. The class declares no destructor, so the owner must call
 * cudaFree(ArrayPointer) to release the device buffer.
 */
template<class TType>
class CuArray
{
public:

    int Rows;             // row count passed to the constructor
    int Columns;          // column count passed to the constructor (default 1)
    int Elements;         // Rows * Columns
    TType *ArrayPointer;  // device buffer for the elements; 0 if allocation failed

    /**
     * Records the dimensions and allocates sizeof(TType) * Elements bytes of
     * device memory.
     *
     * On allocation failure ArrayPointer is set to 0 instead of being left
     * uninitialized, so callers can detect the error.
     *
     * Declared with the injected class name (not CuArray<TType>) because a
     * template-id as a constructor name is rejected in C++20.
     */
    CuArray(int rows, int columns = 1)
    {
        this->Rows = rows;
        this->Columns = columns;
        this->Elements = this->Rows * this->Columns;

        this->ArrayPointer = 0;
        // Explicit void** cast: matches the C runtime API signature, so this
        // also compiles when only cuda_runtime_api.h is included.
        if (cudaMalloc(reinterpret_cast<void**>(&this->ArrayPointer),
                       sizeof(TType) * this->Elements) != cudaSuccess)
        {
            this->ArrayPointer = 0;
        }
    }

    /**
     * Creates a heap-allocated host descriptor with a device element buffer.
     *
     * @param rows     number of rows
     * @param columns  number of columns (default 1)
     * @return Host pointer to the new descriptor, or 0 if the device
     *         allocation failed. The caller owns the result: release the
     *         buffer with cudaFree(ArrayPointer), then delete the descriptor.
     */
    static CuArray<TType>* GpuCreate(int rows, int columns = 1)
    {
        CuArray<TType>* cuArray = new CuArray<TType>(rows, columns);
        if (cuArray->Elements != 0 && cuArray->ArrayPointer == 0)
        {
            // Do not hand out a descriptor without a usable device buffer.
            delete cuArray;
            return 0;
        }
        return cuArray;
    }
};

#包括
#包括
模板
类CU数组
{
公众：
int行；
int列；
int元素；
t类型*阵列指针；
CuArray（整数行，整数列=1）
{
这->行=行；
此->列=列；
元素=此->行*此->列；
cudamaloc（&this->ArrayPointer，sizeof（TType）*this->Elements）；
}
静态CuArray*GpuCreate（整数行，整数列=1）
{
CuArray*CuArray=新CuArray（行、列）；
返回阵列；
}
};
我是cuda编码的初学者，但你的论证似乎不错，因为如果我调用内核，我将从主机调用它，然后我可以将行和列作为参数传递到内核中。但是，这需要更多的参数。然而，您的解决方案并不能解决cudaMemcpy的问题。这将有助于理解为什么它不能在我的代码中工作。@JensHorstmann那么对于您的原始代码，您的期望是什么？什么似乎不起作用？你可能想提供更多关于你在做什么的细节。@JensHorstmann你是在说三个0吗？如果你认为他们错了，至少你需要提供你如何打印他们的代码。我不会打印它。我使用Nvidia的Nsight Eclipse IDE调试代码