cudaMemcpy 从设备到主机复制时出现“参数无效”（invalid argument）错误

标签：cuda、gpu

我是CUDA/GPU新手,在将数据从设备复制回主机时遇到问题。我正在使用CUDA Toolkit 6.5为Jetson TK1开发。它成功构建,但在运行时出错。我的代码如下:

//main.cu
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size);

int main () {
    int data_length = 1024000;
    const int length=512;
    const size_t size= length;

    double signalA[length], signalB[length], signalC[length];

for (int i=0; i<data_length; i++)
{

    double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
    double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;

    if(i==0)
    {
        for(int k=0; k<length; k++)
        {
            signalA[k]=v_ia[k];
            signalB[k]=v_ib[k];
            signalC[k]=v_ic[k];
        }
        i=length-1;
    }
    else
    {
        //allocate memory in GPU and kernel call for phase A
        allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
        cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

        checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
        signalA[length-1]=v_ia[i];

        //allocate memory in GPU and kernel call for phase B
        allocate(d_inputCurrentIb, signalB, d_outputCurrentIb, size);
        cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

        checkCudaErrors(cudaMemcpy(signalB, d_outputCurrentIb, sizeof(double) * size, cudaMemcpyDeviceToHost));
        signalB[length-1]=v_ib[i];

        //allocate memory in GPU and kernel call for phase C;
        allocate(d_inputCurrentIc, signalC, d_outputCurrentIc, size);
        cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

        checkCudaErrors(cudaMemcpy(signalC, d_outputCurrentIc, sizeof(double) * size, cudaMemcpyDeviceToHost));
        signalC[length-1]=v_ic[i];

        //memory cleaning
        checkCudaErrors(cudaFree(d_inputCurrentIa));
        checkCudaErrors(cudaFree(d_inputCurrentIb));
        checkCudaErrors(cudaFree(d_inputCurrentIc));
        checkCudaErrors(cudaFree(d_outputCurrentIa));
        checkCudaErrors(cudaFree(d_outputCurrentIb));
        checkCudaErrors(cudaFree(d_outputCurrentIc));
    }
//main.cu
void allocate(double* const d_inputCurrent, double* signal, double* const d_outputCurrent, const size_t size);

int main () {
    int data_length = 1024000;
    const int length = 512;
    const size_t size = length;

    double signalA[length], signalB[length], signalC[length];

    for (int i=0; i<data_length; i++)
    ...

（回答）你不能把指针按值传递给一个函数、让该函数在内部修改这个指针，然后期望修改后的指针值出现在调用环境中：

double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;

...
    //allocate memory in GPU and kernel call for phase A

// at this point, d_inputCurrentIa and d_outputCurrentIa are pointing to nothing
    allocate(d_inputCurrentIa, signalA, d_outputCurrentIa, size);
// allocate modified those pointers internally, but the modified values don't show up here
    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

    checkCudaErrors(cudaMemcpy(signalA, d_outputCurrentIa, sizeof(double) * size, cudaMemcpyDeviceToHost));
// therefore you will get an error here, because d_outputCurrentIa still points to nothing
有多种方法可以实现此功能。一种方法是传递要修改和使用的指针的地址:

void allocate(double** d_inputCurrent, double* signal, double **d_outputCurrent, const size_t size);

double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;

...
    //allocate memory in GPU and kernel call for phase A
    allocate(&d_inputCurrentIa, signalA, &d_outputCurrentIa, size);
...
// Host wrapper: allocates the device input/output buffers (their addresses are
// returned to the caller through the double-pointer parameters), zero-fills the
// output buffer, copies the host `signal` into the device input buffer, and
// launches allocate_kernel.
// NOTE(review): launch config is fixed at one block of 512 threads — enough for
// size == 512 as in the question; confirm before using other sizes.
// NOTE(review): the buffers allocated here are NOT freed in this function; the
// caller is responsible for cudaFree'ing both device pointers.
void allocate(double** d_inputCurrent, double* signal, double** d_outputCurrent, const size_t size) {

    // Fixed launch configuration: a single block of 512 threads.
    const dim3 blockSize(512);
    const dim3 gridSize(1);

    // cudaFree(0) is a no-op that forces lazy CUDA context initialization.
    checkCudaErrors(cudaFree(0));

    // Allocate device memory; writing through the double pointers makes the
    // allocations visible to the caller (the fix for the original bug).
    checkCudaErrors(cudaMalloc((void **)d_inputCurrent, sizeof(double) * size));
    checkCudaErrors(cudaMalloc((void **)d_outputCurrent, sizeof(double) * size));

    // Byte-wise zero fill (0 is a valid all-zero-bytes value for double).
    checkCudaErrors(cudaMemset(*d_outputCurrent, 0, sizeof(double) * size));

    // Copy the host signal into the freshly allocated device input buffer.
    checkCudaErrors(cudaMemcpy(*d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));

    // Launch the kernel (defined elsewhere), then synchronize so any
    // asynchronous execution error is surfaced by cudaGetLastError().
    allocate_kernel<<<gridSize, blockSize>>>(*d_inputCurrent, *d_outputCurrent, size);
    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
void allocate(double** d_inputCurrent, double* signal, double **d_outputCurrent, const size_t size);

double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;

...
    //allocate memory in GPU and kernel call for phase A
    allocate(&d_inputCurrentIa, signalA, &d_outputCurrentIa, size);
...
void allocate(double** d_inputCurrent, double* signal, double** d_outputCurrent, const size_t size) {

    const dim3 blockSize(512);
    const dim3 gridSize(1);

    checkCudaErrors(cudaFree(0));

    checkCudaErrors(cudaMalloc((void **)d_inputCurrent, sizeof(double) * size));
    checkCudaErrors(cudaMalloc((void **)d_outputCurrent, sizeof(double) * size));

    checkCudaErrors(cudaMemset(*d_outputCurrent, 0, sizeof(double) * size));

    checkCudaErrors(cudaMemcpy(*d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));

    allocate_kernel<<<gridSize, blockSize>>>(*d_inputCurrent, *d_outputCurrent, size);
    cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
注:

  • 不确定为什么要把这些指针标记为 const——它们实际上并不是 const（该函数既会修改指针本身的值，也会修改其指向的数据）。

  • 以上代码是直接在浏览器里写的，您可能还需要解决其他一些问题。由于您没有提供完整的代码，我也没有提供完整的代码，但这应该可以作为一个路线图。

  • 在函数中分配内存可能会导致内存泄漏。您可能需要考虑一下这一点。如果要重用或创建大量指针,请务必制定计划释放这些指针


  • 感谢您的快速回复 @Robert。显然我不熟悉编程（尤其是指针 :)，只是想在很短的时间内学会一些 GPU 编程。我按照您的步骤做了，程序可以运行了，但这次我在运行时遇到了“总线错误”（bus error）。您说的都对，所以我删除了 const，把所有内存分配都放到了主函数中，并在最后释放了这些指针。现在它运行得很好。如果有人感兴趣，下面是改进后的代码。
    void allocate(double** d_inputCurrent, double* signal, double **d_outputCurrent, const size_t size);
    
    double *d_inputCurrentIa, *d_inputCurrentIb, *d_inputCurrentIc;
    double *d_outputCurrentIa, *d_outputCurrentIb, *d_outputCurrentIc;
    
    ...
        //allocate memory in GPU and kernel call for phase A
        allocate(&d_inputCurrentIa, signalA, &d_outputCurrentIa, size);
    ...
    // Host wrapper from the final, working version: allocates the device
    // input/output buffers (addresses returned through the double-pointer
    // parameters), zeroes the output, copies the host signal to the device,
    // and launches allocate_kernel.
    // NOTE(review): one block of 512 threads matches size == 512 here; the
    // caller must cudaFree both device pointers when done.
    void allocate(double** d_inputCurrent, double* signal, double** d_outputCurrent, const size_t size) {
    
        // Fixed launch configuration: a single block of 512 threads.
        const dim3 blockSize(512);
        const dim3 gridSize(1);
    
        // cudaFree(0) forces lazy CUDA context initialization.
        checkCudaErrors(cudaFree(0));
    
        // Allocations are written through the double pointers so the caller
        // receives the device addresses (the fix for the original bug).
        checkCudaErrors(cudaMalloc((void **)d_inputCurrent, sizeof(double) * size));
        checkCudaErrors(cudaMalloc((void **)d_outputCurrent, sizeof(double) * size));
    
        // Byte-wise zero fill of the output buffer.
        checkCudaErrors(cudaMemset(*d_outputCurrent, 0, sizeof(double) * size));
    
        // Copy the host signal into the device input buffer.
        checkCudaErrors(cudaMemcpy(*d_inputCurrent, signal, sizeof(double) * size, cudaMemcpyHostToDevice));
    
        // Launch the kernel (defined elsewhere), then synchronize so any
        // asynchronous execution error is surfaced by cudaGetLastError().
        allocate_kernel<<<gridSize, blockSize>>>(*d_inputCurrent, *d_outputCurrent, size);
        cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
    }