Struct 将带有数组的结构数组加载到cuda

Struct 将带有数组的结构数组加载到cuda,struct,cuda,Struct,Cuda,我正在尝试创建一个数组结构,其中包含数组,并将它们加载到GPU上。我想我遵循了正确的步骤 使用malloc在CPU上创建结构 cudamaloc将数组添加到结构中 使用cudamalloc在GPU上创建结构 将CPU结构复制到GPU结构上 运行此代码时,只要不更改内核函数中的值p[I].c[0],它就会正常工作。如果我删除行p[I].c[0]=3.3然后输出预期结果。当我保持原样时,它会输出所有值的随机数。我希望能够使用内核函数更新数组中的值 有什么不对劲吗 这是我的密码: #include &

我正在尝试创建一个数组结构,其中包含数组,并将它们加载到GPU上。我想我遵循了正确的步骤

  • 使用malloc在CPU上创建结构
  • cudamaloc
    将数组添加到结构中
  • 使用cudamalloc在GPU上创建结构
  • 将CPU结构复制到GPU结构上 运行此代码时,只要不更改内核函数中的值
    p[I].c[0]
    ,它就会正常工作。如果我删除行
    p[I].c[0]=3.3然后输出预期结果。当我保持原样时,它会输出所有值的随机数。我希望能够使用内核函数更新数组中的值

    有什么不对劲吗

    这是我的密码:

    #include <stdio.h>
    #include <cuda_runtime.h>
    #include <iostream>
    #include <fstream>
    #include <sstream>
    #include <cstdio>
    #include <fcntl.h>
    #include <unistd.h>
    #include <assert.h>
    #include <omp.h>
    #include <vector>
    #include <sys/time.h>
    
        float cData[]
                    {
                            1,
                            2,
                            3,
                            4,
                            5,
                            6,
                            7,
                            8,
                            9,
                            10,
                            11,
                            12,
                            13,
                            14,
                            15,
                            16
                    };
        float dData[]
                    {
                            1,
                            2,
                            3,
                            4,
                            5,
                            6,
                            7,
                            8,
                            9,
                            10,
                            11,
                            12,
                            13,
                            14,
                            15,
                            16
                    };
    
        typedef struct
                {
                    float a, b;
                    float* c;
                    float* d;
                } point;
    
    __global__ void testKernel(point *p){
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        p[i].a = 1.1;
        p[i].b = 2.2;
        p[i].c[0] = 3.3;
    }
    
    void checkerror(cudaError_t error, char* descrp){
        if (error != 0){
    
            printf("%s error code: %d \n", descrp, error);
        }
    
    }
    
    extern "C" int main()
    {
        printf("starting gpuCode\n");
        int *dev_a;
                // set number of points
            int numPoints    = 16,
                gpuBlockSize = 4,
                pointSize    = sizeof(point),
                numBytes     = numPoints * pointSize,
                gpuGridSize  = numPoints / gpuBlockSize;
        cudaError_t err = cudaSuccess;
        printf("initialized variables\n");
                // allocate memory
            point *cpuPointArray,
                  *gpuPointArray,
                  *outPointArray;
            cpuPointArray = (point*)malloc(numBytes);  //create the cpuPointArray struct on the cpu
            outPointArray = (point*)malloc(numBytes);  //create the outPointArray struct on the cpu
            printf("load cpuPointArray struct with default values\n");
    
            for (int k=0; k<16; k++){
                err = cudaMalloc( (void**)&cpuPointArray[k].c, 16*sizeof(float) );
                checkerror(err, "assigning cuda pointer c");
                err = cudaMalloc( (void**)&cpuPointArray[k].d, 16*sizeof(float) );
                checkerror(err, "assigning cuda pointer d");
                cpuPointArray[k].a = 16;
                cpuPointArray[k].b = 16;
            }
    
    
            for (int k=0; k<16; k++){
                printf("top loop %d\n", k);
                err = cudaMemcpy(cpuPointArray[k].c, cData, 16*sizeof(float), cudaMemcpyHostToDevice);
                printf("after cdata\n");
                checkerror(err, "copying cdata to gpu array c" );
                err = cudaMemcpy(cpuPointArray[k].d, dData, 16*sizeof(float), cudaMemcpyHostToDevice);
                printf("after ddata\n");
                checkerror(err, "copying ddata to gpu array d");
                printf("bottom of loop %d\n", k);
            }
    
            err = cudaMalloc((void**)&gpuPointArray, numBytes);  //allocate memory on the gpu for the cpu point array
            checkerror(err, "allocating memory for gpuPointArray");
            err = cudaMemcpy(gpuPointArray,cpuPointArray,sizeof(cpuPointArray), cudaMemcpyHostToDevice); //copy the cpu point array onto the gpu
            checkerror(err, "copying cpuPointArray to gpuPointArray");
    
            printf("loaded the struct into the kernel\n");
    
            for(int i = 0; i < numPoints; ++i)
                    {
                        printf("point.a: %f, point.b: %f ************************\n",cpuPointArray[i].a,cpuPointArray[i].b);
    
                            printf("cuda mem location point.c: %d point.d: %d\n",&cpuPointArray[i].c, &cpuPointArray[i].d);
    
                    }
    
                // launch kernel
            testKernel<<<gpuGridSize,gpuBlockSize>>>(gpuPointArray);
    
            printf("returned the struct from the kernel\n");
            err = cudaMemcpy(outPointArray,gpuPointArray,numBytes, cudaMemcpyDeviceToHost);
            checkerror(err, "copying gpuPointArray to cpuPointArray");
            printf("after gpu copy to cpu\n");
            for (int k=0; k<16; k++){
                printf("creating memory on cpu for array c\n");
                outPointArray[k].c = (float*)malloc(16*sizeof(float));
                printf("creating memory on cpu for array d\n");
                outPointArray[k].d = (float*)malloc(16*sizeof(float));
                printf("copying memory values onto cpu array c\n");
                err = cudaMemcpy(outPointArray[k].c, cpuPointArray[k].c, 16*sizeof(float), cudaMemcpyDeviceToHost);
                checkerror(err, "copy array c from gpu to cpu");
                printf("copying memory values onto cpu array c\n");
                err = cudaMemcpy(outPointArray[k].d, cpuPointArray[k].d, 16*sizeof(float), cudaMemcpyDeviceToHost);
                checkerror(err, "copy array d from gpu to cpu");
                printf("bottom of loop %d\n", k);
            }
    
                // retrieve the results
    
            printf("testKernel results:\n");
            for(int i = 0; i < numPoints; ++i)
            {
                printf("point.a: %f, point.b: %f ************************\n",outPointArray[i].a,outPointArray[i].b);
                for (int j=0; j<16; j++){
                    printf("point.c: %f point.d: %f\n",outPointArray[i].c[j], outPointArray[i].d[j]);
                }
            }
    
                // deallocate memory
            free(cpuPointArray);
            cudaFree(gpuPointArray);
    
            return 0;
        }
    
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    #包括
    浮动cData[]
    {
    1.
    2.
    3.
    4.
    5.
    6.
    7.
    8.
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16
    };
    浮点dData[]
    {
    1.
    2.
    3.
    4.
    5.
    6.
    7.
    8.
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    16
    };
    类型定义结构
    {
    浮子a、b;
    浮点数*c;
    浮动*d;
    }点;
    __全局无效测试内核(点*p){
    int i=blockIdx.x*blockDim.x+threadIdx.x;
    p[i].a=1.1;
    p[i].b=2.2;
    p[i].c[0]=3.3;
    }
    无效检查错误(cudaError\u t error,char*descrp){
    如果(错误!=0){
    printf(“%s错误代码:%d\n”,描述,错误);
    }
    }
    外部“C”int main()
    {
    printf(“启动gpuCode\n”);
    国际开发署;
    //设定点数
    int numPoints=16,
    gpuBlockSize=4,
    pointSize=sizeof(点),
    numBytes=numPoints*pointSize,
    gpuGridSize=numPoints/gpuBlockSize;
    cudaError\u t err=cudaSuccess;
    printf(“初始化变量\n”);
    //分配内存
    点*cpuPointArray,
    *gpuPointArray,
    *输出点阵列;
    cpuPointArray=(point*)malloc(numBytes);//在cpu上创建cpuPointArray结构
    outPointArray=(point*)malloc(numBytes);//在cpu上创建outPointArray结构
    printf(“使用默认值加载cpuPointArray结构\n”);
    
    对于(int k=0;k,您可能将结构数组错误地复制到设备。请尝试更改:

    err = cudaMemcpy(gpuPointArray,cpuPointArray,sizeof(cpuPointArray), cudaMemcpyHostToDevice); 
    
    将来

    err = cudaMemcpy(gpuPointArray,cpuPointArray,numBytes, cudaMemcpyHostToDevice); 
    
    由于cpuPointArray的类型为point*,因此sizeof(cpuPointArray)将实际返回计算机上指针的大小。您需要的是结构的完整数组的大小。事实上,从设备复制回时,您甚至可以通过以下方式正确执行此操作:

    err = cudaMemcpy(outPointArray,gpuPointArray,numBytes, cudaMemcpyDeviceToHost);
    

    希望这能有所帮助!

    您可能将结构数组复制到设备中的操作不正确。请尝试更改:

    err = cudaMemcpy(gpuPointArray,cpuPointArray,sizeof(cpuPointArray), cudaMemcpyHostToDevice); 
    
    将来

    err = cudaMemcpy(gpuPointArray,cpuPointArray,numBytes, cudaMemcpyHostToDevice); 
    
    由于cpuPointArray的类型为point*,因此sizeof(cpuPointArray)将实际返回计算机上指针的大小。您需要的是结构的完整数组的大小。事实上,从设备复制回时,您甚至可以通过以下方式正确执行此操作:

    err = cudaMemcpy(outPointArray,gpuPointArray,numBytes, cudaMemcpyDeviceToHost);
    
    希望有帮助