cuda从设备内存复制内核中动态malloc的数据_Cuda_Malloc

cuda从设备内存复制内核中动态malloc的数据

cuda

cuda从设备内存复制内核中动态malloc的数据,cuda,malloc,Cuda,Malloc,我遇到了一个关于将cudaMemcpy与cudaMemcpyDeviceToHost一起使用的问题有一个结构有一个指针int*a，它将在内核函数中malloc。然后我需要把这个int*a复制到主机内存中我的问题是：我不知道使用cudaMemcpy它怎么会不起作用这是我的密码： #include <cuda_runtime.h> #include <stdio.h> typedef struct { int n, m; int *a; } myst; __glo

我遇到了一个关于将cudaMemcpy与cudaMemcpyDeviceToHost一起使用的问题

有一个结构有一个指针int*a，它将在内核函数中malloc。然后我需要把这个int*a复制到主机内存中

我的问题是：我不知道使用cudaMemcpy它怎么会不起作用

这是我的密码：

#include <cuda_runtime.h>
#include <stdio.h>

typedef struct { int n, m; int *a; } myst;

__global__ void xthread(myst *st)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    myst *mst = &st[idx];
    mst->n = idx;
    mst->m = idx+1;
    mst->a = (int *)malloc((mst->m)*sizeof(int));
    mst->a[0] = idx;
}


int main(int argc,char **argv)
{
    dim3 dimGrid(1);
    dim3 dimBlock(2);

    myst *mst = NULL;
    myst *hst = (myst *)malloc(2 * sizeof(myst));
    cudaMalloc(&mst, 2 * sizeof(myst));

    xthread<<<dimGrid, dimBlock>>>(mst);
    cudaDeviceSynchronize();

    cudaMemcpy(&hst[0],&mst[0],sizeof(myst),cudaMemcpyDeviceToHost);
    cudaMemcpy(&hst[1],&mst[1],sizeof(myst),cudaMemcpyDeviceToHost);

    int *pInt1 = (int *)malloc((hst[0].m)*sizeof(int)) ;
    int *pInt2 = (int *)malloc((hst[1].m)*sizeof(int)) ;

    cudaMemcpy(pInt1, hst[0].a, (hst[0].m)*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(pInt2, hst[1].a, (hst[1].m)*sizeof(int), cudaMemcpyDeviceToHost);

    printf("%d\t%d\t%d\n",hst[0].n,hst[0].m, pInt1[0]);
    printf("%d\t%d\t%d\n",hst[1].n,hst[1].m, pInt2[0]);

    free(pInt1);
    free(pInt2);

    return 0;
}

#包括
#包括
typedef结构{int n，m；int*a；}myst；
__全局无效xthread（myst*st）
{
unsigned int idx=blockIdx.x*blockDim.x+threadIdx.x；
myst*mst=&st[idx]；
mst->n=idx；
mst->m=idx+1；
mst->a=（int*）malloc（（mst->m）*sizeof（int））；
mst->a[0]=idx；
}
int main（int argc，字符**argv）
{
dim3 dimGrid（1）；
dim3 dimBlock（2）；
myst*mst=NULL；
myst*hst=（myst*）malloc（2*sizeof（myst））；
Cudamaloc（和mst，2*sizeof（myst））；
xthread（mst）；
cudaDeviceSynchronize（）；
cudaMemcpy（&hst[0]、&mst[0]、sizeof（myst）、cudaMemcpyDeviceToHost）；
cudaMemcpy（&hst[1]、&mst[1]、sizeof（myst）、cudaMemcpyDeviceToHost）；
int*pInt1=（int*）malloc（（hst[0].m）*sizeof（int））；
int*pInt2=（int*）malloc（（hst[1].m）*sizeof（int））；
cudaMemcpy（pInt1，hst[0].a，（hst[0].m）*sizeof（int），cudaMemcpyDeviceToHost）；
cudaMemcpy（pInt2，hst[1].a，（hst[1].m）*sizeof（int），cudaMemcpyDeviceToHost）；
printf（“%d\t%d\t%d\n”，hst[0].n，hst[0].m，pInt1[0]）；
printf（“%d\t%d\t%d\n”，hst[1].n，hst[1].m，pInt2[0]）；
免费（pInt1）；
免费（品脱2）；
返回0；
}

代码将发出关于“检测到Cuda API错误：cudaMemcpy返回（0xb）”的警告

我看到了一个类似的问题：但这似乎不能解决我的问题

Thx.

好吧，我用一种愚蠢的方法（-.-！！）解决了这个问题

当从内核函数返回时，我计算主机和设备中有多少空间需要malloc，而cudaMalloc又是一个大空间。接下来，在另一个名为ythread的内核函数中，将堆中的数据复制到大空间中

typedef struct { int n, m; int *a; } myst; __global__ void xthread(myst *st) { unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; myst *mst = &st[idx]; mst->n = idx; mst->m = idx + 1; mst->a = (int *) malloc((mst->m) * sizeof(int)); for (int i = 0; i < mst->m; i++) { mst->a[i] = idx + 900 + i * 10; } } __global__ void ythread(myst *st, int *total_a) { unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x; myst *mst = &st[idx]; int offset=0; for(int i=0; i<idx; i++) { offset += st[i].m; } for(int i=0; i<mst->m; i++) { total_a[offset+i] = mst->a[i]; } } int main(int argc,char **argv) { dim3 dimGrid(1); dim3 dimBlock(2); myst *mst = NULL; cudaMalloc((void**)&mst, dimBlock.x * sizeof(myst)); xthread<<<dimGrid, dimBlock>>>(mst); cudaDeviceSynchronize(); myst *hst = (myst *)malloc(dimBlock.x * sizeof(myst)); cudaMemcpy(hst, mst, dimBlock.x*sizeof(myst),cudaMemcpyDeviceToHost); int t_size = 0; for(int i=0; i<dimBlock.x; i++) { t_size += hst[i].m; } printf("t_size:%d\n", t_size); int * t_a_h = (int *)malloc(t_size*sizeof(int)); int * t_a_d = NULL; cudaMalloc((void**)&t_a_d, t_size*sizeof(int)); ythread<<<dimGrid, dimBlock>>>(mst, t_a_d); cudaDeviceSynchronize(); cudaMemcpy(t_a_h, t_a_d, t_size*sizeof(int),cudaMemcpyDeviceToHost); for(int i=0; i<t_size; i++) { printf("t_a_h[%d]:%d\n", i, t_a_h[i]); } free(t_a_h); cudaFree(mst); cudaFree(t_a_d); return 0; }

typedef结构{intn，m；int*a；}myst； __全局无效xthread（myst*st）{ unsigned int idx=blockIdx.x*blockDim.x+threadIdx.x； myst*mst=&st[idx]； mst->n=idx； mst->m=idx+1； mst->a=（int*）malloc（（mst->m）*sizeof（int））；对于（int i=0；im；i++）{ mst->a[i]=idx+900+i*10； } } __全局读取（myst*st，int*total\u a）{ unsigned int idx=blockIdx.x*blockDim.x+threadIdx.x； myst*mst=&st[idx]；整数偏移=0；对于（int i=0；ia[i]； } } int main（int argc，字符**argv）{ dim3 dimGrid（1）； dim3 dimBlock（2）； myst*mst=NULL； Cudamaloc（（void**）和mst，dimBlock.x*sizeof（myst））； xthread（mst）； cudaDeviceSynchronize（）； myst*hst=（myst*）malloc（dimBlock.x*sizeof（myst））； cudaMemcpy（hst、mst、dimBlock.x*sizeof（myst）、cudamemcpydevicetoost）； int t_size=0；对于（int i=0；iYou不能这样做。不支持对设备堆的主机访问。Thx，那么，如何解决此问题？