C++ cudaMemcpy之后的数组值_C++_C_Cuda

C++ cudaMemcpy之后的数组值

c++ c cuda

C++ cudaMemcpy之后的数组值,c++,c,cuda,C++,C,Cuda,我想知道，当我调用cudaMemcpy（…）获取GPU上的内存时，数组中的值是否也被复制。我将更好地解释：我将值从一个数组复制到另一个数组，然后调用cudamaloc和cudaMemcpy // Copying values of the arrays for(int i = 0; i<16; i++){ array_device_1[i] = array_host_1[i]; array_device_2[i] = array_host_2[i]; } // Memor

我想知道，当我调用 cudaMemcpy(…) 把数据复制到 GPU 内存时，数组中的值是否也会被一并复制。更具体地说：我先将值从一个数组复制到另一个数组，然后调用 cudaMalloc 和 cudaMemcpy：

// Copying values of the arrays
// NOTE(review): this loop writes through array_device_1/array_device_2 from
// host code before the cudaMalloc calls below assign them. Their declarations
// are not shown in this snippet, so confirm what they point to at this stage.
// The cudaMemcpy calls further down already copy array_host_* into
// array_device_*.
for(int i = 0; i<16; i++){
    array_device_1[i] = array_host_1[i];
    array_device_2[i] = array_host_2[i];
}

// Memory allocation of array_device_1 and array_device_2
// (SIZE_INT*size bytes each; SIZE_INT and size are defined outside this snippet)
cudaMalloc((void**) &array_device_1, SIZE_INT*size);
cudaMalloc((void**) &array_device_2, SIZE_INT*size);

// Transfer array_device_1 and array_device_2:
// host-to-device copy of array_host_1/2 into the freshly allocated buffers
cudaMemcpy(array_device_1, array_host_1, SIZE_INT*size, cudaMemcpyHostToDevice);
cudaMemcpy(array_device_2, array_host_2, SIZE_INT*size, cudaMemcpyHostToDevice);

// Launch the kernel with N blocks of N threads on the two device arrays
kernel<<<N, N>>>(array_device_1, array_device_2);

// Copy both device arrays back into the host arrays (device-to-host)
cudaMemcpy(array_host_1, array_device_1, SIZE_INT*size, cudaMemcpyDeviceToHost);
cudaMemcpy(array_host_2, array_device_2, SIZE_INT*size, cudaMemcpyDeviceToHost);

// Release the device allocations
cudaFree(array_device_1);
cudaFree(array_device_2);

//复制数组的值
for (int i = 0; i < 16; i++) { … }（此处重复的代码在页面转换时被截断，完整代码见上文。）

是的，它们的值在里面。但是您不能在主机上直接打印出来。为此，您需要使用 cudaMemcpy 将数据复制回主机：
// Device-to-host copy: bring the contents of array_device_2 back into
// array_host_2 so that host code can read (and print) the values.
cudaMemcpy((void *) array_host_2, (void *) array_device_2, SIZE_INT*size, cudaMemcpyDeviceToHost);

然后就可以打印 array_host_2 的值。
再解释一下：您的 array_device_* 位于 GPU 上，而在 CPU 上（即打印输出时）您无法直接访问这些数据。因此，您需要先将其复制回 CPU 内存，然后再将其打印出来。
除了 leftaroundabout 指出的前几行之外，您提供的代码片段似乎是正确的。您确定内核是正确的吗？也许您没有将修改后的值写回全局内存。如果在运行内核之前，另外创建一组主机数组并把 GPU 数组复制回这些数组中，它们的值是否正确？就您现有的代码而言，array_host_* 中的值应该已经被正确复制到 array_device_* 中。
将带有数据的阵列复制到设备、更改内核中的值、复制回主机并打印新值的示例：
// Kernel executed on the device, one thread per array element.
// Each thread doubles the element of d_arr at its flat global index
// (blockIdx.x * blockDim.x + threadIdx.x).
// Precondition: there is no bounds guard, so the launch must not supply more
// threads in total than d_arr has elements.
__global__ void myKernel(int *d_arr) {
    const int globalIdx = threadIdx.x + blockDim.x * blockIdx.x;
    d_arr[globalIdx] *= 2;
}

// Prints a diagnostic to stderr and terminates the program when a CUDA
// runtime call reports anything other than cudaSuccess.
//   status: value returned by the CUDA runtime call
//   what:   short description of the call, used in the error message
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what,
                cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Example driver: fills a host array with 0..9, copies it to the device,
// runs myKernel on it, copies the result back and prints every element.
// Returns 0 on success; exits with EXIT_FAILURE if the host allocation or any
// CUDA call fails.
int main(void) {
    // myKernel has no bounds guard, so blocks * threadsPerBlock must equal the
    // element count exactly.
    const int numBlocks = 2;
    const int threadsPerBlock = 5;
    const int numElements = numBlocks * threadsPerBlock;  // 10
    const size_t numBytes = numElements * sizeof(int);

    int *h_arr = (int *)malloc(numBytes);
    if (h_arr == NULL) {
        fprintf(stderr, "Host allocation of %d ints failed\n", numElements);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < numElements; ++i)
        h_arr[i] = i; // Or other values

    // Sends data to device
    int *d_arr = NULL;
    checkCuda(cudaMalloc((void**) &d_arr, numBytes), "cudaMalloc");
    checkCuda(cudaMemcpy(d_arr, h_arr, numBytes, cudaMemcpyHostToDevice),
              "cudaMemcpy host-to-device");

    // Runs kernel on device
    myKernel<<< numBlocks, threadsPerBlock >>>(d_arr);
    // A kernel launch returns no status itself; launch-configuration errors
    // are only visible through cudaGetLastError().
    checkCuda(cudaGetLastError(), "myKernel launch");

    // Retrieves data from device. This blocking copy waits for the kernel to
    // finish, so errors raised while the kernel ran are reported here.
    checkCuda(cudaMemcpy(h_arr, d_arr, numBytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy device-to-host");

    for (int i = 0; i < numElements; ++i)
        printf("Post kernel value in h_arr[%d] is: %d\n", i,h_arr[i]);

    checkCuda(cudaFree(d_arr), "cudaFree");
    free(h_arr);
    return 0;
}

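//由多个线程在设备上运行的函数
__global__ void myKernel(int *d_arr) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    d_arr[idx] = d_arr[idx]*2;
}

int main(void) {
    int *h_arr, *d_arr;
    h_arr = (int *)malloc(10*sizeof(int));
    for (int i=0; i<10; ++i)
        h_arr[i] = i; //或其他值

    //将数据发送到设备
    cudaMalloc((void**) &d_arr, 10*sizeof(int));
    cudaMemcpy(d_arr, h_arr, 10*sizeof(int), cudaMemcpyHostToDevice);

    //在设备上运行内核
    myKernel<<< 2, 5 >>>(d_arr);

    //从设备检索数据
    cudaMemcpy(h_arr, d_arr, 10*sizeof(int), cudaMemcpyDeviceToHost);

    for (int i = 0; i<10; ++i)
        printf("Post kernel value in h_arr[%d] is: %d\n", i,h_arr[i]);

    cudaFree(d_arr);
    free(h_arr);
    return 0;
}

另外，您也可以使用内核函数直接打印 GPU 内存中的值。可以使用类似于下面的代码：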
// Debug kernel: prints the first element of devArray from device code.
// There is no thread guard, so every launched thread executes the printf.
__global__ void printFunc(int *devArray){
      const int firstValue = devArray[0];
      printf("%d", firstValue);
}

希望有帮助。
你的“//复制数组的值”部分没有任何意义。你不能以这种方式将数据从主机复制到设备内存，这正是 cudaMemcpy 的用途！
我在不使用 GPU 的情况下测试了我的内核，也就是说我测试了算法本身，它工作得很好。我真的无法解释我遗漏了什么。