Cuda中的嵌套并行_Cuda_Parallel Processing_Gpu_Nested Loops

Cuda中的嵌套并行

cuda parallel-processing

Cuda中的嵌套并行,cuda,parallel-processing,gpu,nested-loops,Cuda,Parallel Processing,Gpu,Nested Loops,在下面的代码中，我想使用嵌套并行性来计算数组元素的10倍。我使用这个简单的示例来了解Cuda中的动态并行性。代码的工作方式是，对于parentArray的每个元素，有另一个内核将该元素保存在childArray的某个位置（0到9）。对于parentArray的每个元素，我有另一个包含10个元素的数组，每个元素都等于parentArray的元素。最后，我计算所有ChildArray的总和，并将结果保存在parentArray中因此，结果应该是： parentArray的元素0，结果=0 pare

在下面的代码中，我想使用嵌套并行性来计算数组元素的10倍。我使用这个简单的示例来了解Cuda中的动态并行性。代码的工作方式是，对于parentArray的每个元素，有另一个内核将该元素保存在childArray的某个位置（0到9）。对于parentArray的每个元素，我有另一个包含10个元素的数组，每个元素都等于parentArray的元素。最后，我计算所有ChildArray的总和，并将结果保存在parentArray中

因此，结果应该是：

parentArray的元素0，结果=0
parentArray的元素1，结果=10
parentArray的元素2，结果=20，依此类推

目前，代码已编译，但未给出预期结果。当前代码有什么问题

计算元素和的函数

/**
 * Device helper: adds the first `size` entries of `arr` in index order.
 *
 * @param arr  array to read from
 * @param size number of leading entries to accumulate
 * @return the accumulated sum; 0.0 when the loop body never runs (size <= 0)
 */
__device__ double summe(double *arr, int size)
{
  double total = 0.0;
  int pos = 0;
  while (pos < size)
  {
    total += arr[pos];
    ++pos;
  }
  return total;
}

存储结果的数组

// Device-global scratch buffer with 10 entries.
// NOTE(review): parentKernel passes this same buffer to every child launch,
// so all child kernels write to the same 10 slots.
__device__ double childArr[10];

子内核

/**
 * Device helper that hands its argument back unchanged.
 *
 * @param arrElement value to pass through
 * @return the same value
 */
__device__ double getElement(double arrElement)
{
  const double passthrough = arrElement;
  return passthrough;
}

/**
 * Child kernel: each thread whose flat index is below N stores
 * getElement(arrElement) into arr at that index.
 *
 * @param arr        destination array
 * @param arrElement value written to every covered slot
 * @param N          exclusive upper bound on the indices that are written
 */
__global__ void childKernel(double *arr, double arrElement,int N)
{
  const int slot = threadIdx.x + blockDim.x * blockIdx.x;
  if (slot >= N)
  {
    return;  // thread falls outside the requested range
  }
  arr[slot] = getElement(arrElement);
}

// Parent kernel as posted in the question (it does not give the expected result).
// Each thread with idx < N launches childKernel (1 block, 10 threads) on the
// global childArr buffer and then stores the sum of childArr's 10 entries
// into parentArray[idx].
__global__ void parentKernel(double *parentArray, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N)
  {
    // NOTE(review): every launching thread passes the same childArr, so the
    // child kernels all write to the same 10 slots.
    childKernel<<<1,10>>>(childArr,parentArray[idx],N);
    // NOTE(review): __syncthreads() is a barrier among the threads of this
    // block; it does not wait for the child kernel launched above. It also
    // sits inside the idx < N branch, which not every thread of a block takes
    // when N is not a multiple of the block size.
    __syncthreads();
    parentArray[idx] = summe(childArr,10);

  }

}

此时将启动10个内核（每个子内核也有10个线程），10个活动父内核线程中的每一个都有一个：

childKernel<<<1,10>>>(childArr,parentArray[idx],N);

这些子内核的执行顺序是未指定的，而且它们都向childArr的相同位置（0到9）写入各自不同的值，彼此之间存在竞争条件，因此结果将是不可预测的

避免争用条件的一种可能方法是让每个子内核写入

childArr

的单独部分

另一个问题是在内核中使用

__syncthreads()

而不是

cudaDeviceSynchronize()

作为屏障。内核启动，无论是从主机代码还是从设备代码启动，都是异步的，

__syncthreads()

不保证异步启动之前的工作已经完成

cudaDeviceSynchronize()

导致调用线程暂停，直到该线程之前启动的所有内核完成。（见下文注释）

通过这两项更改，您的代码可以生成您期望的输出：

$ cat t11.cu
#include <stdio.h>
// Evaluates a CUDA runtime call and reports a failure on stderr with the
// file and line of the call site. Execution continues afterwards, so callers
// that relied on the previous pass-through behaviour keep running; the
// difference is that errors are no longer silently discarded.
// Must be used as a statement: CUDA_CALL(cudaMalloc(...));
#define CUDA_CALL(x)                                                     \
  do {                                                                   \
    cudaError_t cuda_call_status_ = (x);                                 \
    if (cuda_call_status_ != cudaSuccess) {                              \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
              cudaGetErrorString(cuda_call_status_));                    \
    }                                                                    \
  } while (0)
// Threads per child launch, and entries in each parent element's slice of childArr.
#define MY_M 10
// Number of elements in the parent array.
#define MY_N 10

// Device-global scratch buffer: one MY_M-entry slice per parent element.
__device__ double childArr[MY_M*MY_N];

/**
 * Device helper: adds the first `size` entries of `arr` in index order.
 *
 * @param arr  array to read from
 * @param size number of leading entries to accumulate
 * @return the accumulated sum; 0.0 when the loop body never runs (size <= 0)
 */
__device__ double summe(double *arr, int size)
{
  double total = 0.0;
  int pos = 0;
  while (pos < size)
  {
    total += arr[pos];
    ++pos;
  }
  return total;
}

/**
 * Device helper that hands its argument back unchanged.
 *
 * @param arrElement value to pass through
 * @return the same value
 */
__device__ double getElement(double arrElement)
{
  const double passthrough = arrElement;
  return passthrough;
}

/**
 * Child kernel: each thread whose flat index is below N stores
 * getElement(arrElement) into arr at that index.
 *
 * @param arr        destination array
 * @param arrElement value written to every covered slot
 * @param N          exclusive upper bound on the indices that are written
 */
__global__ void childKernel(double *arr, double arrElement,int N)
{
  const int slot = threadIdx.x + blockDim.x * blockIdx.x;
  if (slot >= N)
  {
    return;  // thread falls outside the requested range
  }
  arr[slot] = getElement(arrElement);
}

/**
 * Parent kernel: for every element idx < N, launches a child kernel that
 * fills that element's private MY_M-entry slice of childArr with
 * parentArray[idx], waits for it, and then overwrites parentArray[idx] with
 * the sum of the slice (i.e. MY_M times the original value).
 *
 * Precondition: N <= MY_N, because childArr only holds MY_N slices.
 *
 * @param parentArray device array of at least N doubles, updated in place
 * @param N           number of elements to process
 */
__global__ void parentKernel(double *parentArray, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N)
  {
    // Each parent thread owns a disjoint slice, so child kernels never
    // write to each other's slots.
    double *slice = childArr + MY_M * idx;

    // The child fills a slice of MY_M entries, so its bound is MY_M rather
    // than the parent length N; otherwise part of the slice would stay
    // unwritten (and still be summed) whenever MY_M > N.
    childKernel<<<1,MY_M>>>(slice, parentArray[idx], MY_M);

    // Wait for this thread's child kernel before reading its output.
    // NOTE(review): calling cudaDeviceSynchronize() from device code is
    // deprecated in recent CUDA toolkits; confirm against the target
    // toolkit before reusing this pattern.
    cudaDeviceSynchronize();

    parentArray[idx] = summe(slice, MY_M);
  }
}

/**
 * Host driver: fills an array with 0..MY_N-1, runs parentKernel on it and
 * prints the resulting values.
 *
 * @return 0 on success, 1 if the host allocation or the kernel fails
 */
int main(void)
{
  // Number of elements in arrays
  const int N_array = MY_N;

  // size of array in bytes
  const size_t size_array = N_array * sizeof(double);

  // Allocate array on host
  double *host_array = (double *)malloc(size_array);
  if (host_array == NULL)
  {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", size_array);
    return 1;
  }

  // Allocate array on device
  double *device_array = NULL;
  CUDA_CALL(cudaMalloc((void **) &device_array, size_array));

  // Initialize host array
  for (int i=0; i<N_array; i++)
  {
    host_array[i] = (double)i;
  }

  // and copy it to CUDA device
  CUDA_CALL(cudaMemcpy(device_array, host_array, size_array, cudaMemcpyHostToDevice));

  // Do calculation on device:
  const int block_size = 4;
  // ceil-div: if N = 10, then n_blocks = 3
  const int n_blocks = (N_array + block_size - 1) / block_size;

  parentKernel<<<n_blocks, block_size>>>(device_array,N_array);

  // A launch does not return an error code: pick up launch-configuration
  // errors first, then wait so that execution errors surface here too.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
  {
    status = cudaDeviceSynchronize();
  }
  if (status != cudaSuccess)
  {
    fprintf(stderr, "parentKernel failed: %s\n", cudaGetErrorString(status));
    free(host_array);
    cudaFree(device_array);
    return 1;
  }

  // Retrieve result from device and store it in host array
  CUDA_CALL(cudaMemcpy(host_array, device_array, sizeof(double)*N_array, cudaMemcpyDeviceToHost));

  // Print results
  for (int i=0; i<N_array; i++)
  {
    printf("Element %d of parentArray, Result = %f\n", i, host_array[i]);
  }

  // Cleanup
  free(host_array);
  CUDA_CALL(cudaFree(device_array));

  return 0;
}


$ nvcc -arch=sm_52 -rdc=true -o t11 t11.cu -lcudadevrt
$ cuda-memcheck ./t11
========= CUDA-MEMCHECK
Element 0 of parentArray, Result = 0.000000
Element 1 of parentArray, Result = 10.000000
Element 2 of parentArray, Result = 20.000000
Element 3 of parentArray, Result = 30.000000
Element 4 of parentArray, Result = 40.000000
Element 5 of parentArray, Result = 50.000000
Element 6 of parentArray, Result = 60.000000
Element 7 of parentArray, Result = 70.000000
Element 8 of parentArray, Result = 80.000000
Element 9 of parentArray, Result = 90.000000
========= ERROR SUMMARY: 0 errors
$

$cat t11.cu
#include <stdio.h>
#define CUDA_CALL(x) x
#define MY_M 10
#define MY_N 10
__device__ double childArr[MY_M*MY_N];
__device__ double summe(double *arr, int size)
{
double result = 0.0;
for (int i = 0; i < size; i++) ……（此处代码在抓取时被截断）
您得到了什么结果？全部为0？请编辑您的问题，补充您的GPU详细信息以及用于为该GPU编译代码的命令。@Kenney 如果我想在childKernel（本例中的函数getElement）中使用随机数，并把结果相加，那么我是否应该在parentKernel中为curandState调用一个初始化（setup）内核？谢谢！你可以。或者你可以在启动父/子内核之前在主机上调用一个初始化内核。或者你可以只在子内核中进行初始化，如演示所示。好的，谢谢，但文档说由于性能原因，不应该在内核中创建状态。如果能在这里看到一些针对我的问题的代码（即getElement()生成一个随机数），那就太好了。也许我应该就此另发一个问题？文档上写着“创建和初始化一个新的本地状态”将降低性能。每个线程必须创建一次状态，这是不可避免的。您不应该做的是为单个线程反复创建状态。事实上，我在我链接的回答中指出了这一点。是的，如果您有新问题，我建议创建一个新问题。明确地说，我建议创建一个新问题，而不是编辑这个问题。
childKernel<<<1,10>>>(childArr,parentArray[idx],N);

__syncthreads();

$ cat t11.cu
#include <stdio.h>
// Evaluates a CUDA runtime call and reports a failure on stderr with the
// file and line of the call site. Execution continues afterwards, so callers
// that relied on the previous pass-through behaviour keep running; the
// difference is that errors are no longer silently discarded.
// Must be used as a statement: CUDA_CALL(cudaMalloc(...));
#define CUDA_CALL(x)                                                     \
  do {                                                                   \
    cudaError_t cuda_call_status_ = (x);                                 \
    if (cuda_call_status_ != cudaSuccess) {                              \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,   \
              cudaGetErrorString(cuda_call_status_));                    \
    }                                                                    \
  } while (0)
// Threads per child launch, and entries in each parent element's slice of childArr.
#define MY_M 10
// Number of elements in the parent array.
#define MY_N 10

// Device-global scratch buffer: one MY_M-entry slice per parent element.
__device__ double childArr[MY_M*MY_N];

/**
 * Device helper: adds the first `size` entries of `arr` in index order.
 *
 * @param arr  array to read from
 * @param size number of leading entries to accumulate
 * @return the accumulated sum; 0.0 when the loop body never runs (size <= 0)
 */
__device__ double summe(double *arr, int size)
{
  double total = 0.0;
  int pos = 0;
  while (pos < size)
  {
    total += arr[pos];
    ++pos;
  }
  return total;
}

/**
 * Device helper that hands its argument back unchanged.
 *
 * @param arrElement value to pass through
 * @return the same value
 */
__device__ double getElement(double arrElement)
{
  const double passthrough = arrElement;
  return passthrough;
}

/**
 * Child kernel: each thread whose flat index is below N stores
 * getElement(arrElement) into arr at that index.
 *
 * @param arr        destination array
 * @param arrElement value written to every covered slot
 * @param N          exclusive upper bound on the indices that are written
 */
__global__ void childKernel(double *arr, double arrElement,int N)
{
  const int slot = threadIdx.x + blockDim.x * blockIdx.x;
  if (slot >= N)
  {
    return;  // thread falls outside the requested range
  }
  arr[slot] = getElement(arrElement);
}

/**
 * Parent kernel: for every element idx < N, launches a child kernel that
 * fills that element's private MY_M-entry slice of childArr with
 * parentArray[idx], waits for it, and then overwrites parentArray[idx] with
 * the sum of the slice (i.e. MY_M times the original value).
 *
 * Precondition: N <= MY_N, because childArr only holds MY_N slices.
 *
 * @param parentArray device array of at least N doubles, updated in place
 * @param N           number of elements to process
 */
__global__ void parentKernel(double *parentArray, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < N)
  {
    // Each parent thread owns a disjoint slice, so child kernels never
    // write to each other's slots.
    double *slice = childArr + MY_M * idx;

    // The child fills a slice of MY_M entries, so its bound is MY_M rather
    // than the parent length N; otherwise part of the slice would stay
    // unwritten (and still be summed) whenever MY_M > N.
    childKernel<<<1,MY_M>>>(slice, parentArray[idx], MY_M);

    // Wait for this thread's child kernel before reading its output.
    // NOTE(review): calling cudaDeviceSynchronize() from device code is
    // deprecated in recent CUDA toolkits; confirm against the target
    // toolkit before reusing this pattern.
    cudaDeviceSynchronize();

    parentArray[idx] = summe(slice, MY_M);
  }
}

/**
 * Host driver: fills an array with 0..MY_N-1, runs parentKernel on it and
 * prints the resulting values.
 *
 * @return 0 on success, 1 if the host allocation or the kernel fails
 */
int main(void)
{
  // Number of elements in arrays
  const int N_array = MY_N;

  // size of array in bytes
  const size_t size_array = N_array * sizeof(double);

  // Allocate array on host
  double *host_array = (double *)malloc(size_array);
  if (host_array == NULL)
  {
    fprintf(stderr, "Host allocation of %zu bytes failed\n", size_array);
    return 1;
  }

  // Allocate array on device
  double *device_array = NULL;
  CUDA_CALL(cudaMalloc((void **) &device_array, size_array));

  // Initialize host array
  for (int i=0; i<N_array; i++)
  {
    host_array[i] = (double)i;
  }

  // and copy it to CUDA device
  CUDA_CALL(cudaMemcpy(device_array, host_array, size_array, cudaMemcpyHostToDevice));

  // Do calculation on device:
  const int block_size = 4;
  // ceil-div: if N = 10, then n_blocks = 3
  const int n_blocks = (N_array + block_size - 1) / block_size;

  parentKernel<<<n_blocks, block_size>>>(device_array,N_array);

  // A launch does not return an error code: pick up launch-configuration
  // errors first, then wait so that execution errors surface here too.
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
  {
    status = cudaDeviceSynchronize();
  }
  if (status != cudaSuccess)
  {
    fprintf(stderr, "parentKernel failed: %s\n", cudaGetErrorString(status));
    free(host_array);
    cudaFree(device_array);
    return 1;
  }

  // Retrieve result from device and store it in host array
  CUDA_CALL(cudaMemcpy(host_array, device_array, sizeof(double)*N_array, cudaMemcpyDeviceToHost));

  // Print results
  for (int i=0; i<N_array; i++)
  {
    printf("Element %d of parentArray, Result = %f\n", i, host_array[i]);
  }

  // Cleanup
  free(host_array);
  CUDA_CALL(cudaFree(device_array));

  return 0;
}


$ nvcc -arch=sm_52 -rdc=true -o t11 t11.cu -lcudadevrt
$ cuda-memcheck ./t11
========= CUDA-MEMCHECK
Element 0 of parentArray, Result = 0.000000
Element 1 of parentArray, Result = 10.000000
Element 2 of parentArray, Result = 20.000000
Element 3 of parentArray, Result = 30.000000
Element 4 of parentArray, Result = 40.000000
Element 5 of parentArray, Result = 50.000000
Element 6 of parentArray, Result = 60.000000
Element 7 of parentArray, Result = 70.000000
Element 8 of parentArray, Result = 80.000000
Element 9 of parentArray, Result = 90.000000
========= ERROR SUMMARY: 0 errors
$