Parallel processing 使用cudaMallocManaged时,不允许从全局函数调用_u主机_u函数

Parallel processing 使用cudaMallocManaged时,不允许从全局函数调用_u主机_u函数,parallel-processing,cuda,Parallel Processing,Cuda,我有一个编写的代码,我正试图修改,以使其使用CUDA,我有很多麻烦,目前,我正试图使我想成为内核函数的函数是无效的,我得到了一些错误 以下是我收到的错误列表: black_scholes.cu(54): error: calling a __host__ function("cudaMallocManaged<double> ") from a __global__ function("black_scholes_iterate") is no

我有一个编写的代码,我正试图修改,以使其使用CUDA,我有很多麻烦,目前,我正试图使我想成为内核函数的函数是无效的,我得到了一些错误

以下是我收到的错误列表:

black_scholes.cu(54): error: calling a __host__ function("cudaMallocManaged<double> ") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(54): error: identifier "cudaMallocManaged<double> " is undefined in device code

black_scholes.cu(56): error: calling a __host__ function("init_gaussrand_state") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(56): error: identifier "init_gaussrand_state" is undefined in device code

black_scholes.cu(65): error: calling a __host__ function("spawn_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(65): error: identifier "spawn_prng_stream" is undefined in device code

black_scholes.cu(66): error: calling a __host__ function("gaussrand1") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(66): error: identifier "gaussrand1" is undefined in device code

black_scholes.cu(66): error: identifier "uniform_random_double" is undefined in device code

black_scholes.cu(73): error: calling a __host__ function("free_prng_stream") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(73): error: identifier "free_prng_stream" is undefined in device code

black_scholes.cu(74): error: calling a __host__ function("cudaFree") from a __global__ function("black_scholes_iterate") is not allowed

black_scholes.cu(74): error: identifier "cudaFree" is undefined in device code

任何有助于理解正在发生的事情都将不胜感激,这似乎是一个反复出现的主题。

目前,无法在CUDA设备代码中调用
cudaMallocManaged
。这从来都不可能。我不相信有NVIDIA培训材料演示在设备代码中使用
cudamalocmanaged


如果希望进行内核内分配,我建议使用中描述的方法。另外,
new
delete
的工作原理与
malloc()
free()
在内核中的使用类似。

我想当时我把它弄糊涂了,在内核中进行分配会比在主机代码中使用cudaMallocManaged更好吗?我通常不建议在内核中进行分配,而不是通过主机API调用进行类似的设备分配(例如
cudaMalloc
cudaMallocManaged
)。这有几个原因,包括性能和易用性。在
cuda
标签上有许多问题,它们讨论了在内核分配中使用的各个方面。在某些特定的用例中,内核内分配是有意义的。非常感谢Robert!
#include "black_scholes.h"
#include "gaussian.h"
#include "random.h" 
#include "util.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

__managed__ double stddev;

__global__ void black_scholes_stddev (void* the_args)
{

  black_scholes_args_t* args = (black_scholes_args_t*) the_args;
  const double mean = args->mean;
  const int M = args->M;
  double variance = 0.0;
  int k = blockIdx.x * blockDim.x + threadIdx.x;

  if(k<M)
  {
   const double diff = args->trials[k] - mean;
   variance += diff * diff / (double) M;
  }

  args->variance = variance;
  stddev=sqrt(variance);

}


__global__ void black_scholes_iterate (void* the_args)
{

  black_scholes_args_t* args = (black_scholes_args_t*) the_args;

  const int S = args->S;
  const int E = args->E;
  const int M = args->M;
  const double r = args->r;
  const double sigma = args->sigma;
  const double T = args->T;

  double* trials = args->trials;
  double mean = 0.0;

  gaussrand_state_t gaussrand_state;
  void* prng_stream = NULL; 

double *randnumbs;
cudaMallocManaged(&randnumbs, M * sizeof (double));

init_gaussrand_state (&gaussrand_state);

int i = blockIdx.x * blockDim.x + threadIdx.x;
int k = blockIdx.x * blockDim.x + threadIdx.x;


//for (int i = 0; i < M; i++)
if(i<M)
{
  prng_stream = spawn_prng_stream(i%4);
  const double gaussian_random_number = gaussrand1 (&uniform_random_double, prng_stream, &gaussrand_state);
  randnumbs[i]=gaussian_random_number;
  const double current_value = S * exp ( (r - (sigma*sigma) / 2.0) * T + sigma * sqrt (T) * randnumbs[k]);
  trials[k] = exp (-r * T) * ((current_value - E < 0.0) ? 0.0 : current_value - E);
   mean += trials[k] / (double) M;//needs to be shared
  args->mean = mean;
}
  free_prng_stream (prng_stream);
  cudaFree(randnumbs);
}



void black_scholes (confidence_interval_t* interval,
           const double S,
           const double E,
           const double r,
           const double sigma,
           const double T,
           const int M,
         const int n)
{
  black_scholes_args_t args;
  double mean = 0.0;
  double conf_width = 0.0;
  double* trials = NULL;

  assert (M > 0);
  trials = (double*) malloc (M * sizeof (double));
  assert (trials != NULL);

  args.S = S;
  args.E = E;
  args.r = r;
  args.sigma = sigma;
  args.T = T;
  args.M = M;
  args.trials = trials;
  args.mean = 0.0;
  args.variance = 0.0;

  (void)black_scholes_iterate<<<1,1>>>(&args);
  mean = args.mean;
  black_scholes_stddev<<<1,1>>> (&args);
  cudaDeviceSynchronize();

  conf_width = 1.96 * stddev / sqrt ((double) M);
  interval->min = mean - conf_width;
  interval->max = mean + conf_width;

  deinit_black_scholes_args (&args);
}


void deinit_black_scholes_args (black_scholes_args_t* args)
{
  if (args != NULL)
    if (args->trials != NULL)
      {
    free (args->trials);
    args->trials = NULL;
      }
}