How do I deal with "cudaErrorMissingConfiguration" from CUDA's "cudaMallocPitch" function?

I'm writing a Mandelbrot set program with CUDA. However, I can't make any further progress until the cudaErrorMissingConfiguration raised by CUDA's cudaMallocPitch function is resolved. Can you tell me what is going on?

My GPU is a GeForce RTX 2060 SUPER.

My command line is shown below:

> nvcc MandelbrotCUDA.cu -o MandelbrotCUDA -O3
I tried cudaDeviceSetLimit(cudaLimitMallocHeapSize, 7*1024*1024*1024) to adjust the heap size.

cudaDeviceSetLimit succeeded.

However, I can't get any further than that: "CUDA malloc done!" is never printed.

#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;

#define D 0.0000025 // Tick
#define LIMIT_N 255 
#define INF_NUM 2

#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2 

__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.

    for(int i = 0; i < indexTotalY ; i++){
        for(int j = 0; j < indexTotalX; j++){ 
            thrust::complex<double> z(0.0f, 0.0f);
            n[i][j] = 0;
            for(int ctr=1;  ctr <= LIMIT_N ; ctr++){  
                z = z*z + (*(c[i][j]));
                n[i][j] = n[i][j] + (abs(z) < INF_NUM);
            }
        }
    }
}

int main(){

    // Data Path
    string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
    string fileName = "mandelbrot4.ppm";
    string filename = filePath+fileName;

    //complex<double> c[N][M];
    double xRange[2] = {-0.76, -0.74};
    double yRange[2] = {0.05, 0.1};

    const int indexTotalX = (xRange[1]-xRange[0])/D;
    const int indexTotalY = (yRange[1]-yRange[0])/D;

    thrust::complex<double> **c;
    //c = new complex<double> [N];
    cout << "debug_n" << endl;
    int **n;
    n = new int* [indexTotalY];
    c = new thrust::complex<double> * [indexTotalY];
    for(int i=0;i<indexTotalY;i++){
        n[i] = new int [indexTotalX];
        c[i] = new thrust::complex<double> [indexTotalX];
    }

    cout << "debug_n_end" << endl;

    for(int i = 0; i < indexTotalY; i++){
        for(int j = 0; j < indexTotalX; j++){
            thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
            c[i][j] = tmp;
            //n[i*sqrt(N)+j] = 0;
        }
    }

    // CUDA malloc
    cout << "CUDA malloc initializing..." << endl;  

    int **dN;
    thrust::complex<double> **dC;

    cudaError_t error;


    error = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 7*1024*1024*1024);
    if(error != cudaSuccess){
        cout << "cudaDeviceSetLimit's ERROR CODE = " << error << endl;
        return 0;
    }

    size_t tmpPitch;
    error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
    if(error != cudaSuccess){
        cout << "CUDA ERROR CODE = " << error << endl;
        cout << "indexTotalX = " << indexTotalX << endl;
        cout << "indexTotalY = " << indexTotalY << endl;
        return 0;
    }

    cout << "CUDA malloc done!" << endl;

There are several problems here:

int **dN;
...
error = cudaMallocPitch((void **)dN, &tmpPitch,(size_t)(indexTotalY*sizeof(int)), (size_t)(indexTotalX*sizeof(int)));
The correct type of pointer to use in CUDA allocations is a single pointer:

int *dN;

not a double pointer:

int **dN;
Accordingly, your kernel design, which expects to be passed triple pointers, will eventually need to be reworked as well:

void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.
For cudaMallocPitch, only the requested horizontal dimension (the width) is scaled by the size of the data element. The allocation height is not scaled that way. I will also assume that X corresponds to your allocation width and Y to your allocation height, so you have those parameters reversed as well:

error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
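For completeness, here is a minimal sketch of how a pitched allocation like the one above is usually indexed from device code, by offsetting each row with the returned pitch in bytes rather than through nested pointers. The kernel name and the launch shown in the comments are illustrative, not part of the posted program:

__global__ void fillPitched(int *dN, size_t pitch, int width, int height)
{
    for (int i = 0; i < height; i++) {
        // pitch is in bytes, so step through a char* before casting back to int*
        int *row = (int *)((char *)dN + i * pitch);
        for (int j = 0; j < width; j++) {
            row[j] = 0;
        }
    }
}

// Host side (matches the corrected cudaMallocPitch call above):
// int *dN;
// size_t tmpPitch;
// cudaMallocPitch((void **)&dN, &tmpPitch, indexTotalX*sizeof(int), indexTotalY);
// fillPitched<<<1, 1>>>(dN, tmpPitch, indexTotalX, indexTotalY);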
cudaLimitMallocHeapSize does not need to be set for any of this to work. It only applies to allocations made from device code (in-kernel malloc/new). Reserving 7GB on an 8GB card may also cause problems of its own. Until you are sure you need it, and it is not needed for anything you have shown, I would simply remove that call.
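To illustrate when that limit does matter, here is a self-contained sketch (not part of the posted code) in which the kernel itself calls malloc and therefore draws from the device heap governed by cudaLimitMallocHeapSize; host-side cudaMallocPitch never touches that heap:

#include <cstdio>

// Sketch only: the device heap sized by cudaLimitMallocHeapSize is what
// in-kernel malloc/free use. Host-side cudaMalloc/cudaMallocPitch do not.
__global__ void deviceHeapDemo(int n)
{
    int *scratch = (int *)malloc(n * sizeof(int)); // taken from the device heap
    if (scratch == NULL) {
        printf("device heap exhausted\n");
        return;
    }
    for (int i = 0; i < n; i++) scratch[i] = i;
    printf("scratch[%d] = %d\n", n - 1, scratch[n - 1]);
    free(scratch);
}

int main()
{
    // Only needed because the kernel above calls malloc; 64MB is an arbitrary size.
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024);
    deviceHeapDemo<<<1, 1>>>(1024);
    cudaDeviceSynchronize();
    return 0;
}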

$ cat t1488.cu
#include <iostream>
#include <thrust/complex.h>
#include <fstream>
#include <string>
#include <stdlib.h>
using namespace std;

#define D 0.0000025 // Tick
#define LIMIT_N 255
#define INF_NUM 2

#define PLOT_METHOD 2 // dat file : 0, ppm file : 1, ppm file with C : 2

__global__
void calculation(const int indexTotalX, const int indexTotalY, int ***n, thrust::complex<double> ***c){ // n, c are the pointers of dN, dC.

    for(int i = 0; i < indexTotalY ; i++){
        for(int j = 0; j < indexTotalX; j++){
            thrust::complex<double> z(0.0f, 0.0f);
            n[i][j] = 0;
            for(int ctr=1;  ctr <= LIMIT_N ; ctr++){
                z = z*z + (*(c[i][j]));
                n[i][j] = n[i][j] + (abs(z) < INF_NUM);
            }
        }
    }
}

int main(){

    // Data Path
    string filePath = "Y:\\Documents\\Programming\\mandelbrot\\";
    string fileName = "mandelbrot4.ppm";
    string filename = filePath+fileName;

    //complex<double> c[N][M];
    double xRange[2] = {-0.76, -0.74};
    double yRange[2] = {0.05, 0.1};

    const int indexTotalX = (xRange[1]-xRange[0])/D;
    const int indexTotalY = (yRange[1]-yRange[0])/D;

    thrust::complex<double> **c;
    //c = new complex<double> [N];
    cout << "debug_n" << endl;
    int **n;
    n = new int* [indexTotalY];
    c = new thrust::complex<double> * [indexTotalY];
    for(int i=0;i<indexTotalY;i++){
        n[i] = new int [indexTotalX];
        c[i] = new thrust::complex<double> [indexTotalX];
    }

    cout << "debug_n_end" << endl;

    for(int i = 0; i < indexTotalY; i++){
        for(int j = 0; j < indexTotalX; j++){
            thrust::complex<double> tmp( xRange[0]+j*D, yRange[0]+i*D );
            c[i][j] = tmp;
            //n[i*sqrt(N)+j] = 0;
        }
    }

    // CUDA malloc
    cout << "CUDA malloc initializing..." << endl;

    int *dN;
    thrust::complex<double> **dC;

    cudaError_t error;


    size_t tmpPitch;
    error = cudaMallocPitch((void **)&dN, &tmpPitch,(size_t)(indexTotalX*sizeof(int)), (size_t)(indexTotalY));
    if(error != cudaSuccess){
        cout << "CUDA ERROR CODE = " << error << endl;
        cout << "indexTotalX = " << indexTotalX << endl;
        cout << "indexTotalY = " << indexTotalY << endl;
        return 0;
    }

    cout << "CUDA malloc done!" << endl;
}
$ nvcc -o t1488 t1488.cu
t1488.cu(68): warning: variable "dC" was declared but never referenced

$ cuda-memcheck ./t1488
========= CUDA-MEMCHECK
debug_n
debug_n_end
CUDA malloc initializing...
CUDA malloc done!
========= ERROR SUMMARY: 0 errors
$

Could you please explain where the idea of tagging this question as parallel-processing came from? Each GPU thread that executes the kernel code iterates over the whole thread-local range and so covers the entire domain again, all 8k x 20k [x, y] points of the bounded rectangle of the complex plane. On top of that, the host-side code pre-populates all of the array-like data in a purely [SERIAL] sequential fashion. Where is any sign of at least an attempt to make any part of the posted code run with any degree of parallelism? Have I missed something?
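As a follow-up to that comment, a parallel formulation would typically assign one (x, y) sample to each GPU thread instead of having a single thread loop over the whole domain. The following is only a sketch of that idea, assuming the flat pitched layout from the answer; the kernel name, literal constants, and launch geometry are illustrative, not the poster's code:

#include <thrust/complex.h>

// Sketch: one thread per point of the grid, writing into a pitched int buffer.
__global__ void mandelKernel(int *dN, size_t pitch, int width, int height,
                             double x0, double y0, double d)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x; // column (x index)
    int i = blockIdx.y * blockDim.y + threadIdx.y; // row (y index)
    if (j >= width || i >= height) return;

    thrust::complex<double> c(x0 + j * d, y0 + i * d);
    thrust::complex<double> z(0.0, 0.0);
    int count = 0;
    for (int ctr = 1; ctr <= 255; ctr++) {    // same iteration count as LIMIT_N
        z = z * z + c;
        count += (abs(z) < 2.0);              // same escape radius as INF_NUM
    }
    int *row = (int *)((char *)dN + i * pitch); // pitch is in bytes
    row[j] = count;
}

// Possible launch (illustrative):
// dim3 block(16, 16);
// dim3 grid((indexTotalX + block.x - 1) / block.x,
//           (indexTotalY + block.y - 1) / block.y);
// mandelKernel<<<grid, block>>>(dN, tmpPitch, indexTotalX, indexTotalY,
//                               xRange[0], yRange[0], D);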