多次执行后CUDA程序的结果不一致 描述

多次执行后CUDA程序的结果不一致 描述,c,cuda,C,Cuda,我尝试在GPU上用2D网格和2D块进行矩阵求和,在执行了几次程序后得到了不同的结果。对此行为的任何解释或修复都会有所帮助,谢谢 细节 大多数情况下,CPU上的结果与GPU上的结果一致。但有时(比如,在操作系统启动之后),程序会告诉我们结果不一致。但之后的所有执行都将产生一致的结果(而且运行速度似乎更快)。 我还没有找到一种保证复制这种行为的方法。我再次尝试重新启动操作系统,但程序的第一次执行产生了一致的结果 代码 主函数在CPU和GPU上对两个2^10×2^10的矩阵执行求和(使用2^5×2^5

我尝试在GPU上用2D网格和2D块进行矩阵求和,在执行了几次程序后得到了不同的结果。对此行为的任何解释或修复都会有所帮助,谢谢

细节 大多数情况下,CPU上的结果与GPU上的结果一致。但有时(比如,在操作系统启动之后),程序会告诉我们结果不一致。但之后的所有执行都将产生一致的结果(而且运行速度似乎更快)。 我还没有找到一种保证复制这种行为的方法。我再次尝试重新启动操作系统,但程序的第一次执行产生了一致的结果

代码 主函数在CPU和GPU上对两个2^10×2^10的矩阵执行求和(使用2^5×2^5网格和2^5×2^5块),并比较结果

#include "stdio.h"
#define FALSE 0
#define TRUE !FALSE
double *mallocMatrix(const int row, const int column)
{
    return (double*)malloc(row*column*sizeof(double));
}

void matrixInit(double *matrix, const int row, const int column)
{
    ;
}


int matEqual(double *mat1, double *mat2, const int row, const int column)
{
    for(int i=0;i<row;i++)
    {
        for(int j=0;j<column;j++)
        {
            int k=i*column+j;
            if(mat1[k]!=mat2[k])
            {
                printf("Entry %d doens't match.\n",k);
                return FALSE;
            }
        }
    }
    return TRUE;
}

void matrixSumCpu(double *m1, double *m2, double *n, const int row, const int column)
{
    for(int i=0; i<row; i++)
    {
        for(int j=0; j<column; j++)
        {
            int k = i * column + j;
            n[k]=m1[k]+m2[k];
        }
    }
}

__global__ void _2dGrid2dBlockMatSum(double *m1, double *m2, double *n, const int row, const int column)
{
    int rowIndex=blockIdx.x*blockDim.x+threadIdx.x;
    int columnIndex=blockIdx.y*blockDim.y+threadIdx.y;
    if(rowIndex<row&&columnIndex<column)
    {
        int i=rowIndex*column+columnIndex;//flatten
        n[i]=m1[i]+m2[i];
    }
}


void checkGpuMalloc(cudaError_t code)
{
    if(code != cudaSuccess)
    {
        exit(-1);
        printf("CUDA ERROR occured. ");
    }
}

void printMatrix(double *mat, const int row, const int column)
{
    const int rowToPrint=3;
    const int columnToPrint=6;
    for(int i=0;i<rowToPrint;i++)
    {
        for(int j=0;j<columnToPrint;j++)
            printf("%lf", mat[i*column+j]);
        if(column>columnToPrint)
            printf("...");
        printf("\n");
    }
    if(row>rowToPrint)
        printf("...\n");
}

int main()
{
    int row=1<<10, column=1<<10;
    double *h_m1=NULL, *h_m2=NULL,*h_n1=NULL, *h_n2=NULL;//n=m1+m2
    h_m1=mallocMatrix(row, column);
    h_m2=mallocMatrix(row, column);
    h_n1=mallocMatrix(row, column);
    h_n2=mallocMatrix(row, column);
    if(h_m1==NULL||h_m2==NULL||h_n1==NULL||h_n2==NULL)
    {
        printf("Unable to allocate enough memory on CPU\n");
        exit(-1);
    }
    matrixInit(h_m1,row,column);
    matrixInit(h_m2,row,column);
    printf("Summing matrices on CPU...\n");
    matrixSumCpu(h_m1,h_m2,h_n1,row,column);
    double *d_m1=NULL, *d_m2=NULL, *d_n=NULL;
    checkGpuMalloc(cudaMalloc((void**)&d_m1, row*column*sizeof(double)));
    checkGpuMalloc(cudaMalloc((void**)&d_m2, row*column*sizeof(double)));
    checkGpuMalloc(cudaMalloc((void**)&d_n, row*column*sizeof(double)));
    cudaMemcpy(d_m1, h_m1, row*column*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_m2, h_m2, row*column*sizeof(double), cudaMemcpyHostToDevice);
    printf("Summing matrices on GPU with 2D grid and 2D blocks.\n");
    _2dGrid2dBlockMatSum<<<(1<<5,1<<5),(1<<5, 1<<5)>>>(d_m1, d_m2, d_n, row, column);
    cudaDeviceSynchronize();    
    cudaMemcpy(h_n2, d_n, row*column*sizeof(double), cudaMemcpyDeviceToHost);
    if(matEqual(h_n1, h_n2, row, column))
        printf("Matrices match.\n");
    else
    {
        printf("Matrices don't match.\nResult on CPU:\n");
        printMatrix(h_n1, row, column);
        printf("Result on GPU:");
        printMatrix(h_n2, row, column);
    }
    free(h_m1);
    free(h_m2);
    free(h_n1);
    free(h_n2);
    cudaFree(d_m1);
    cudaFree(d_m2);
    cudaFree(d_n);
    cudaDeviceReset();
    return 0;
}
#包括“stdio.h”
#定义FALSE 0
#定义为真!错误的
double*mallocMatrix(常数整型行,常数整型列)
{
返回(双精度*)malloc(行*列*大小(双精度));
}
void matrixInit(双*矩阵,常量整数行,常量整数列)
{
;
}
int matEqual(双*mat1,双*mat2,常量int行,常量int列)
{

对于(int i=0;i这与您认为的不同,当我编译代码时,编译器会在这一行发出警告:

_2dGrid2dBlockMatSum<<<(1<<5,1<<5),(1<<5, 1<<5)>>>(d_m1, d_m2, d_n, row, column);
并在内核调用之前添加以下行:

cudaMemset(d_n, 0, row*column*sizeof(double));
然后编译并运行它,它就会失败

然后,按照我的建议更改
dim3
,它就会修复它

下面是一个固定的示例:

#include "stdio.h"
#define FALSE 0
#define TRUE !FALSE
double *mallocMatrix(const int row, const int column)
{
    return (double*)malloc(row*column*sizeof(double));
}

void matrixInit(double *matrix, const int row, const int column)
{
    for (int i = 0; i < row; i++)
      for (int j = 0; j < column; j++)
        matrix[i*column+j] = 1;
}


int matEqual(double *mat1, double *mat2, const int row, const int column)
{
    for(int i=0;i<row;i++)
    {
        for(int j=0;j<column;j++)
        {
            int k=i*column+j;
            if(mat1[k]!=mat2[k])
            {
                printf("Entry %d doens't match.\n",k);
                return FALSE;
            }
        }
    }
    return TRUE;
}

void matrixSumCpu(double *m1, double *m2, double *n, const int row, const int column)
{
    for(int i=0; i<row; i++)
    {
        for(int j=0; j<column; j++)
        {
            int k = i * column + j;
            n[k]=m1[k]+m2[k];
        }
    }
}

__global__ void _2dGrid2dBlockMatSum(double *m1, double *m2, double *n, const int row, const int column)
{
    int rowIndex=blockIdx.x*blockDim.x+threadIdx.x;
    int columnIndex=blockIdx.y*blockDim.y+threadIdx.y;
    if(rowIndex<row&&columnIndex<column)
    {
        int i=rowIndex*column+columnIndex;//flatten
        n[i]=m1[i]+m2[i];
    }
}


void checkGpuMalloc(cudaError_t code)
{
    if(code != cudaSuccess)
    {
        exit(-1);
        printf("CUDA ERROR occured. ");
    }
}

void printMatrix(double *mat, const int row, const int column)
{
    const int rowToPrint=3;
    const int columnToPrint=6;
    for(int i=0;i<rowToPrint;i++)
    {
        for(int j=0;j<columnToPrint;j++)
            printf("%lf", mat[i*column+j]);
        if(column>columnToPrint)
            printf("...");
        printf("\n");
    }
    if(row>rowToPrint)
        printf("...\n");
}

int main()
{
    int row=1<<10, column=1<<10;
    double *h_m1=NULL, *h_m2=NULL,*h_n1=NULL, *h_n2=NULL;//n=m1+m2
    h_m1=mallocMatrix(row, column);
    h_m2=mallocMatrix(row, column);
    h_n1=mallocMatrix(row, column);
    h_n2=mallocMatrix(row, column);
    if(h_m1==NULL||h_m2==NULL||h_n1==NULL||h_n2==NULL)
    {
        printf("Unable to allocate enough memory on CPU\n");
        exit(-1);
    }
    matrixInit(h_m1,row,column);
    matrixInit(h_m2,row,column);
    printf("Summing matrices on CPU...\n");
    matrixSumCpu(h_m1,h_m2,h_n1,row,column);
    double *d_m1=NULL, *d_m2=NULL, *d_n=NULL;
    checkGpuMalloc(cudaMalloc((void**)&d_m1, row*column*sizeof(double)));
    checkGpuMalloc(cudaMalloc((void**)&d_m2, row*column*sizeof(double)));
    checkGpuMalloc(cudaMalloc((void**)&d_n, row*column*sizeof(double)));
    cudaMemcpy(d_m1, h_m1, row*column*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_m2, h_m2, row*column*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemset(d_n, 0, row*column*sizeof(double));
    printf("Summing matrices on GPU with 2D grid and 2D blocks.\n");
    printf("%d\n", (1<<5,1<<5));
    _2dGrid2dBlockMatSum<<<(1<<5,1<<5),(1<<5, 1<<5)>>>(d_m1, d_m2, d_n, row, column);
    cudaDeviceSynchronize();
    cudaMemcpy(h_n2, d_n, row*column*sizeof(double), cudaMemcpyDeviceToHost);
    if(matEqual(h_n1, h_n2, row, column))
        printf("Matrices match.\n");
    else
    {
        printf("Matrices don't match.\nResult on CPU:\n");
        printMatrix(h_n1, row, column);
        printf("Result on GPU:");
        printMatrix(h_n2, row, column);
    }
    free(h_m1);
    free(h_m2);
    free(h_n1);
    free(h_n2);
    cudaFree(d_m1);
    cudaFree(d_m2);
    cudaFree(d_n);
    cudaDeviceReset();
    return 0;
}
#包括“stdio.h”
#定义FALSE 0
#定义真!假
double*mallocMatrix(常数整型行,常数整型列)
{
返回(双精度*)malloc(行*列*大小(双精度));
}
void matrixInit(双*矩阵,常量整数行,常量整数列)
{
对于(int i=0;i(1<<5,1<<5)
void matrixInit(double *matrix, const int row, const int column)
{
    for (int i = 0; i < row; i++)
      for (int j = 0; j < column; j++)
        matrix[i*column+j] = 1;
}
cudaMemset(d_n, 0, row*column*sizeof(double));
#include "stdio.h"
#define FALSE 0
#define TRUE !FALSE
double *mallocMatrix(const int row, const int column)
{
    return (double*)malloc(row*column*sizeof(double));
}

void matrixInit(double *matrix, const int row, const int column)
{
    for (int i = 0; i < row; i++)
      for (int j = 0; j < column; j++)
        matrix[i*column+j] = 1;
}


int matEqual(double *mat1, double *mat2, const int row, const int column)
{
    for(int i=0;i<row;i++)
    {
        for(int j=0;j<column;j++)
        {
            int k=i*column+j;
            if(mat1[k]!=mat2[k])
            {
                printf("Entry %d doens't match.\n",k);
                return FALSE;
            }
        }
    }
    return TRUE;
}

void matrixSumCpu(double *m1, double *m2, double *n, const int row, const int column)
{
    for(int i=0; i<row; i++)
    {
        for(int j=0; j<column; j++)
        {
            int k = i * column + j;
            n[k]=m1[k]+m2[k];
        }
    }
}

__global__ void _2dGrid2dBlockMatSum(double *m1, double *m2, double *n, const int row, const int column)
{
    int rowIndex=blockIdx.x*blockDim.x+threadIdx.x;
    int columnIndex=blockIdx.y*blockDim.y+threadIdx.y;
    if(rowIndex<row&&columnIndex<column)
    {
        int i=rowIndex*column+columnIndex;//flatten
        n[i]=m1[i]+m2[i];
    }
}


void checkGpuMalloc(cudaError_t code)
{
    if(code != cudaSuccess)
    {
        exit(-1);
        printf("CUDA ERROR occured. ");
    }
}

void printMatrix(double *mat, const int row, const int column)
{
    const int rowToPrint=3;
    const int columnToPrint=6;
    for(int i=0;i<rowToPrint;i++)
    {
        for(int j=0;j<columnToPrint;j++)
            printf("%lf", mat[i*column+j]);
        if(column>columnToPrint)
            printf("...");
        printf("\n");
    }
    if(row>rowToPrint)
        printf("...\n");
}

int main()
{
    int row=1<<10, column=1<<10;
    double *h_m1=NULL, *h_m2=NULL,*h_n1=NULL, *h_n2=NULL;//n=m1+m2
    h_m1=mallocMatrix(row, column);
    h_m2=mallocMatrix(row, column);
    h_n1=mallocMatrix(row, column);
    h_n2=mallocMatrix(row, column);
    if(h_m1==NULL||h_m2==NULL||h_n1==NULL||h_n2==NULL)
    {
        printf("Unable to allocate enough memory on CPU\n");
        exit(-1);
    }
    matrixInit(h_m1,row,column);
    matrixInit(h_m2,row,column);
    printf("Summing matrices on CPU...\n");
    matrixSumCpu(h_m1,h_m2,h_n1,row,column);
    double *d_m1=NULL, *d_m2=NULL, *d_n=NULL;
    checkGpuMalloc(cudaMalloc((void**)&d_m1, row*column*sizeof(double)));
    checkGpuMalloc(cudaMalloc((void**)&d_m2, row*column*sizeof(double)));
    checkGpuMalloc(cudaMalloc((void**)&d_n, row*column*sizeof(double)));
    cudaMemcpy(d_m1, h_m1, row*column*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(d_m2, h_m2, row*column*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemset(d_n, 0, row*column*sizeof(double));
    printf("Summing matrices on GPU with 2D grid and 2D blocks.\n");
    printf("%d\n", (1<<5,1<<5));
    _2dGrid2dBlockMatSum<<<(1<<5,1<<5),(1<<5, 1<<5)>>>(d_m1, d_m2, d_n, row, column);
    cudaDeviceSynchronize();
    cudaMemcpy(h_n2, d_n, row*column*sizeof(double), cudaMemcpyDeviceToHost);
    if(matEqual(h_n1, h_n2, row, column))
        printf("Matrices match.\n");
    else
    {
        printf("Matrices don't match.\nResult on CPU:\n");
        printMatrix(h_n1, row, column);
        printf("Result on GPU:");
        printMatrix(h_n2, row, column);
    }
    free(h_m1);
    free(h_m2);
    free(h_n1);
    free(h_n2);
    cudaFree(d_m1);
    cudaFree(d_m2);
    cudaFree(d_n);
    cudaDeviceReset();
    return 0;
}