GPU（CUDA）和CPU的计算结果不同_C_Cuda_Fractals

GPU（CUDA）和CPU的计算结果不同

c cuda

GPU（CUDA）和CPU的计算结果不同,c,cuda,fractals,C,Cuda,Fractals,我想创建一个在我的GPU上生成分形的程序。首先，我用C创建了一个工作项目，然后我尝试将其转换为CUDA/C 不幸的是，在我这么做之后，我发现CPU和GPU的结果有所不同我花了几个小时思考我做错了什么，这对我来说是个谜 IMO：似乎while循环中的计算值存在差异，因此它比正常的CPU函数结束得更早问：有没有可能是真的？如果，我能做些什么来避免这种计算错误呢这是我的全部代码： // C libs #include <stdint.h> #include <stdio.h&

我想创建一个在我的GPU上生成分形的程序。首先，我用C创建了一个工作项目，然后我尝试将其转换为CUDA/C

不幸的是，在我这么做之后，我发现CPU和GPU的结果有所不同

我花了几个小时思考我做错了什么，这对我来说是个谜

IMO：似乎while循环中的计算值存在差异，因此它比正常的CPU函数结束得更早

问：有没有可能是真的？如果，我能做些什么来避免这种计算错误呢

这是我的全部代码：

// C libs
#include <stdint.h>
#include <stdio.h>
#include <iostream>

// Help libs
#include <windows.h>
#include <math.h>

// CUDA libs
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

__global__ void calulateFractal(unsigned char *a, int N, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if(i < N)
    {
        int x = i % width;
        int y = i / width;

        double p_im = y * ratioY + minY;
        double p_re = x * ratioX + minX;

        double z_re = p_re;
        double z_im = p_im;

        int iteration = 0;

        while ((z_re * z_re + z_im * z_im) < 4 && iteration < maxLevel)
        {
            double tmp_re = z_re * z_re - z_im * z_im + c_re;
            double tmp_im = 2 * z_re * z_im + c_im;
            z_re = tmp_re;
            z_im = tmp_im;
            iteration++;
        }

        a[i] = iteration;
    }
}

void calulateFractalCPU(unsigned char *a, int i, double c_re, double c_im, int width, int height, double minX, double maxX, double minY, double maxY, double ratioX, double ratioY, int maxLevel)
{
    int x = i % width;
    int y = i / width;

    double p_im = y * ratioY + minY;
    double p_re = x * ratioX + minX;

    double z_re = p_re;
    double z_im = p_im;

    int iteration = 0;

    while ((z_re * z_re + z_im * z_im) < 4 && iteration < 99)
    {
        double tmp_re = z_re * z_re - z_im * z_im + c_re;
        double tmp_im = 2 * z_re * z_im + c_im;
        z_re = tmp_re;
        z_im = tmp_im;
        iteration++;
    }

    a[i] = iteration;
}

int saveFractalToBitmap(unsigned char **colorsArray, unsigned char *bitmap, int width, int height, char *filename)
{
    // Bitmap structures to be written to file
    BITMAPFILEHEADER bfh;
    BITMAPINFOHEADER bih;

    // Fill BITMAPFILEHEADER structure
    memcpy((char *)&bfh.bfType, "BM", 2);
    bfh.bfSize = sizeof(bfh) + sizeof(bih) + 3*height*width;
    bfh.bfReserved1 = 0;
    bfh.bfReserved2 = 0;
    bfh.bfOffBits = sizeof(bfh) + sizeof(bih);

    // Fill BITMAPINFOHEADER structure
    bih.biSize = sizeof(bih);
    bih.biWidth = width;
    bih.biHeight = height;
    bih.biPlanes = 1;
    bih.biBitCount = 24;
    bih.biCompression = BI_RGB; // uncompressed 24-bit RGB
    bih.biSizeImage = 0; // can be zero for BI_RGB bitmaps
    bih.biXPelsPerMeter = 3780; // 96dpi equivalent
    bih.biYPelsPerMeter = 3780;
    bih.biClrUsed = 0;
    bih.biClrImportant = 0;

    // Open bitmap file (binary mode)
    FILE *f;
    f = fopen(filename, "wb");

    if(f == NULL) 
        return -1;

    // Write bitmap file header
    fwrite(&bfh, 1, sizeof(bfh), f);
    fwrite(&bih, 1, sizeof(bih), f);

    // Write bitmap pixel data starting with the
    // bottom line of pixels, left hand side
    for (int i = 0; i < width * height ; i++)
    {
        // Write pixel components in BGR order
        fputc(colorsArray[bitmap[i]][2], f);
        fputc(colorsArray[bitmap[i]][1], f);
        fputc(colorsArray[bitmap[i]][0], f);
    }

    // Close bitmap file
    fclose(f);

    return 0;
}

int main()
{
    unsigned char **colorsArray;
    unsigned char *fractalLevelsCPU;
    unsigned char *fractalLevelsGPU;

    double minX = -1.7;
    double maxX = 1.7;
    double minY = -1.5;
    double maxY = 1.5;

    double input_re = -0.79;
    double input_im = 0.1463;

    int width = 10;
    int height = 5;
    int N = width * height;
    int maxLevel = 100;
    size_t levelsArraySize = N * sizeof(unsigned char);

    double ratioX = (maxX - minX) / (double) width;
    double ratioY = (maxY - minY) / (double) height;

    bool gpu = true;

    // Allocate memory
    colorsArray = (unsigned char**) malloc((maxLevel+1) * sizeof(unsigned char*));
    for(int i=0; i<=maxLevel; i++)
    {
        colorsArray[i] = (unsigned char *) malloc(3 * sizeof(unsigned char));
        colorsArray[i][0] = (int) (255.0 * i / maxLevel);
        colorsArray[i][1] = (int) (255.0 * i / maxLevel);
        colorsArray[i][2] = (int) (255.0 * log((double) i) / log((double) maxLevel));
    }

    fractalLevelsCPU = (unsigned char*) malloc(levelsArraySize);
    cudaMalloc((unsigned char **) &fractalLevelsGPU, levelsArraySize);

    cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyHostToDevice);

    if(gpu)
    {
        // Run GPU method
        calulateFractal <<< 1, N >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);

        // Copy data from GPU to CPU array
        cudaMemcpy(fractalLevelsCPU, fractalLevelsGPU, levelsArraySize, cudaMemcpyDeviceToHost);
    }
    else
    {
        // Iterate every element in array and compute level of fractal
        for(int i=0; i<N; i++)
        {
            calulateFractalCPU(fractalLevelsCPU, i, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);
        }
    }

    // Show results
    for(int i=0; i<N; i++)
    {
        if((i % width) == 0) 
            printf("\n");

        printf("%d\t", fractalLevelsCPU[i]);    
    }
    //saveFractalToBitmap(colorsArray, fractalLevelsCPU, width, height, "frac.bmp");

    // Free memory
    for(int i=0; i<=maxLevel; i++)
    {
        free(colorsArray[i]);
    }
    free(colorsArray);

    free(fractalLevelsCPU);
    cudaFree(fractalLevelsGPU);

    return 0;
}

/C libs
#包括
#包括
#包括
//帮助图书馆
#包括
#包括
//库达图书馆
#包括“cuda.h”
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
__全局无效计算分形（无符号字符*a，整数N，双c，双c，双c，整数宽度，整数高度，双最小值，双最大值，双最小值，双最大值，双比率，双比率，双比率，整数最大值）
{
int i=blockIdx.x*blockDim.x+threadIdx.x；
if（i对于（inti=0；i我已经找到了问题的解决方案
首先，每个块的线程数应该是两个数的幂。
我还意识到我的GPU对每个块的线程数和块本身都有限制。
NVIDIA Utils向我展示了我可以使用最多65536个块和每个块512个线程
解决方案：
int threadsPerBlock = 512;
int blocksNumber = N/threadsPerBlock + (N % threadsPerBlock == 0 ? 0:1);

if(blocksNumber > 65536) 
    return -1;

calulateFractal <<< blocksNumber, threadsPerBlock >>> (fractalLevelsGPU, N, input_re, input_im, width, height, minX, maxX, minY, maxY, ratioX, ratioY, maxLevel);

int-threadsPerBlock=512；
int blocksNumber=N/threadsPerBlock+（N%threadsPerBlock==0？0:1）；
如果（块数>65536）
返回-1；
calulateFractal>（fractalLevelsGPU，N，input_re，input_im，width，height，minX，maxX，minY，maxY，ratioX，ratioY，maxLevel）；
您似乎忘了问一个问题。您确定了解cuda的工作原理吗？您确实意识到您正在GPU上同时（或多或少）运行calulateFractal N*N次。还要注意，只执行前N个值（blockIDx为0时，它变为N）。请看这里：[1]：[2]：@localhost…似乎是精度问题。那么您将无法使用double类型的变量。@KiaMorot…请更正您的信息。