CUDA（来自C+；+；）双曲触发器函数在不同位置计算不同的结果_Cuda

CUDA（来自C+；+；）双曲触发器函数在不同位置计算不同的结果

cuda

CUDA（来自C+；+；）双曲触发器函数在不同位置计算不同的结果,cuda,Cuda,我写的一个模拟遇到了一个奇怪的问题。我最近重组了我的代码，使事情更干净、更有条理。基本上（除其他外）我将讨论中的CUDA函数移动（基本上复制粘贴）到另一个文件中。此函数使用asinh计算某物，以及sinh和cosh。我注意到，在移动之前，函数生成的预期结果与手工计算的值（在excel中）一致。移动后，双曲函数被输入相同的输入，但结果却显著不同（在asinh中高达10%，在sinh中高达0.5%）。这实际上破坏了我的模拟。我对该功能的其余部分充满信心编辑：在进一步的测试中，我发现了有关角度（l

我写的一个模拟遇到了一个奇怪的问题。我最近重组了我的代码，使事情更干净、更有条理。基本上（除其他外）我将讨论中的CUDA函数移动（基本上复制粘贴）到另一个文件中。此函数使用

asinh

计算某物，以及

sinh

和

cosh

。我注意到，在移动之前，函数生成的预期结果与手工计算的值（在excel中）一致。移动后，双曲函数被输入相同的输入，但结果却显著不同（在

asinh

中高达10%，在

sinh

中高达0.5%）。这实际上破坏了我的模拟。我对该功能的其余部分充满信心

编辑：在进一步的测试中，我发现了有关角度（lambdaDegrees）的硬编码值，即

double x{asinh（sqrt（3.0）*sin（lambdaDegrees*3.1415927/180.0））-产生（良好的）预期结果。在执行方程式之前和之后测量角度时，角度不变，但如果不对值进行硬编码，则会产生错误的结果。最奇怪的部分是简单地添加另一个诊断printf函数，导致该函数产生另一个（错误的）结果。我想知道这是否与我在GPU上设置回调函数的方式有关……也许多个线程同时使用该函数会导致一些（一致的）未定义的行为
在对代码进行了一些修改后，我重现了错误。getSAtLambda（printf语句）中x的预期值为1.268。。。结果是1.768。。。让我知道你的想法
main.cu
//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"

typedef double(*callbackFcn)(double*, int, double, double, int);

//on GPU global variables
extern __device__ double*     fieldConstArray_GPU;
extern __device__ int         arraySize_GPU;
extern __device__ callbackFcn callback_GPU;

__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd);
__global__ void setupEnvironmentGPU(double* constArrayPtr);

__global__ void execute()
{
    int thdInd{ blockIdx.x * blockDim.x + threadIdx.x };
    callback_GPU(fieldConstArray_GPU, arraySize_GPU, (thdInd == 31487) ? 1233005.097 : ((115200 - thdInd) / 50000.0 * 6.371e6), 0.0, thdInd ); //3rd argument are example values
}

void setupEnvironment()
{// consts: [ B0, ILATDeg, L, L_norm, s_max ]
    double fieldConstArray_h[]{ 3.12e-5, 72.0, 66717978.17, 10.47213595, 85670894.1 };
    double* fieldConstants_d{ nullptr };

    cudaMalloc((void **)&fieldConstants_d, 5 * sizeof(double));
    cudaMemcpy(fieldConstants_d, fieldConstArray_h, 5 * sizeof(double), cudaMemcpyHostToDevice);

    setupEnvironmentGPU <<< 1, 1 >>> (fieldConstants_d);
}

int main()
{
    setupEnvironment();
    int loops{ 0 };

    while (loops < 3)
    {
        execute <<< 115200 / 256, 256 >>> ();
        cudaDeviceSynchronize();
        loops++;
    }

    return 0;
}

//CUDA包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括“cuda_profiler_api.h”
typedef double（*callbackpcn）（double*，int，double，double，int）；
//关于GPU全局变量
外部设备双*现场总线阵列GPU；
外部设备阵列化GPU；
外部设备调用回调GPU；
__主机设备双B字段（双*常数，整数数组长度，双s，双simtime，整数thdInd）；
__主机设备双梯度（双*常数，整数排列长度，双s，双同步时间，整数thdInd）；
__全局无效设置环境GPU（双*常数）；
__全局无效执行（）
{
int thdInd{blockIdx.x*blockDim.x+threadIdx.x}；
回调_GPU（fieldconstrarray_GPU，arraySize_GPU，（thdInd==31487）？1233005.097:（（115200-thdInd）/50000.0*6.371e6），0.0，thdInd）；//第三个参数是示例值
}
void setupEnvironment（）
{//consts:[B0，ILATDeg，L，L_范数，s_max]
双字段常量数组_h[]{3.12e-5,72.0,66717978.17,10.47213595,85670894.1}；
双*字段常量{nullptr}；
Cudamaloc（（void**）和fieldConstants_d，5*sizeof（double））；
cudaMemcpy（fieldConstants_d，fieldconstrarray_h，5*sizeof（双精度），cudaMemcpyHostToDevice）；
设置环境GPU>（字段常量d）；
}
int main（）
{
设置环境（）；
int循环{0}；
while（循环<3）
{
执行>（）；
cudaDeviceSynchronize（）；
循环++；
}
返回0；
}

otherfunctions.cu
#include <cmath>
#include <iostream>

//CUDA includes
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda_profiler_api.h"

typedef double(*callbackFcn)(double*, int, double, double, int);

__device__ double*     fieldConstArray_GPU{ nullptr };
__device__ int         arraySize_GPU{ 7 };
__device__ callbackFcn callback_GPU{ nullptr };

__host__ __device__ double getSAtLambda(double* consts, int arrayLength, double lambdaDegrees, double simtime, int thdInd)
{//returns s in units of L
    double x{ asinh(sqrt(3.0) * sin(lambdaDegrees * 3.1415927 / 180.0)) };

    if (simtime == 0.0 && thdInd == 31487) { printf("\n\ngetSAtLambda: %f, %f\n\n", lambdaDegrees, x); }

    return (0.5 * consts[2] / sqrt(3.0)) * (x + sinh(x) * cosh(x));
}

__host__ __device__ double getLambdaAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    double lambda_tmp{ (-consts[1] / consts[4]) * s + consts[1] }; //-ILAT / s_max * s + ILAT
    double s_tmp{ consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, thdInd) };
    double dlambda{ 1.0 };
    bool   over{ 0 };

    while (abs((s_tmp - s) / s) > 1e-4) //errorTolerance
    {
        while (1)
        {
            over = (s_tmp >= s);
            if (over)
            {
                lambda_tmp += dlambda;
                s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
                if (s_tmp < s)
                    break;
            }
            else
            {
                lambda_tmp -= dlambda;
                s_tmp = consts[4] - getSAtLambda(consts, arrayLength, lambda_tmp, simtime, 0);
                if (s_tmp >= s)
                    break;
            }
        }
        if (dlambda < 1e-4 / 100.0) //errorTolerance
            break;
        dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
    }

    return lambda_tmp;
}

__host__ __device__ double BFieldAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{// consts: [ B0, ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    double lambda_deg{ getLambdaAtS(consts, arrayLength, s, simtime, thdInd) };
    double lambda_rad{ lambda_deg * 3.1415927 / 180.0 };
    double rnorm{ consts[3] * pow(cos(lambda_rad), 2) };

    return -consts[0] / pow(rnorm, 3) * sqrt(1.0 + 3 * pow(sin(lambda_rad), 2));
}

__host__ __device__ double gradBAtS(double* consts, int arrayLength, double s, double simtime, int thdInd)
{
    return (BFieldAtS(consts, arrayLength, s + consts[5], simtime, thdInd) - BFieldAtS(consts, arrayLength, s - consts[5], simtime, thdInd)) / (2 * consts[5]);
}

__global__ void setupEnvironmentGPU(double* constArrayPtr)
{
    callback_GPU = gradBAtS; //sets pointer to callback function
    arraySize_GPU = 7;
    fieldConstArray_GPU = constArrayPtr;
}

#包括
#包括
//CUDA包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括“cuda_profiler_api.h”
typedef double（*callbackpcn）（double*，int，double，double，int）；
__设备uuuuudouble*字段常量数组GPU{nullptr}；
__设备阵列化GPU{7}；
__设备uuu_uuuu回调u GPU{nullptr}；
__主机设备双getSAtLambda（双*常数，整数数组长度，双LambdDegrees，双simtime，整数thdInd）
{//返回以L为单位的s
双x{asinh（sqrt（3.0）*sin（lambdaDegrees*3.1415927/180.0））；
如果（simtime==0.0&&thdInd==31487）{printf（“\n\ngetSAtLambda:%f，%f\n\n”，lambdaDegrees，x）；}
返回（0.5*consts[2]/sqrt（3.0））*（x+sinh（x）*cosh（x））；
}
__主机设备双GetLambdats（双*常数，整数数组长度，双s，双simtime，整数thdInd）
{//consts:[B0，ILATDeg，L，L_范数，s_max，ds，容错]
双lambda_tmp{（-consts[1]/consts[4]）*s+consts[1]}；//-ILAT/s_max*s+ILAT
双s_tmp{consts[4]-getSAtLambda（consts，arrayLength，lambda_tmp，simtime，thdInd）}；
双dlambda{1.0}；
{0}上的布尔；
while（abs（（s_tmp-s）/s）>1e-4）//容错
{
而(1)
{
超过=（s_tmp>=s）；
如果（超过）
{
lambda_tmp+=dlambda；
s_tmp=consts[4]-getSAtLambda（consts，arraylelength，lambda_tmp，simtime，0）；
如果（s_tmp=s）
打破
}
}
if（dlambda<1e-4/100.0）//容错
打破
dlambda/=5.0；//通过反复试验，这通常会减少计算次数（与2、2.5、3、4、10相比）
}
返回lambda_tmp；
}
__主机设备双B字段（双*常数，整数数组长度，双s，双simtime，整数thdInd）
{//consts:[B0，ILATDeg，L，L_范数，s_max，ds，容错]
双λ{getLambdaAtS（常数，排列长度，s，simtime，thdInd）}；
双lambda_rad{lambda_deg*3.1415927/180.0}；
双范数{consts[3]*pow（cos（lambda_rad），2）}；
返回常数[0]/pow（rnorm，3）*sqrt（1.0+3*pow（sin（lambda_rad），2））；
}
__主机设备双梯度（双*常数，整数排列长度，双s，双simtime，整数thdInd）
{
返回（b字段数据（常量，数组长度，s+常量[5]，simtime，thdInd）-b字段数据（常量，数组长度，s-c