C++ cuda错误:意外启动失败
我是CUDA的新手,我正在尝试在CUDA上实现数值积分。我的代码出现错误“意外启动失败”(unspecified launch failure)。它看起来像是GPU内存中的某个故障。但这很奇怪,因为它依赖于变量
step
,该变量控制迭代次数,而不是任何内存操作。例如,当我使用step=0.00001
运行integrate\u with_cuda
时,它工作正常,结果是正确的,但如果我在0.000001
上更改step
,我的程序将失败
这是我的代码:
#include "device_launch_parameters.h"
#include "cuda_runtime_api.h"
#include "cuda.h"
#include "cuda_safe_call.h"
#include <cmath>
#include <iostream>
#include <vector>
// Signature shared by all device-side binary operations (see add_func / mult_func below).
typedef double(*cuda_func)(double, double);
// One instruction of the interpreted expression program executed by
// computate_expr: vars[dest] = func(vars[op1], vars[op2]).
struct cuda_expr {
cuda_func func;
int dest;
int op1;
int op2;
};
// Operations the host can request a device function pointer for via get_cuda_func.
enum cuda_method {
cm_Add,
cm_Mult
};
// Device-side addition. The address of a __device__ function is only
// meaningful in device code, so it is exported to the host through the
// __device__ symbol p_add_func below (read with cudaMemcpyFromSymbol
// in get_cuda_func).
__device__ double add_func(double x, double y) {
return x + y;
}
__device__ cuda_func p_add_func = add_func;
// Device-side multiplication, exported through p_mult_func the same way.
__device__ double mult_func(double x, double y) {
return x*y;
}
__device__ cuda_func p_mult_func = mult_func;
// Resolve the device-side function pointer for the requested operation.
// Device function addresses cannot be taken in host code, so each operation
// publishes its pointer in a __device__ symbol (p_add_func / p_mult_func)
// which is copied back to the host here.
// Returns NULL for an unrecognized method.
__host__ cuda_func get_cuda_func(cuda_method method) {
    cuda_func result = NULL;
    switch (method) {
    case cm_Add:
        // Previously these return codes were silently ignored; wrap them in
        // CudaSafeCall for consistency with the rest of the file.
        CudaSafeCall(cudaMemcpyFromSymbol(&result, p_add_func, sizeof(cuda_func)));
        break;
    case cm_Mult:
        CudaSafeCall(cudaMemcpyFromSymbol(&result, p_mult_func, sizeof(cuda_func)));
        break;
    }
    return result;
}
// Software double-precision atomicAdd built on atomicCAS, for devices older
// than compute capability 6.0. SM60+ provides a native
// atomicAdd(double*, double); defining this one unguarded would clash with
// the built-in when compiling for those architectures, so compile it only
// for pre-SM60 device passes (and the host pass, where __CUDA_ARCH__ is
// undefined and the __device__ body is not emitted).
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
__device__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull =
        (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    // Classic CAS retry loop: recompute the sum against the latest observed
    // bit pattern until no other thread has modified it in between.
    do {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
            __double_as_longlong(val +
            __longlong_as_double(assumed)));
    } while (assumed != old);
    // Return the previous value, matching the built-in atomicAdd contract.
    return __longlong_as_double(old);
}
#endif
// Run the expression program sequentially: each instruction applies its
// binary function to two slots of the variable array and stores the result
// in a third. vars_count is unused here but kept for interface symmetry.
__device__ void computate_expr(cuda_expr* expr, int expr_length, double* vars, int vars_count) {
    for (int i = 0; i < expr_length; ++i) {
        const cuda_expr& instr = expr[i];
        vars[instr.dest] = (*instr.func)(vars[instr.op1], vars[instr.op2]);
    }
}
// One step of Simpson's rule on [a, c] with midpoint b. The integrand is
// evaluated by writing the sample point into vars[0], running the expression
// program, and reading the result from the last variable slot.
__device__ double simpson_step(cuda_expr* expr, int expr_length, double* vars, int vars_count, double a, double b, double c) {
    const double points[3] = { a, b, c };
    double f[3];
    for (int i = 0; i < 3; ++i) {
        vars[0] = points[i];
        computate_expr(expr, expr_length, vars, vars_count);
        f[i] = vars[vars_count - 1];
    }
    // Simpson quadrature weights: 1, 4, 1 over the interval width (c - a).
    return (c - a) / 6 * (f[0] + 4 * f[1] + f[2]);
}
// Each thread integrates one equal slice of [from, to] with Simpson's rule
// and folds its partial sum into *res via atomicAdd. `vars` holds one
// private variable array per thread (allocated by the host wrapper), so
// threads never share scratch space.
__global__ void integrate_kernel(cuda_expr* expr, int expr_length, double* vars, int vars_count, double from, double to, double step, double* res) {
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    const int total_threads = gridDim.x*blockDim.x;
    double* local_vars = vars + tid * vars_count;   // this thread's scratch slots
    const double slice_begin = from + tid*(to - from) / total_threads;
    const double slice_end = from + (tid + 1)*(to - from) / total_threads;
    double partial = 0;
    // March a window [lo, hi] of width `step` (midpoint `mid`) across the
    // slice; the step/10 slack tolerates floating-point drift at the far end.
    double lo = slice_begin;
    double mid = slice_begin + step / 2;
    double hi = slice_begin + step;
    while (hi < (slice_end + step / 10)) {
        partial += simpson_step(expr, expr_length, local_vars, vars_count, lo, mid, hi);
        lo += step;
        mid += step;
        hi += step;
    }
    atomicAdd(res, partial);
}
// Host wrapper: copies the expression program and per-thread variable arrays
// to the device, launches the integration kernel, and returns the summed
// result. `vars` is the initial variable state; each of the 64 threads gets
// its own copy so the kernel can mutate scratch slots freely.
__host__ double integrate_with_cuda(const cuda_expr* expr, int expr_length, double* vars, int vars_count, double from, double to, double step) {
    const int blockSize = 32;
    const int gridSize = 2;
    const int threadsCount = blockSize*gridSize;
    // Expression program -> device.
    cuda_expr* d_expr;
    CudaSafeCall(cudaMalloc((void**)&d_expr, expr_length*sizeof(cuda_expr)));
    CudaSafeCall(cudaMemcpy(d_expr, expr, expr_length*sizeof(cuda_expr), cudaMemcpyHostToDevice));
    // One private variable array per thread. Stage all copies in a single
    // host buffer so one cudaMemcpy replaces threadsCount tiny transfers.
    double* d_vars;
    CudaSafeCall(cudaMalloc((void**)&d_vars, threadsCount*vars_count*sizeof(double)));
    std::vector<double> staged((size_t)threadsCount*vars_count);
    for (int i = 0; i < threadsCount; ++i) {
        std::copy(vars, vars + vars_count, staged.begin() + (size_t)i*vars_count);
    }
    CudaSafeCall(cudaMemcpy(d_vars, staged.data(), staged.size()*sizeof(double), cudaMemcpyHostToDevice));
    // Accumulator for the atomicAdd reduction, zero-initialized.
    double* d_res;
    double result = 0;
    CudaSafeCall(cudaMalloc((void**)&d_res, sizeof(double)));
    CudaSafeCall(cudaMemcpy(d_res, &result, sizeof(double), cudaMemcpyHostToDevice));
    integrate_kernel<<<gridSize, blockSize>>>(d_expr, expr_length, d_vars, vars_count, from, to, step, d_res);
    // A kernel launch returns no status itself: check launch-configuration
    // errors immediately, then synchronize to surface asynchronous execution
    // errors here instead of at some later unrelated API call. Note that on
    // Windows a kernel running longer than ~2s can be killed by the WDDM TDR
    // watchdog, which reports as "unspecified launch failure" at this point.
    CudaSafeCall(cudaGetLastError());
    CudaSafeCall(cudaDeviceSynchronize());
    CudaSafeCall(cudaMemcpy(&result, d_res, sizeof(double), cudaMemcpyDeviceToHost));
    CudaSafeCall(cudaFree(d_expr));
    CudaSafeCall(cudaFree(d_vars));
    CudaSafeCall(cudaFree(d_res));
    return result;
}
int main() {
    // Expression program computing f(x) = (x + 1) * (x + 10), with x in slot 0:
    //   slot4 = slot1 + slot0   -> 10 + x
    //   slot3 = slot0 + slot2   -> x + 1
    //   slot5 = slot3 * slot4   -> result read by the integrator
    cuda_expr program[3] = {
        { get_cuda_func(cuda_method::cm_Add), 4, 1, 0 },
        { get_cuda_func(cuda_method::cm_Add), 3, 0, 2 },
        { get_cuda_func(cuda_method::cm_Mult), 5, 3, 4 }
    };
    // Initial slots: { x, 10, 1, scratch, scratch, result }.
    double variables[6] = {0, 10, 1, 0, 0, 0};
    // Integrate f over [0, 10] with Simpson step 1e-5.
    double integral = integrate_with_cuda(program, 3, variables, 6, 0, 10, 0.00001);
    std::cout << integral << std::endl;
    system("PAUSE");
    return 0;
}
谢谢。对不起,我的英文是:)当我在linux上运行你的代码时,我得到了983.333,不管我是将
step指定为0.00001还是0.000001。然而,对于较小的step,代码运行的时间要长得多
由于您是在windows上运行的,这可能只不过是您触发了Windows的TDR(超时检测与恢复)机制。在windows上运行超过2秒的内核可能会触发TDR,导致GPU被重置。通常在这种情况下,当GPU通过windows触发重置时,您会看到屏幕闪烁为黑色,然后重新绘制自身。您还可能会看到systray消息。如果从VS内部运行代码,与从命令行运行代码相比,确切的行为也可能有所不同
有关如何修改TDR机制的信息,请参考上述链接或在CUDA标签上搜索。是,谢谢!我实际上看到了屏幕上的闪光和关于恢复GPU驱动程序的信息。
nvcc.exe -gencode=arch=compute_30,code=\"sm_30,compute_30\" --use-local-env --cl-version 2012 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 11.0\VC\bin\x86_amd64" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v6.0\include" --keep-dir x64\Tests -maxrregcount=0 --machine 64 --compile -cudart static -DWIN32 -D_DEBUG -D_UNICODE -DUNICODE -Xcompiler "/EHsc /W3 /nologo /Od /Zi /MDd " -o x64\Tests\integration_on_cuda.cu.obj integration_on_cuda.cu