程序启动时调用cudaDeviceSynchronize()后出现未指定的启动失败。但使用逐步调试不会出现错误。(CUDA)

程序启动时调用cudaDeviceSynchronize()后出现未指定的启动失败。但使用逐步调试不会出现错误。库达,cuda,visual-studio-debugging,Cuda,Visual Studio Debugging,我花了几个小时与未指明的启动失败进行斗争。 为了了解共享内存是如何工作的,我为自己想出了一个小任务 任务是将数组[1,2,3,…,N]分成K组(N/K)个元素,并求每组的和。(数组的当前元素和上一个元素之间的差值等于1) 我计划在K个块之间划分的网格中使用N个线程。因此,每个threadblock都包含(N/K)个线程。因此,一个threadblock可以用来计算一个组的和。我还想动态分配共享内存 启动程序时,在调用cudaDeviceSynchronize()之后,我遇到了未指定的启动失败。但

我花了几个小时与
未指明的启动失败进行斗争
。 为了了解共享内存是如何工作的,我为自己想出了一个小任务

任务是将数组[1,2,3,…,N]分成K组(N/K)个元素,并求每组的和。(数组的当前元素和上一个元素之间的差值等于1)

我计划在K个块之间划分的网格中使用N个线程。因此,每个threadblock都包含(N/K)个线程。因此,一个threadblock可以用来计算一个组的和。我还想动态分配共享内存

启动程序时,在调用
cudaDeviceSynchronize()
之后,我遇到了
未指定的启动失败。但当我尝试一步一步地调试时,一切都很好

我做错了什么?(Visual Studio 2012专业版,计算能力2.1)我非常感谢您的帮助

#include <stdio.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("%s\n",cudaGetErrorString(x)); \
    system("pause"); \
    return EXIT_FAILURE;}} while(0)

extern __shared__ double shrd[];

// Each block sums the (global) thread IDs of its own threads and writes the
// result to a[blockID_global].  Grid and block may be 2D.  Requires
// blockDim.x * blockDim.y * sizeof(double) bytes of dynamic shared memory
// per block (third launch parameter).
__global__ void kernel(double * a){
    size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
    size_t blockID_global = (gridDim.x * blockIdx.y + blockIdx.x );
    size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;

    // BUG FIX: shared memory is per-block.  Every block sees its own private
    // shrd[0 .. blockDim.x*blockDim.y), so offsetting it by blockID_global
    // wrote past the end of the allocation for every block except block 0 —
    // the source of the "unspecified launch failure" and of the
    // "Invalid __shared__ write of size 8" errors reported by cuda-memcheck.
    double * temp = shrd;
    temp[threadID_block] = static_cast<double>(threadID_global);

    __syncthreads();  // all shared-memory writes visible before the reduction
    if (threadID_block == 0){
        // Serial reduction by thread 0 (demonstration only, not optimized).
        a[blockID_global] = 0.0;
        for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
            a[blockID_global] += temp[index];
        }
    }
}

int main(){
    // Demo driver: launch `kernel` on a 2x2 grid of 4x4 blocks and print
    // one sum per block.
    int device = 0;
    CUDA_CALL(cudaGetDevice(&device));
    CUDA_CALL(cudaSetDevice(device));

    const dim3 grid(2, 2, 1);
    const dim3 block(4, 4, 1);

    // One output slot per block.
    const size_t count = grid.x * grid.y;
    const size_t bytes = count * sizeof(double);

    double *d_out = NULL;
    CUDA_CALL(cudaMalloc(&d_out, bytes));

    // Dynamic shared memory: one double per thread in a block.
    const size_t smemBytes = block.x * block.y * sizeof(double);
    kernel <<< grid, block, smemBytes >>> (d_out);
    CUDA_CALL(cudaGetLastError());       // catches launch-configuration errors
    CUDA_CALL(cudaDeviceSynchronize());  // catches asynchronous execution errors

    double *h_out = new double [count];
    CUDA_CALL(cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost));

    for (size_t i = 0; i < count; i++){
        printf("%.3f\n", h_out[i]);
    }

    printf("\n");

    CUDA_CALL(cudaFree(d_out));
    CUDA_CALL(cudaDeviceReset());
    delete[] h_out;

    system("pause");
    return 0;
}
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#定义CUDA_调用(x)do{if((x)!=cudaSuccess){\
printf(“在%s处出现错误:%d\n”,\uuuuu文件\uuuuu,\uuuu行\uuuuu)\
printf(“%s\n”,cudaGetErrorString(x))\
系统(“暂停”)\
返回EXIT_FAILURE;}}while(0)
外部共享双shrd[];
__全局无效内核(双*a){
大小\u t threadID\u block=blockDim.x*threadIdx.y+threadIdx.x;
大小\u t块ID\u全局=(gridDim.x*blockIdx.y+blockIdx.x);
size\u t threadID\u global=blockID\u global*blockDim.x*blockDim.y+threadID\u block;
double*temp=&shrd[blockID_global*blockDim.x*blockDim.y];
temp[threadID\u block]=静态线程转换(threadID\u全局);
__同步线程();
如果(线程ID_块==0){
a[blockID_global]=0.0;
对于(大小索引=0;索引(dev_a);
CUDA_调用(cudaGetLastError());
CUDA_调用(cudaDeviceSynchronize());
double*a=新的double[长度];
CUDA_调用(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));
对于(大小索引=0;索引<长度;索引++){
printf(“%.3f\n”,一个[索引]);
}
printf(“\n”);
CUDA_CALL(cudaFree(dev_a));
CUDA_调用(cudadeviceset());
删除[]a;
系统(“暂停”);
返回0;
}

如果您使用开普勒或更高版本,请先阅读以下内容:

否则,如果您是开普勒之前的人,请阅读以下内容:

在CUDA编程方面,您缺少一些基础知识。我在下面给了你一个代码模板。这是为了澄清其中一些基本原则。不要期望这会被优化,因为我期望您对并行缩减进行编程。这将使您了解如何使用共享内存

祝你好运

#include <stdio.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 10000
#define K 100

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("%s\n",cudaGetErrorString(x)); \
    system("pause"); \
    return EXIT_FAILURE;}} while(0)

// Template kernel: each block sums K consecutive elements of `a` and writes
// the block's sum to results[blockIdx.x].  Requires blockDim.x == K and
// K * sizeof(double) bytes of dynamic shared memory per block.
__global__ void kernel(double* a, double* results){

    extern __shared__ double shared[];

    size_t tid_local = threadIdx.x;   // thread id within the block

    double *start = &a[K*blockIdx.x]; // this block's K-element slice of `a`

    shared[tid_local]=start[tid_local]; // stage the slice in shared memory
    __syncthreads();                    // slice fully staged before any read

    //Perform Parallel reduction, you will have to implement this
    //After parallel reduction, result should be in shared[0]

    //for demonstration I made the code serial for each block on thread 0.
    //This is for demonstration only.
    double sum=0;
    if(tid_local==0){
        for(int i=0; i<K; i++){
            sum+=shared[i];
        }

        // BUG FIX: the sum belongs in the output array, not back in `a` —
        // the host copies dev_results back, which was previously never written.
        results[blockIdx.x]=sum;
    }

}

int main(){
    // Host driver for the template kernel: N doubles split into N/K blocks
    // of K threads each, with K doubles of dynamic shared memory per block.
    int device = 0;
    CUDA_CALL(cudaGetDevice(&device));
    CUDA_CALL(cudaSetDevice(device));

    double *d_in   = NULL;
    double *d_sums = NULL;
    CUDA_CALL(cudaMalloc(&d_in,   N * sizeof(double)));
    CUDA_CALL(cudaMalloc(&d_sums, (N / K) * sizeof(double)));

    // TODO: copy the input array (the values being summed) into d_in here.

    const dim3 block(K,     1, 1);
    const dim3 grid (N / K, 1, 1);
    const size_t smemBytes = K * sizeof(double);

    kernel <<< grid, block, smemBytes >>> (d_in, d_sums);

    CUDA_CALL(cudaGetLastError());       // catches launch-configuration errors
    CUDA_CALL(cudaDeviceSynchronize());  // catches asynchronous execution errors

    // TODO: copy d_sums back to the CPU here — that is the result.

    CUDA_CALL(cudaFree(d_in));
    CUDA_CALL(cudaFree(d_sums));

    system("pause");
    return 0;
}
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#定义N 10000
#定义K100
#定义CUDA_调用(x)do{if((x)!=cudaSuccess){\
printf(“在%s处出现错误:%d\n”,\uuuuu文件\uuuuu,\uuuu行\uuuuu)\
printf(“%s\n”,cudaGetErrorString(x))\
系统(“暂停”)\
返回EXIT_FAILURE;}}while(0)
__全局无效内核(双*a,双*results){
外部共享双共享[];
大小tid、tid本地、步幅;
tid=blockDim.x*blockIdx.x+threadIdx.x;//所有块内的线程id
tid_local=threadIdx.x;//块内的线程id
stride=blockDim.x*gridDim.x;//线程总数
double*start=&a[K*blockIdx.x];//每个块将获得一个块的K。
共享的[tid_local]=开始[tid_local];//将K个元素复制到共享内存中
__同步线程();
//如果要执行并行缩减,则必须执行此操作
//并行归约后,结果应为共享[0]
//为了演示,我对线程0上的每个块进行了代码串行。
//这仅用于演示。
双和=0;
如果(tid_local==0){
for(int i=0;i(dev_a,dev_results);
CUDA_调用(cudaGetLastError());
CUDA_调用(cudaDeviceSynchronize());
//将开发结果复制回CPU,这是您的结果。
CUDA_CALL(cudaFree(dev_a));
CUDA_呼叫(cudaFree(开发结果));
系统(“暂停”);
返回0;
}

你试过cuda-memcheck吗?
cuda memcheck
报告了大量的越界共享内存写入。简单看一下代码就知道这是由于
temp
@talonmies,@Park Young Bae,我试过cuda memcheck中的越界指针造成的。结果是48个错误,比如
无效的共享内存写入大小为8
,2个错误
Program在CUDA API调用cudaDeviceSynchronize时出错30,在调用cudaModuleUnload时出错719。我猜这是因为异步程序执行:内核启动,尽管
外部共享\uuuuuuuu的内存是双shrd[]
未分配。而且我是CUDA的新手,所以我可能错了。有什么方法可以修复它吗?我的意思是让内核执行等待到
extern\uuuuu shared\uuuuuu双shrd[]的内存
is allocated?@a.yuzhanin:代码的问题主要是对共享内存的越界内存访问。我强烈建议首先解决这个问题。