C++ cudaDeviceSynchronize()错误代码77:CUDAErrorRigalAddress

C++ cudaDeviceSynchronize()错误代码77:CUDAErrorRigalAddress,c++,memory,cuda,gpu,C++,Memory,Cuda,Gpu,非常感谢您阅读我的帖子 我正在做CUDA工作,但一直在获取cudaDeviceSynchronize()错误代码77:CUDAerrorillegaAddress,不知道为什么。我搜索了代码和函数,令人惊讶的是,只出现了几条记录。很奇怪 我基本上总结了图像的所有像素。为了使我的问题具有尽可能多的参考价值,我在这里展示了我所有的CUDA代码: #include "cuda_runtime.h" #include "device_launch_parameters.h" #include "thor

非常感谢您阅读我的帖子

我正在做CUDA工作,但一直在获取cudaDeviceSynchronize()错误代码77:CUDAerrorillegaAddress,不知道为什么。我搜索了代码和函数,令人惊讶的是,只出现了几条记录。很奇怪

我基本上总结了图像的所有像素。为了使我的问题具有尽可能多的参考价值,我在这里展示了我所有的CUDA代码:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>

using namespace std;

float random_float(void)
{
  return static_cast<float>(rand()) / RAND_MAX;
}


__global__ void reduceSum(unsigned short *input,
                          unsigned long long *per_block_results,
                          const int n)
{
    extern __shared__ unsigned long long sdata[];

    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    unsigned short x = 0;
    if(i < n)
    {
        x = input[i];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if(threadIdx.x < offset)
        {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }

        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0)
    {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean,  int N,  vector<string> filelist)
{
    int size = N*N;
    unsigned long long* dev_sum = 0;
    unsigned short* dev_img = 0;
    cudaError_t cudaStatus;
    const int block_size = 512;
    const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
    int L = filelist.size();

    // Choose which GPU to run on, change this on a multi-GPU system.

    double totalgpuinittime = 0;
    StartCounter(7);

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    totalgpuinittime = GetCounter(7);

    unsigned short* img;
    unsigned short* pimg;
    unsigned long long* sum = new unsigned long long[num_blocks];
    unsigned long long* psum = sum;

    cout<<endl;
    cout << "gpu looping starts, and in progress ..." << endl;
    StartCounter(6);

    double totalfileiotime = 0;
    double totalh2dcpytime = 0;
    double totalkerneltime = 0;
    double totald2hcpytime = 0;
    double totalcpusumtime = 0;
    double totalloopingtime = 0;

    for (int k = 0; k < L; k++)
    {
        StartCounter(1);
        img = (unsigned short*)LoadTIFF(filelist[k].c_str());
        totalfileiotime += GetCounter(1);

        psum = sum;
        pimg = img;

        float gpumean = 0;

        memset(psum, 0, sizeof(unsigned long long)*num_blocks);

        StartCounter(2);
        // Copy input vectors from host memory to GPU buffers.
        cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }   

        totalh2dcpytime += GetCounter(2);

        StartCounter(3);
        //reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
         //reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
          reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
        totalkerneltime += GetCounter(3);

      // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.

                // !!!!!! following is where the code 77 error occurs!!!!!!!
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }

        // Copy output vector from GPU buffer to host memory.
        StartCounter(4);
        cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        totald2hcpytime += GetCounter(4);

        StartCounter(5);
        for (int i = 0; i < num_blocks; i++)
        {
            gpumean += *psum;
            psum++;
        }

        gpumean /= N*N;
        totalcpusumtime += GetCounter(5);

        delete img; 
        img = NULL;

        cout<<gpumean<<endl;

    }

    int S = 1e+6;
    int F = filelist.size();
    float R = S/F;

    totalloopingtime = GetCounter(6);
    cout<<"gpu looping ends."<<endl<<endl;
    cout<< "analysis:"<<endl;
    cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
    cout<<"file I/O time: "<<endl;
    cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
    cout<<"host-to-device copy time: "<<endl;
    cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
    cout<<"pure gpu kerneling time: "<<endl;
    cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
    cout<<"device-to-host copy time: "<<endl;
    cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
    /*cout<<"cpu summing time: "<<endl;
    cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/

    /*cout <<"gpu looping time: " << endl;
    cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/


Error:
    cudaFree(dev_sum);
    cudaFree(dev_img);

    delete sum;
    sum = NULL;

    return cudaStatus;
}

void kernel(float* &mean, int N, vector<string> filelist)
{
    // wrapper and kernel
    cudaError_t cudaStatus = gpuWrapper(mean, N,  filelist);

    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "gpuWapper failed!");

    }

   // printf("mean is: %f\n", mean);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    StartCounter(8);
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaDeviceReset failed!");

    }
    cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
    //return *mean;
}
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括“thorcalgpu.h”
#包括
#包括“math.h”
#包括
#包括
#包括
#包括
#包括
#包括
#包括
使用名称空间std;
浮动随机\u浮动(无效)
{
返回static_cast(rand())/rand_MAX;
}
__全局无效还原(无符号短*输入,
无符号长*每块结果,
常数(整数)
{
外部共享无符号长sdata[];
无符号整数i=blockIdx.x*blockDim.x+threadIdx.x;
//将输入加载到共享内存中
无符号短x=0;
if(i0;offset>>=1)
{
if(螺纹IDX.x<偏移量)
{
//在我们自己的上游加上一部分金额
sdata[threadIdx.x]+=sdata[threadIdx.x+偏移量];
}
//等待,直到块中的所有线程都已完成
//更新了他们的部分总和
__同步线程();
}
//线程0写入最终结果
if(threadIdx.x==0)
{
每块结果[blockIdx.x]=sdata[0];
}
}
//辅助函数,用于使用CUDA并行添加向量。
//模板
cudaError\u t gpuWrapper(浮点*平均值,整数N,矢量文件列表)
{
int size=N*N;
无符号长*dev_sum=0;
无符号短*dev_img=0;
cudaError\u t cudaStatus;
const int block_size=512;
const int num_blocks=(大小/块大小)+(大小百分比块大小)?1:0;
int L=filelist.size();
//选择要在哪个GPU上运行,在多GPU系统上更改此选项。
double totalgpuinittime=0;
StartCounter(7);
cudaStatus=cudaSetDevice(0);
if(cudaStatus!=cudaSuccess)
{
fprintf(stderr,“cudaSetDevice失败!是否安装了支持CUDA的GPU?”);
转到错误;
}
//为三个向量分配GPU缓冲区(两个输入,一个输出)。
cudaStatus=cudaMalloc((无效**)和dev_img,大小*大小(无符号短);
if(cudaStatus!=cudaSuccess)
{
fprintf(stderr,“cudamaloc失败!”);
转到错误;
}
cudaStatus=cudamaloc((void**)和dev_sum,num_blocks*sizeof(unsigned long));
if(cudaStatus!=cudaSuccess)
{
fprintf(stderr,“cudamaloc失败!”);
转到错误;
}
totalgpuinittime=GetCounter(7);
无符号短*img;
无符号短*pimg;
无符号长*和=新的无符号长[num_块];
无符号长*psum=和;

cout虽然这可能不是代码中唯一的错误源,但是您没有为精简内核分配任何动态共享内存,这导致了您看到的非法寻址错误。正确的内核启动应该类似于

size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);
size\t shm\u size=块大小*sizeof(无符号长);
减少(偏差、偏差和、尺寸);
这将为reduce内核中运行的每个线程分配相当于一个unsigned long long long的长度,这(通过我对代码的粗略阅读)将使共享内存数组
sdata
成为内核运行时的正确大小,而不会越界访问该数组