C++ cudaDeviceSynchronize（）错误代码77:CUDAErrorRigalAddress_C++_Memory_Cuda_Gpu

C++ cudaDeviceSynchronize（）错误代码77:CUDAErrorRigalAddress

c++ memory cuda

C++ cudaDeviceSynchronize（）错误代码77:CUDAErrorRigalAddress,c++,memory,cuda,gpu,C++,Memory,Cuda,Gpu,非常感谢您阅读我的帖子我正在做CUDA工作，但一直在获取cudaDeviceSynchronize（）错误代码77:CUDAerrorillegaAddress，不知道为什么。我搜索了代码和函数，令人惊讶的是，只出现了几条记录。很奇怪我基本上总结了图像的所有像素。为了使我的问题具有尽可能多的参考价值，我在这里展示了我所有的CUDA代码： #include "cuda_runtime.h" #include "device_launch_parameters.h" #include "thor

非常感谢您阅读我的帖子

我正在做CUDA工作，但一直在获取cudaDeviceSynchronize（）错误代码77:CUDAerrorillegaAddress，不知道为什么。我搜索了代码和函数，令人惊讶的是，只出现了几条记录。很奇怪

我基本上总结了图像的所有像素。为了使我的问题具有尽可能多的参考价值，我在这里展示了我所有的CUDA代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>

using namespace std;

float random_float(void)
{
  return static_cast<float>(rand()) / RAND_MAX;
}


__global__ void reduceSum(unsigned short *input,
                          unsigned long long *per_block_results,
                          const int n)
{
    extern __shared__ unsigned long long sdata[];

    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // load input into __shared__ memory
    unsigned short x = 0;
    if(i < n)
    {
        x = input[i];
    }
    sdata[threadIdx.x] = x;
    __syncthreads();

    // contiguous range pattern
    for(int offset = blockDim.x / 2; offset > 0; offset >>= 1)
    {
        if(threadIdx.x < offset)
        {
            // add a partial sum upstream to our own
            sdata[threadIdx.x] += sdata[threadIdx.x + offset];
        }

        // wait until all threads in the block have
        // updated their partial sums
        __syncthreads();
    }

    // thread 0 writes the final result
    if(threadIdx.x == 0)
    {
        per_block_results[blockIdx.x] = sdata[0];
    }
}

// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean,  int N,  vector<string> filelist)
{
    int size = N*N;
    unsigned long long* dev_sum = 0;
    unsigned short* dev_img = 0;
    cudaError_t cudaStatus;
    const int block_size = 512;
    const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
    int L = filelist.size();

    // Choose which GPU to run on, change this on a multi-GPU system.

    double totalgpuinittime = 0;
    StartCounter(7);

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    totalgpuinittime = GetCounter(7);

    unsigned short* img;
    unsigned short* pimg;
    unsigned long long* sum = new unsigned long long[num_blocks];
    unsigned long long* psum = sum;

    cout<<endl;
    cout << "gpu looping starts, and in progress ..." << endl;
    StartCounter(6);

    double totalfileiotime = 0;
    double totalh2dcpytime = 0;
    double totalkerneltime = 0;
    double totald2hcpytime = 0;
    double totalcpusumtime = 0;
    double totalloopingtime = 0;

    for (int k = 0; k < L; k++)
    {
        StartCounter(1);
        img = (unsigned short*)LoadTIFF(filelist[k].c_str());
        totalfileiotime += GetCounter(1);

        psum = sum;
        pimg = img;

        float gpumean = 0;

        memset(psum, 0, sizeof(unsigned long long)*num_blocks);

        StartCounter(2);
        // Copy input vectors from host memory to GPU buffers.
        cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }

        cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }   

        totalh2dcpytime += GetCounter(2);

        StartCounter(3);
        //reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
         //reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
          reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
        totalkerneltime += GetCounter(3);

      // Check for any errors launching the kernel
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }

        // cudaDeviceSynchronize waits for the kernel to finish, and returns
        // any errors encountered during the launch.

                // !!!!!! following is where the code 77 error occurs!!!!!!!
        cudaStatus = cudaDeviceSynchronize();
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
            goto Error;
        }

        // Copy output vector from GPU buffer to host memory.
        StartCounter(4);
        cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long ), cudaMemcpyDeviceToHost);
        if (cudaStatus != cudaSuccess) 
        {
            fprintf(stderr, "cudaMemcpy failed!");
            goto Error;
        }
        totald2hcpytime += GetCounter(4);

        StartCounter(5);
        for (int i = 0; i < num_blocks; i++)
        {
            gpumean += *psum;
            psum++;
        }

        gpumean /= N*N;
        totalcpusumtime += GetCounter(5);

        delete img; 
        img = NULL;

        cout<<gpumean<<endl;

    }

    int S = 1e+6;
    int F = filelist.size();
    float R = S/F;

    totalloopingtime = GetCounter(6);
    cout<<"gpu looping ends."<<endl<<endl;
    cout<< "analysis:"<<endl;
    cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
    cout<<"file I/O time: "<<endl;
    cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
    cout<<"host-to-device copy time: "<<endl;
    cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
    cout<<"pure gpu kerneling time: "<<endl;
    cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
    cout<<"device-to-host copy time: "<<endl;
    cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
    /*cout<<"cpu summing time: "<<endl;
    cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/

    /*cout <<"gpu looping time: " << endl;
    cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/


Error:
    cudaFree(dev_sum);
    cudaFree(dev_img);

    delete sum;
    sum = NULL;

    return cudaStatus;
}

void kernel(float* &mean, int N, vector<string> filelist)
{
    // wrapper and kernel
    cudaError_t cudaStatus = gpuWrapper(mean, N,  filelist);

    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "gpuWapper failed!");

    }

   // printf("mean is: %f\n", mean);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    StartCounter(8);
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) 
    {
        fprintf(stderr, "cudaDeviceReset failed!");

    }
    cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
    //return *mean;
}

#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括“thorcalgpu.h”
#包括
#包括“math.h”
#包括
#包括
#包括
#包括
#包括
#包括
#包括
使用名称空间std；
浮动随机\u浮动（无效）
{
返回static_cast（rand（））/rand_MAX；
}
__全局无效还原（无符号短*输入，
无符号长*每块结果，
常数（整数）
{
外部共享无符号长sdata[]；
无符号整数i=blockIdx.x*blockDim.x+threadIdx.x；
//将输入加载到共享内存中
无符号短x=0；
if（i0；offset>>=1）
{
if（螺纹IDX.x<偏移量）
{
//在我们自己的上游加上一部分金额
sdata[threadIdx.x]+=sdata[threadIdx.x+偏移量]；
}
//等待，直到块中的所有线程都已完成
//更新了他们的部分总和
__同步线程（）；
}
//线程0写入最终结果
if（threadIdx.x==0）
{
每块结果[blockIdx.x]=sdata[0]；
}
}
//辅助函数，用于使用CUDA并行添加向量。
//模板
cudaError\u t gpuWrapper（浮点*平均值，整数N，矢量文件列表）
{
int size=N*N；
无符号长*dev_sum=0；
无符号短*dev_img=0；
cudaError\u t cudaStatus；
const int block_size=512；
const int num_blocks=（大小/块大小）+（大小百分比块大小）？1:0；
int L=filelist.size（）；
//选择要在哪个GPU上运行，在多GPU系统上更改此选项。
double totalgpuinittime=0；
StartCounter（7）；
cudaStatus=cudaSetDevice（0）；
if（cudaStatus！=cudaSuccess）
{
fprintf（stderr，“cudaSetDevice失败！是否安装了支持CUDA的GPU？”）；
转到错误；
}
//为三个向量分配GPU缓冲区（两个输入，一个输出）。
cudaStatus=cudaMalloc（（无效**）和dev_img，大小*大小（无符号短）；
if（cudaStatus！=cudaSuccess）
{
fprintf（stderr，“cudamaloc失败！”）；
转到错误；
}
cudaStatus=cudamaloc（（void**）和dev_sum，num_blocks*sizeof（unsigned long））；
if（cudaStatus！=cudaSuccess）
{
fprintf（stderr，“cudamaloc失败！”）；
转到错误；
}
totalgpuinittime=GetCounter（7）；
无符号短*img；
无符号短*pimg；
无符号长*和=新的无符号长[num_块]；
无符号长*psum=和；
cout虽然这可能不是代码中唯一的错误源，但是您没有为精简内核分配任何动态共享内存，这导致了您看到的非法寻址错误。正确的内核启动应该类似于
size_t shm_size = block_size * sizeof(unsigned long long);
reduceSum<<<num_blocks,block_size,shm_size>>>(dev_img, dev_sum, size);

size\t shm\u size=块大小*sizeof（无符号长）；
减少（偏差、偏差和、尺寸）；

这将为reduce内核中运行的每个线程分配相当于一个unsigned long long long的长度，这（通过我对代码的粗略阅读）将使共享内存数组sdata
成为内核运行时的正确大小，而不会越界访问该数组