Memory Cuda:内核调用后内存中的数据损坏

Memory Cuda:内核调用后内存中的数据损坏,memory,cuda,Memory,Cuda,请查看此代码: #include <stdlib.h> #include <stdio.h> int N, L, I; float * inputs; float * temp; // first kernel __global__ void mulKernel ( float * output, float * inputs)///, float * weights) { int idx = blockIdx.x * blockDim.x + threadId

请查看此代码:

#include <stdlib.h>
#include <stdio.h>

int N, L, I;
float * inputs;
float * temp;

// first kernel
__global__ void mulKernel ( float * output, float * inputs)///, float * weights)
{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   output [idx] = inputs [idx] * 3;//weights [idx];
   //weights [idx] = 4;

   //__syncthreads();
}

//second kernel
__global__ void sumKernel ( float * output, float * input)
{
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   output [idx] = input[idx]*2;

   __syncthreads();
}

void printVector (const float *p, const int N) {
    for (int i=0; i<N; i++)
    printf("%f\n",p[i]);
}

int main(int argc, char *argv[])
{
    if(argc < 3)
        printf("Usage: cuda <layers> <inputs>\n");
    else
    {
        L = atoi(argv[1]);
        N = atoi(argv[2]);
        I = atoi(argv[2]);
        inputs = (float*)malloc(I*sizeof(float));
        float * weights = (float*)malloc(I*sizeof(float));

        // and fill with some arbitrary values
        for (int i=0; i<I; i++)
        {
            inputs[i] = 1;
        }
        for (int i=0; i<I; i++)
        {
            weights[i] = 1.5;
        }

        // allocate device memory
        float * devInputs = NULL;
        float * devTemp = NULL;
        float * devWeights = NULL;

        cudaMalloc ( (void**)&devInputs, I*sizeof(float) );
        cudaMalloc ( (void**)&devTemp, I*sizeof(float) );
        cudaMalloc ( (void**)&devWeights, I*sizeof(float) );

        // set kernel launch configuration
        dim3 threadsMul = dim3(512, 1);
        int blocksCount = floor(I / threadsMul.x) + 1;
        dim3 blocksMul  = dim3(blocksCount, 1);

        dim3 threadsSum = dim3(512, 1);
        blocksCount = floor(I / threadsSum.x) + 1;
        dim3 blocksSum  = dim3(blocksCount, 1);

        cudaMemcpy      ( devInputs, inputs, I*sizeof(float), cudaMemcpyHostToDevice );
        cudaMemcpy      ( devWeights, weights,I*sizeof(float), cudaMemcpyHostToDevice );

        //kernels calling in this cycle
        for(int j=0;j<L;j++)
        {
            // copying data to see that's ok
          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

            // print it
          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

            // running first kernel
          mulKernel<<<blocksMul, threadsMul>>>(devTemp, devInputs);//, devWeights);

            // copying and printing data. We can see thats array weights contains a wrong values
          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

          if(cudaDeviceSynchronize() == cudaSuccess)
            printf("threads syncronized\n");

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n");

          sumKernel<<<blocksSum, threadsSum>>>(devInputs, devTemp);

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n\n");

          if(cudaDeviceSynchronize() == cudaSuccess)
            printf("threads syncronized\n");

          cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );
          cudaMemcpy      ( weights, devWeights, I*sizeof(float), cudaMemcpyDeviceToHost );

          printf("inputs:\n");
          printVector (inputs, N);
          printf("weights:\n");
          printVector (weights, N);
          printf("\n\n");
        }

        cudaMemcpy      ( inputs, devInputs, I*sizeof(float), cudaMemcpyDeviceToHost );

        cudaFree         ( devInputs   );
        cudaFree         ( devTemp   );
        cudaFree         ( devWeights   );

        printVector (inputs, N);

        free(inputs);
        free(weights);
    }
    return 0;
}
然后查看输出。调用第一个内核后,devWeights数组丢失了数据。但它没有在任何地方使用。我只是将它复制到内存中,运行不影响它的内核,然后复制回主机。在输出中,我看到它发生了变化。为什么?我做错了什么

在主功能中,您可以看到循环。在它中,我运行两个内核:sumKernel和mulKernel。在运行内核之前、内核之后以及同步线程之后,我将数组复制到主机并打印它。所以,在调用内核之后,我看到了错误的数据。请参阅代码中的注释


我没有看到任何错误,只有cudaSuccess。

哦,我找到了错误。我忘了在内核中使用ifidx您根本不检查CUDA错误。如果不检查错误,就永远不会知道是否出了问题。没有内核调用代码,我们只能猜测。它是如何改变的?你确定用正确的参数以正确的顺序调用了内核吗?你要求我们查看输出。但是您没有解释它应该是什么样子,代码做什么,或者在出现问题时使用的命令行参数值。如果你不努力向别人解释或者告诉他们你应该如何运行代码来产生问题,别人怎么知道你的代码应该做什么并帮助识别问题呢?我编辑了这篇文章。请参阅代码中的注释。