
C++: Segmentation fault when using cudaMemcpy


I am trying to use cudaMemcpy to copy the contents of a std::vector (via std::vector::data) into a device array for a kernel, and I get a segmentation fault. This is how I do it:

  cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);
where vx is the vector. The full example is below. Any hints as to where the problem lies would be much appreciated.

#include <iostream>
#include <math.h>
#include <vector>

using namespace std;

// Kernel function to add the elements of two arrays
__global__
void add(int n, float *x, float *y)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i < n) {
        y[i] = x[i] + y[i];
    }
}


int main(void)
{
    int N = 1<<10;
    float *d_x = NULL, *d_y = NULL;
    cudaMalloc((void **)&d_x, sizeof(float)*N);
    cudaMalloc((void **)&d_y, sizeof(float)*N);

    // Host-side vectors for the input data
    vector<float> vx;
    vector<float> vy;

    // initialize x and y arrays on the host
    for (int i = 0; i < N; i++) {
        vx.push_back(1.0f);
        vy.push_back(2.0f);
    }

    cudaMemcpy(d_x, vx.data(), N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, vy.data(), N*sizeof(float), cudaMemcpyHostToDevice);
    //
    int blockSize;   // The launch configurator returned block size
    int minGridSize; // The minimum grid size needed to achieve the
    // maximum occupancy for a full device launch
    int gridSize;    // The actual grid size needed, based on input size

    cudaOccupancyMaxPotentialBlockSize( &minGridSize, &blockSize, add, 0, N);
    // Round up according to array size
    gridSize = (N + blockSize - 1) / blockSize;

    cout<<"blockSize: "<<blockSize<<" minGridSize: "<<minGridSize<<" gridSize: "<<gridSize<<endl;

    // calculate theoretical occupancy
    int maxActiveBlocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor( &maxActiveBlocks, add, blockSize, 0);

    int device;
    cudaDeviceProp props;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&props, device);

    float occupancy = (maxActiveBlocks * blockSize / props.warpSize) /
        (float)(props.maxThreadsPerMultiProcessor /
                props.warpSize);

    printf("Launched blocks of size %d. Theoretical occupancy: %f\n",
            blockSize, occupancy);


    // Run kernel on N elements on the GPU
    add<<<gridSize, blockSize>>>(N, d_x, d_y);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;

    for (int i = 0; i < N; i++) {
        maxError = fmax(maxError, fabs(d_y[i]-3.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;

    // Free memory
    cudaFree(d_x);
    cudaFree(d_y);

    return 0;
}


blockSize: 1024 minGridSize: 16 gridSize: 1
Launched blocks of size 1024. Theoretical occupancy: 1.000000
Segmentation fault (core dumped)
The problem is here:

for (int i = 0; i < N; i++) {
   maxError = fmax(maxError, fabs(d_y[i]-3.0f));
                                  ^^^^^^
}
The reason is that you cannot dereference a device pointer on the host.

The solution is to copy the device memory back to the host before reading it, analogous to what was done for the host-to-device transfers.
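
For example, a minimal sketch of the corrected verification loop, reusing the names from the listing above (the host vector vy_result is introduced here just for illustration):

    // Copy the results from device memory back into a host vector
    vector<float> vy_result(N);
    cudaMemcpy(vy_result.data(), d_y, N*sizeof(float), cudaMemcpyDeviceToHost);

    // Now the values can be inspected on the host (all should be 3.0f)
    float maxError = 0.0f;
    for (int i = 0; i < N; i++) {
        maxError = fmax(maxError, fabs(vy_result[i] - 3.0f));
    }
    std::cout << "Max error: " << maxError << std::endl;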
