C++ 执行CUDA程序时出现分段错误_C++_C_Cuda_Segmentation Fault

C++ 执行CUDA程序时出现分段错误

c++ c cuda

C++ 执行CUDA程序时出现分段错误,c++,c,cuda,segmentation-fault,C++,C,Cuda,Segmentation Fault,我刚开始在NVIDIA CUDA上编程，在执行使用CUBLAS库的程序时遇到了“分段错误”。我已经安装了NVIDIA CUDA Toolkit 6.5 以下是我的代码： #include <stdio.h> #include <stdlib.h> #include <string.h> /* Includes, cuda */ #include <cuda_runtime.h> #include <cublas_v2.h> #incl

我刚开始在NVIDIA CUDA上编程，在执行使用CUBLAS库的程序时遇到了“分段错误”。我已经安装了NVIDIA CUDA Toolkit 6.5

以下是我的代码：

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Includes, cuda */
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <helper_cuda.h>

int main(int argc, char **argv)
{
    cublasStatus_t status;
    float *h_A;
    float *h_B;
    float *h_C;
    float *d_A = 0;
    float *d_B = 0;
    float *d_C = 0;
    int n2 = 5;

    float *h_T;

    cublasHandle_t handle;

    int dev = findCudaDevice(argc, (const char **)argv);

    if (dev == -1)
    {
        return EXIT_FAILURE;
    }

    /* Initialize CUBLAS */
    printf("simpleCUBLAS test running..\n");

    status = cublasCreate(&handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! CUBLAS initialization error\n");
        return EXIT_FAILURE;
    }

    printf("Allocating A\n");

    /* Allocate host memory for the matrices */
    h_A = (float *)malloc(n2 * sizeof(h_A[0]));

    if (h_A == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (A)\n");
        return EXIT_FAILURE;
    }

    printf("Allocated A\n");

    h_B = (float *)malloc(n2 * sizeof(h_B[0]));

    if (h_B == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (B)\n");
        return EXIT_FAILURE;
    }

    printf("Allocated B\n");

    h_C = (float *)malloc(n2 * sizeof(h_C[0]));

    if (h_C == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }

    printf("Allocated C\n");


    h_T = (float *)malloc(n2 * sizeof(h_T[0]));

    if (h_T == 0)
    {
        fprintf(stderr, "!!!! host memory allocation error (C)\n");
        return EXIT_FAILURE;
    }


    /* Fill the matrices with test data */
    int i;
    for (i = 0; i < n2; i++)
    {
        h_A[i] = i;
        h_B[i] = i;

        //h_A[i] = rand() / (float)RAND_MAX;
        //h_B[i] = rand() / (float)RAND_MAX;
        h_C[i] = 0;
    }

    printf("Filled A,, B, C\n");

    /* Allocate device memory for the matrices */
    if (cudaMalloc((void **)&d_A, n2 * sizeof(d_A[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
        return EXIT_FAILURE;
    }

    printf("Allocated d_A\n");

    if (cudaMalloc((void **)&d_B, n2 * sizeof(d_B[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
        return EXIT_FAILURE;
    }

    printf("Allocated d_B\n");

    if (cudaMalloc((void **)&d_C, n2 * sizeof(d_C[0])) != cudaSuccess)
    {
        fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
        return EXIT_FAILURE;
    }

    printf("Allocated d_C\n");

    status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write A)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write B)\n");
        return EXIT_FAILURE;
    }

    status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (write C)\n");
        return EXIT_FAILURE;
    }

    fprintf(stderr, "!!!! error test\n");
    printf("Vectors set.\n");

    status = cublasGetVector(n2, sizeof(h_T[0]), d_A, 1, h_T, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read T)\n");
        return EXIT_FAILURE;
    }

    int f;
    for (f = 0; f < n2; f++)
    {
        printf("T[%d]=%f\n", f, h_T[f]);
    }


    status = cublasSdot(handle, n2, d_A, 1, d_B, 1, d_C);

    printf("Dot product done.\n");

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! kernel execution error.\n");
        return EXIT_FAILURE;
    }


    status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! device access error (read C)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_A) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (A)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_B) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (B)\n");
        return EXIT_FAILURE;
    }

    if (cudaFree(d_C) != cudaSuccess)
    {
        fprintf(stderr, "!!!! memory free error (C)\n");
        return EXIT_FAILURE;
    }

    status = cublasDestroy(handle);

    if (status != CUBLAS_STATUS_SUCCESS)
    {
        fprintf(stderr, "!!!! shutdown error (A)\n");
        return EXIT_FAILURE;
    }



    return 0;
}

我想我可能以错误的方式调用了 cublasSdot() 方法。请告诉我哪里错了。

注意：我参考名为“simpleCUBLAS.cpp”的CUDA Toolkit 6.5示例文件创建了上述程序。我打印了数组'T'的值以测试方法'cublasGetVector'。

“cublasSdot”用于计算两个向量的点积。

我在 cublasCreate() 之后添加了以下代码（即 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);），从而解决了“分段错误”的问题。

这将确保CUBLAS库知道标量结果的指针（传给 cublasSdot 的 d_C）指向设备内存，而不是主机内存。

来源：

标准警告：请不要强制转换 malloc() 及其同族函数的返回值。

h_A = (float *)malloc(n2 * sizeof(h_A[0]))。在这一行中，在为 h_A 分配内存之前，您就访问了 h_A[0]。这将导致未定义的行为。对于 h_B 和 h_C 也一样。

当然，会按照建议执行。

@sgarizvi sizeof(x) 不会生成对 x 的访问，这不是未定义行为（UB）。@Sourav Ghosh，你的建议适用于C，而不是C++。

@RobertCrovella……你是对的。对不起，我弄错了，我在想C99。
GPU Device 0: "GRID K520" with compute capability 3.0

simpleCUBLAS test running..
Allocating A
Allocated A
Allocated B
Allocated C
Filled A,, B, C
Allocated d_A
Allocated d_B
Allocated d_C
!!!! error test
Vectors set.
T[0]=0.000000
T[1]=1.000000
T[2]=2.000000
T[3]=3.000000
T[4]=4.000000
Segmentation fault

cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);