CUDA 6.0中的错误结果_Cuda - Fatal编程技术网

CUDA 6.0中的错误结果

cuda

CUDA 6.0中的错误结果,cuda,Cuda,我正在用CUDA6.0C/C++编写一个示例程序。程序可以识别设备，但在运行期间似乎有一个错误：结果数组的元素都等于0，没有任何原因。（我的GPU:Geforce EN9400GT ASUS）这是我的密码 #include <stdio.h> #include <malloc.h> #include <cuda_runtime.h> #define SIZE 1024 __global__ void

我正在用CUDA6.0C/C++编写一个示例程序。程序可以识别设备，但在运行期间似乎有一个错误：结果数组的元素都等于0，没有任何原因。（我的GPU:Geforce EN9400GT ASUS）这是我的密码

     #include <stdio.h>
     #include <malloc.h>
     #include <cuda_runtime.h>
     #define    SIZE 1024

     __global__ void VectorAdd(int* a, int* b, int* c, int n)
     {
        int i = threadIdx.x;

    if (i < n) {
        c[i] = a[i] + b[i];
    }
}

void printResult(int* ar) {
    for (int i = 0; i < 10; i++) {
        printf("[%d] = %d\n", i, ar[i]);
    }
}

int main() {
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    int device, count;
    cudaDeviceProp* prop = (cudaDeviceProp*)malloc(sizeof(cudaDeviceProp));

    int GPUavail = cudaGetDeviceCount(&count);
    if (GPUavail != cudaSuccess) {
        printf("There is no GPU device available\n");
        exit(EXIT_FAILURE);
    }

    cudaGetDeviceProperties(prop, device);
    printf("Device name: %s\n", prop->name);
    printf("Global memory: %zd\n", prop->totalGlobalMem);
    printf("Shared memory: %zd\n", prop->sharedMemPerBlock);
    printf("Max threads per block: %d\n", prop->maxThreadsPerBlock);
    printf("Device ID: %d\n", prop->pciDeviceID);
    printf("TCC Driver: %d\n", prop->tccDriver);

    a = (int*)malloc(SIZE * sizeof(int));
    b = (int*)malloc(SIZE * sizeof(int));
    c = (int*)malloc(SIZE * sizeof(int));

    cudaMalloc(&d_a, SIZE*sizeof(int));
    cudaMalloc(&d_b, SIZE*sizeof(int));
    cudaMalloc(&d_c, SIZE*sizeof(int));

    for (int i = 0; i < SIZE; i++) {
        a[i] = i;
        b[i] = i;
        c[i] = 0;
    }

    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);

    VectorAdd << < 1, SIZE >> >(d_a, d_b, d_c, SIZE);

    cudaMemcpy(c, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    printResult(c);

    free(a);
    free(b);
    free(c);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
}

#包括
#包括
#包括
#定义大小1024
__全局无效向量添加（int*a，int*b，int*c，int n）
{
int i=threadIdx.x；
if（i名称）；
printf（“全局内存：%zd\n”，prop->totalGlobalMem）；
printf（“共享内存：%zd\n”，属性->共享内存锁）；
printf（“每个块的最大线程数：%d\n”，prop->maxThreadsPerBlock）；
printf（“设备ID:%d\n”，prop->pciDeviceID）；
printf（“TCC驱动程序：%d\n”，道具->TCC驱动程序）；
a=（int*）malloc（SIZE*sizeof（int））；
b=（int*）malloc（SIZE*sizeof（int））；
c=（int*）malloc（SIZE*sizeof（int））；
Cudamaloc（&d_a，尺寸*sizeof（int））；
Cudamaloc（&d_b，尺寸*sizeof（int））；
Cudamaloc（&d_c，尺寸*sizeof（int））；
对于（int i=0；i>（d_a、d_b、d_c、大小）；
cudaMemcpy（c，d_c，SIZE*sizeof（int），cudaMemcpyDeviceToHost）；
打印结果（c）；
免费（a）；
免费（b）；
免费（c）；
库达弗里（杜阿）；
库达弗里（杜布）；
库达弗里（d_c）；
}

资料来源：

这是显示的结果：

正如程序输出中所述，您的GPU每个块最多只能启动512个线程。（

Max threads per block

）但是，您正在一个块中启动1024个线程。因为您使用无效的启动配置启动内核，所以您的内核根本没有启动。您应该更改块中的线程数

#define SIZE 512

计算能力>=2.0时，每个块的线程数限制为1024，但您的GPU计算能力为1.0。

将大小设为512，然后重试。然后再看看你发布的所有内容，以及你自己为什么它有效请将文本消息显示为文本，而不是图形。它们是文字，不是绘画作品。我没有发现这样一个小错误，现在它对我有用。