
Generic dot product in CUDA


I am new to both C and CUDA, and I am writing a dot product function, but it is not giving me the correct result. Would some kind soul mind taking a look for me?

I also have two questions:

  • Why doesn't dot() work correctly, and
  • At line 57, why is it product[threadIdx.x] rather than product[index]? Couldn't I write

    product[index] = a[index] * b[index]; ... if (index == 0) { ... }

    and then sum over every element with thread zero?

  • Many thanks!

    Device query:

      Device 0: "GeForce GTX 570"
      CUDA Driver Version / Runtime Version          6.0 / 5.5
      CUDA Capability Major/Minor version number:    2.0
    
    Makefile: nvcc -arch=sm_20 cuda_test.cu -o cuda_test

    In cuda_test.cu:

    #include <stdio.h>  // printf, scanf, NULL etc.
    #include <stdlib.h> // malloc, free, rand etc.
    #include <time.h>   // time(), used to seed rand() in random_ints()
    
    #define N (3) //Number of threads we are using (also, length of array declared in main)
    
    #define THREADS_PER_BLOCK (1) //Threads per block we are using
    
    #define N_BLOCKS (N/THREADS_PER_BLOCK)
    
    /* Function to fill an array with random integers between 1-10 */
    void random_ints (int *a, int n)
    {
        int i;
        srand(time(NULL)); //Seed rand() with current time
        for(i=0; i<n; i++)
        { 
            a[i] = rand()%10 + 1; 
        }
        return;
    }
    
    /* Kernel that adds two integers a & b, stores result in c */
    __global__ void add(int *a, int *b, int *c) {
    //global indicates function that runs on 
    //device (GPU) and is called from host (CPU) code
    
        int index = threadIdx.x + blockIdx.x * blockDim.x;
    
        //threadIdx.x : thread index
        //blockIdx.x  : block index
        //blockDim.x  : threads per block
        //hence index is a thread counter across all blocks
        c[index] = a[index] + b[index];
    
    //note that pointers are used for variables
    //add() runs on device, so they must point to device memory
    //need to allocate memory on GPU
    }
    
    /* Kernel for dot product */
    __global__ void dot(int *a, int *b, int *c)
    {
        __shared__ int product[THREADS_PER_BLOCK]; //All threads in a block must be able 
                                                   //to access this array
    
        int index = threadIdx.x + blockIdx.x * blockDim.x; //index
    
        product[threadIdx.x] = a[index] * b[index]; //result of elementwise
                                                    //multiplication goes into product
    
        //Make sure every thread has finished
        __syncthreads();
    
        //Sum the elements serially to obtain dot product
        if( 0 == threadIdx.x ) //Pick one thread to sum, otherwise all will execute
        {
            int sum = 0;
            for(int j=0; j < THREADS_PER_BLOCK; j++) sum += product[j];
            //Done!
            atomicAdd(c,sum);
        }
    }
    
    int main(void)
    {
    
        int *a, *b, *c, *dotProduct; //host copies of a,b,c etc
        int *d_a, *d_b, *d_c, *d_dotProduct; //device copies of a,b,c etc
    
        int size = N * sizeof(int); //size of memory that needs to be allocated
    
        int i=0; //iterator
    
        //Allocate space for device copies of a,b,c
        cudaMalloc((void **)&d_a, size);
        cudaMalloc((void **)&d_b, size);
        cudaMalloc((void **)&d_c, size);
    
        //Setup input values
        a = (int *)malloc(size); random_ints(a,N);
        b = (int *)malloc(size); random_ints(b,N);
        c = (int *)malloc(size);
    
        //Copy inputs to device
        cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
        cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    
        //Launch add() kernel on GPU
        add<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_c);
        // triple angle brackets mark call from host to device
        // this is also known as a kernel launch
        // N/THREADS_PER_BLOCK = NO. OF BLOCKS
    
        //Copy result back to host
        cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    
        //Output results
        printf("a = {");
        for (i=0; i<N; i++) printf(" %d",a[i]);
        printf(" }\n");
    
        printf("b = {");
        for (i=0; i<N; i++) printf(" %d",b[i]);
        printf(" }\n");
    
        printf("c = {");
        for (i=0; i<N; i++) printf(" %d",c[i]);
        printf(" }\n");
    
        //Calculate dot product of a & b
        dotProduct = (int *)malloc(sizeof(int)); //Allocate host memory to dotProduct
        *dotProduct = 0; //initialise to zero
        cudaMalloc((void **)&d_dotProduct, sizeof(int)); //Allocate device memory to d_dotProduct
        dot<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_dotProduct); //Perform calculation
        cudaMemcpy(dotProduct, d_dotProduct, sizeof(int), cudaMemcpyDeviceToHost); //Copy result into dotProduct
        printf("\ndot(a,b) = %d\n", *dotProduct); //Output result
    
        //Cleanup
        free(a); free(b); free(c); free(dotProduct);
        cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); cudaFree(d_dotProduct);
    
        return 0;
    } //End of main
    
    As talonmies said, please make your code something that other people can actually run. Embedded line numbers do not help.

    Without any other information, the best guess is that you are not initialising d_dotProduct to zero. You can do that with cudaMemset() - if you needed a different initial value you could cudaMemcpy() the initial value from the host, or launch a separate kernel to do the initialisation, but in this case cudaMemset() (the equivalent of a host-side memset()) is enough.
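
    For example, a minimal host-side sketch (reusing the variable names from your main(); treat it as an illustration rather than a drop-in patch) would be:

    cudaMalloc((void **)&d_dotProduct, sizeof(int));              // allocate the accumulator on the device
    cudaMemset(d_dotProduct, 0, sizeof(int));                     // zero it before the launch
    dot<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_dotProduct);  // every block atomicAdd()s into it
    cudaMemcpy(dotProduct, d_dotProduct, sizeof(int), cudaMemcpyDeviceToHost);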

    It could also be that N_BLOCKS * THREADS_PER_BLOCK is not equal to the size of the array.

    As for your second question: product is a per-block array of size THREADS_PER_BLOCK, so if you access it with product[index] you will run off the end of the array.
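
    As a sketch (a hypothetical kernel with a hard-coded 256 threads per block, purely for illustration), the difference looks like this:

    __global__ void shared_index_demo(int *a, int *b)
    {
        __shared__ int product[256];                        // one copy per block, valid slots 0..255
        int index = threadIdx.x + blockIdx.x * blockDim.x;  // global index, exceeds 255 from block 1 onwards

        // product[index] = a[index] * b[index];            // out of bounds whenever blockIdx.x > 0
        product[threadIdx.x] = a[index] * b[index];         // correct: offset within this block's copy
    }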

    Problem solved! I needed to set *c = 0 before summing the individual elements of the product array.

    /* Kernel for dot product */
    __global__ void dot(int *a, int *b, int *c)
    {
        __shared__ int product[THREADS_PER_BLOCK]; //All threads in a block must be able 
                                                   //to access this array
    
        int index = threadIdx.x + blockIdx.x * blockDim.x; //index
    
        product[threadIdx.x] = a[index] * b[index]; //result of elementwise
                                                    //multiplication goes into product
    
        if(index==0) *c = 0; //Ask one thread to set c to zero.
    
        //Make sure every thread has finished
        __syncthreads();    
    
        //Sum the elements serially to obtain dot product
        if( 0 == threadIdx.x ) //Every block to do c += sum
        {
            int sum = 0;
            for(int j=0; j < THREADS_PER_BLOCK; j++) sum += product[j];
            //Done!
            atomicAdd(c,sum);
        }
    }
    
    Please post the code without embedded line numbers, so that other people can compile and run it as they wish. Also, please explain exactly what "not working correctly" means.

    Hi talonmies, sorry! I've cleaned it up and it should compile now. Sorry for wasting your time earlier!

    Hi Tom, thanks for answering question 2! I've cleaned up the code and included it in full. Sorry for wasting your time earlier!

    This is wrong - it will only work if you launch a single thread block. If you launch more than one thread block, thread 0 of every block will reset the final result to zero. Doing it only in block 0 is also incorrect, because the programming model allows blocks to execute in any order. The correct solution is to set d_dotProduct to zero before launching the kernel.

    How about now? I've added the if(index==0) *c = 0; line. How can I set d_dotProduct to zero from host code when it only exists in GPU memory? Surely it has to be set from within the kernel? In any case, thank you very much for your help, Tom.

    As I said above, you cannot do it only in block zero, because blocks can execute in any order - for example, block 1 could write its result before block 0 resets the result to zero, wiping out block 1's contribution. The correct approach is to set the initial value to zero before launching the kernel - the most obvious way is cudaMemset(), but you could also cudaMemcpy() the initial value (zero in this case) from the host. I'll add the "how" to my answer.
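
    For reference, a sketch of the cudaMemcpy() alternative described in that last comment, again using the variable names from main() (illustrative only):

    int zero = 0;
    cudaMemcpy(d_dotProduct, &zero, sizeof(int), cudaMemcpyHostToDevice); // initialise the device-side accumulator from the host
    dot<<<N_BLOCKS,THREADS_PER_BLOCK>>>(d_a, d_b, d_dotProduct);          // kernel no longer needs "if(index==0) *c = 0;"
    cudaMemcpy(dotProduct, d_dotProduct, sizeof(int), cudaMemcpyDeviceToHost);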