Cuda 内核内的求和数组不';行不通

Cuda 内核内的求和数组不';行不通,cuda,sum,Cuda,Sum,在下面的代码中,将数据加载到共享数组后,我将尝试对共享内存中的数组求和。加载的数组大小是289,下面是我的内核和main #include <cuda.h> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include<iostream> #include <stdio.h> //#include "readmat.cuh" //#include "mat.h" #includ

在下面的代码中,将数据加载到共享数组后,我将尝试对共享内存中的数组求和。加载的数组大小是289,下面是我的内核和main

#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include <stdio.h>
//#include "readmat.cuh"
//#include "mat.h"
#include <device_functions.h>
#include <time.h>
#include <ctime>
//#include "opencv2/highgui/highgui.hpp"
using namespace std;
//using namespace cv;

typedef struct {
    size_t X;
    size_t Y;
    size_t U;
    size_t V;
    double* elements;
    int no_of_elements;
    int alpha;
} DataIn;


cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);




__device__ double getelementData(DataIn data, int x, int v, int u, int y)
{
    int index = data.X*data.Y*data.U*(v)+data.X*data.Y*(u)+data.X*(y)+x;
    return data.elements[index];
}

__global__ void universaladd(DataIn data,int alpha,double* imagedevice)
{
    int v = blockIdx.y;
    int u = blockIdx.x;
    int y = threadIdx.y;
    int x = threadIdx.x;


    /*
    if (x == 0 && y == 0)
    {
        if (u == 0)
            printf(" tooooooooooooooooootal       the vale got whenn u=%d  v=%d  is %f \n", u, v);
        if (v == 0)
            printf(" tooooooooooooooooootal       the vale got whenn u=%d  v=%d  is %f \n", u, v);

    }
    */
    double temp;
    int local_idx = (blockDim.y*threadIdx.x) + threadIdx.y;

    extern __shared__ double matrix[];

    int m = alpha - 1;
    int Y_shift = y*m;
    int X_shift = x*m;

    if (v < Y_shift && u < X_shift){
        matrix[data.Y*x + y] = getelementData(data, x, (data.V - (Y_shift - v)), (data.U - (X_shift - u)), y);
        temp = getelementData(data, x, (data.V - (Y_shift - v)), (data.U - (X_shift - u)), y);
        //printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);

    }


    else if (v >= Y_shift && u < X_shift){
        matrix[data.Y*x + y] = getelementData(data, x, (v - (Y_shift)), (data.U - (X_shift - u)), y);
        temp = getelementData(data, x, (v - (Y_shift)), (data.U - (X_shift - u)), y);
        //printf("the vale got when y=%d x=%d u=%d v=%d is %f\n ", y, x, u, v, temp);
    }

    else if (v < Y_shift &&  u >= X_shift){
        matrix[data.Y*x + y] = getelementData(data, x, (data.V - (Y_shift - v)), (u - (X_shift)), y);
        temp = getelementData(data, x, (data.V - (Y_shift - v)), (u - (X_shift)), y);
        //printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);
    }

    else if (v >= Y_shift && u >= X_shift){
        matrix[data.Y*x + y] = getelementData(data, x, (v - (Y_shift)), (u - (X_shift)), y);
        temp = getelementData(data, x, (v - (Y_shift)), (u - (X_shift)), y);
        //printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);
    }

    //if((u==0 && v==0) && (x>15 && y>15)) // for testing
    //printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);
    //printf("Maaaaaa the vale got when y=%d x=%d u=%d v=%d is %.1f \n", y, x, u, v, matrix[0]);

    __syncthreads;
    //++++++++++++++++++++++++++++++++++++++++++++++++++++++
    /*
    //Section 1 for testing
    if (x == 0 && y == 0)
    {
        for (int m = (data.X*data.Y) - 1; m > 0; m--)
        {
            matrix[0] += matrix[m];
        }
    }
    */
    /*
    if (x == 0 && y == 0)
    {
        if (u < 5 && v <5)
            printf(" tooooooooooooooooootal       the vale got whenn u=%d  v=%d  is %f \n", u, v, matrix[data.Y*x + y]);

    }
    */
    //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    //__syncthreads;
    ///*
    //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    //sectiion 2 
    int different = (data.X*data.Y) - 256;
    if (local_idx < different-1)
    {
        printf("the value when x=%d y=%d u=%d v=%d  %d      %f to val %f is \n",x,y,u,v, local_idx, matrix[256 + local_idx], matrix[local_idx]);
        matrix[local_idx] += matrix[256 + local_idx];
    }


    __syncthreads;



    if (local_idx < 128)
    {
        matrix[local_idx] += matrix[local_idx + 128];

    }
    __syncthreads;
    if (local_idx < 64)
    {
        matrix[local_idx] += matrix[local_idx + 64];

    }
    __syncthreads;
    if (local_idx < 32) {
        matrix[local_idx] += matrix[local_idx + 32];
        matrix[local_idx] += matrix[local_idx + 16];
        matrix[local_idx] += matrix[local_idx + 8];
        matrix[local_idx] += matrix[local_idx + 4];
        matrix[local_idx] += matrix[local_idx + 2];
        matrix[local_idx] += matrix[local_idx + 1];
    }

    __syncthreads;
    if (local_idx == 0)
        imagedevice[data.V*u + v] = matrix[local_idx];

    //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
    __syncthreads;
    //*/
    /*
    if (x == 0 && y == 0)
    {
        if (5>u && 5>v)
            printf(" tooooooooooooooooootal       the vale got whenn u=%d  v=%d  is %f \n", u, v, matrix[0]);

    }
    */
}

/*
__global__ void intergershift(DataIn data)
{
int v = threadIdx.x;
int u = threadIdx.y;
int y = blockIdx.x;

int Height = blockDim.y;
extern __shared__ double* dataShared[];
double *** dataPoint = (double***)&dataShared;
double *** dataPointShifted = (double***)&dataShared[data.V];
double ** dataElements = (double**)&dataShared[2 * data.V];
dataPoint[v] = &dataShared[2 * data.V + Height*v];


}*/
int main()
{
    clock_t begin = clock();
    time_t start, end;
    int elements;
    int numberofdimension;

    //const char *file = "Bracelet.mat";
    //const size_t* dimepointer;



    //readmat thismat(file);
    //numberofdimension = thismat.getnumbrofdimensions();
    //dimepointer = thismat.dimensionpointer();

    size_t X, Y, U, V;
    X = 17;
    Y = 17;
    U = 512;
    V = 320;
    // Dimensions end
    DataIn data;
    data.U = U;
    data.Y = Y;
    data.X = X;
    data.V = V;
    size_t size = X*Y*U*V*sizeof(double);
    data.no_of_elements = X*Y*U*V;
    cudaError_t status;
    double * dataarray = new double[X*Y*U*V];
    for (int k = 0; k < U*V*X*Y; k++)
        dataarray[k] = 225;
    short * Device_data;
    cout << "the size is" << sizeof(Device_data) << endl;
    status = cudaSetDevice(0);
    status = cudaMalloc((void**)&data.elements, size);
    status = cudaMemcpy(data.elements, dataarray, size, cudaMemcpyHostToDevice);
    if (status != cudaSuccess) {
        fprintf(stderr, "Memory copyind original data failed");
        cudaFree(data.elements);
        //cudaFree(arrangeddata);

    }

    /*
    for (int t = 0; t < 10; t++)
    cout << *(thismat.getarraypointer()+t)<<" ";
    cout << endl << "original data printed" << endl;;
    */
    double*  image = new double[U*V];
    double *imagedevice;
    status = cudaMalloc((void**)&imagedevice, sizeof(double)*U*V);

    //__global__ void universaladd(DataIn data,int alpha,double* imagedevice)
    dim3 dimBlock(data.X, data.Y);
    dim3 dimGrid(data.U, data.V);
    universaladd << <dimGrid, dimBlock, sizeof(double)*X*Y >> >(data, 1, imagedevice);


    status = cudaGetLastError();
    fprintf(stderr, "Launch status: %s\n", cudaGetErrorString(status));

    status = cudaDeviceSynchronize();
    fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching\n", status);




    status = cudaMemcpy(image, imagedevice, sizeof(double)*U*V, cudaMemcpyDeviceToHost);
    if (status != cudaSuccess) {
        fprintf(stderr, "Memory copyind original data failed");
        cudaFree(data.elements);
    }
        //cudaFree(arrangeddata);
        for (int t = 0; t < 15; t++)
            cout << image[t] << " ";
        cout << endl << "data printed device";
    int k;
    std::cin >> k;
    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
在这里,从设备到主机的内存复制失败,我认为这是由于内核在完成任务之前停止了。然而,当我对照matlab检查内核中打印的值时,它们是正确的

[更新]这里增加WDDM TDR延迟后,循环运行良好,数值正确,但第2节仍然没有给出所需的结果。由于第2节运行良好,仅使用1D块和1xsize线程启动的阵列,我认为问题与启动配置与减少不兼容有关

当使用阵列缩减启动时,第1节将被注释

    Launch status: no error
cudaDeviceSynchronize returned error code 0 after launching
65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025
data printed device
Launch status: no error
cudaDeviceSynchronize returned error code 0 after launching
85275 7650 27675 58500 27450 103050 30375 17775 18000 12825 24750 95625 15975 68175 7425
这里没有关于将映像从设备复制到主机的错误

[编辑]


这是带有main的内核。之前我从mat文件加载数据,现在我已经用225值填充了数据。

最后我发现了问题。问题是由于缺乏同步。我打电话给uuu syncthreads;以错误的方式。它应该是\uuu syncthreads(),而不是同步线程

没有@talonmies谁也帮不了你谢谢你指出这一点。我已经更新了我的问题。我看不出你的编辑有什么帮助。您需要生成其他人可以编译和运行的最短、最简单的示例。@talomies我已经添加了主函数。问题是我从mat文件导入数据。