Cuda 内核内的求和数组不';行不通
在下面的代码中,将数据加载到共享数组后,我将尝试对共享内存中的数组求和。加载的数组大小是289,下面是我的内核和mainCuda 内核内的求和数组不';行不通,cuda,sum,Cuda,Sum,在下面的代码中,将数据加载到共享数组后,我将尝试对共享内存中的数组求和。加载的数组大小是289,下面是我的内核和main #include <cuda.h> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include<iostream> #include <stdio.h> //#include "readmat.cuh" //#include "mat.h" #includ
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<iostream>
#include <stdio.h>
//#include "readmat.cuh"
//#include "mat.h"
#include <device_functions.h>
#include <time.h>
#include <ctime>
//#include "opencv2/highgui/highgui.hpp"
using namespace std;
//using namespace cv;
typedef struct {
size_t X;
size_t Y;
size_t U;
size_t V;
double* elements;
int no_of_elements;
int alpha;
} DataIn;
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);
__device__ double getelementData(DataIn data, int x, int v, int u, int y)
{
int index = data.X*data.Y*data.U*(v)+data.X*data.Y*(u)+data.X*(y)+x;
return data.elements[index];
}
__global__ void universaladd(DataIn data,int alpha,double* imagedevice)
{
int v = blockIdx.y;
int u = blockIdx.x;
int y = threadIdx.y;
int x = threadIdx.x;
/*
if (x == 0 && y == 0)
{
if (u == 0)
printf(" tooooooooooooooooootal the vale got whenn u=%d v=%d is %f \n", u, v);
if (v == 0)
printf(" tooooooooooooooooootal the vale got whenn u=%d v=%d is %f \n", u, v);
}
*/
double temp;
int local_idx = (blockDim.y*threadIdx.x) + threadIdx.y;
extern __shared__ double matrix[];
int m = alpha - 1;
int Y_shift = y*m;
int X_shift = x*m;
if (v < Y_shift && u < X_shift){
matrix[data.Y*x + y] = getelementData(data, x, (data.V - (Y_shift - v)), (data.U - (X_shift - u)), y);
temp = getelementData(data, x, (data.V - (Y_shift - v)), (data.U - (X_shift - u)), y);
//printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);
}
else if (v >= Y_shift && u < X_shift){
matrix[data.Y*x + y] = getelementData(data, x, (v - (Y_shift)), (data.U - (X_shift - u)), y);
temp = getelementData(data, x, (v - (Y_shift)), (data.U - (X_shift - u)), y);
//printf("the vale got when y=%d x=%d u=%d v=%d is %f\n ", y, x, u, v, temp);
}
else if (v < Y_shift && u >= X_shift){
matrix[data.Y*x + y] = getelementData(data, x, (data.V - (Y_shift - v)), (u - (X_shift)), y);
temp = getelementData(data, x, (data.V - (Y_shift - v)), (u - (X_shift)), y);
//printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);
}
else if (v >= Y_shift && u >= X_shift){
matrix[data.Y*x + y] = getelementData(data, x, (v - (Y_shift)), (u - (X_shift)), y);
temp = getelementData(data, x, (v - (Y_shift)), (u - (X_shift)), y);
//printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);
}
//if((u==0 && v==0) && (x>15 && y>15)) // for testing
//printf("the vale got when y=%d x=%d u=%d v=%d is %f \n", y, x, u, v, temp);
//printf("Maaaaaa the vale got when y=%d x=%d u=%d v=%d is %.1f \n", y, x, u, v, matrix[0]);
__syncthreads;
//++++++++++++++++++++++++++++++++++++++++++++++++++++++
/*
//Section 1 for testing
if (x == 0 && y == 0)
{
for (int m = (data.X*data.Y) - 1; m > 0; m--)
{
matrix[0] += matrix[m];
}
}
*/
/*
if (x == 0 && y == 0)
{
if (u < 5 && v <5)
printf(" tooooooooooooooooootal the vale got whenn u=%d v=%d is %f \n", u, v, matrix[data.Y*x + y]);
}
*/
//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//__syncthreads;
///*
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//sectiion 2
int different = (data.X*data.Y) - 256;
if (local_idx < different-1)
{
printf("the value when x=%d y=%d u=%d v=%d %d %f to val %f is \n",x,y,u,v, local_idx, matrix[256 + local_idx], matrix[local_idx]);
matrix[local_idx] += matrix[256 + local_idx];
}
__syncthreads;
if (local_idx < 128)
{
matrix[local_idx] += matrix[local_idx + 128];
}
__syncthreads;
if (local_idx < 64)
{
matrix[local_idx] += matrix[local_idx + 64];
}
__syncthreads;
if (local_idx < 32) {
matrix[local_idx] += matrix[local_idx + 32];
matrix[local_idx] += matrix[local_idx + 16];
matrix[local_idx] += matrix[local_idx + 8];
matrix[local_idx] += matrix[local_idx + 4];
matrix[local_idx] += matrix[local_idx + 2];
matrix[local_idx] += matrix[local_idx + 1];
}
__syncthreads;
if (local_idx == 0)
imagedevice[data.V*u + v] = matrix[local_idx];
//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
__syncthreads;
//*/
/*
if (x == 0 && y == 0)
{
if (5>u && 5>v)
printf(" tooooooooooooooooootal the vale got whenn u=%d v=%d is %f \n", u, v, matrix[0]);
}
*/
}
/*
__global__ void intergershift(DataIn data)
{
int v = threadIdx.x;
int u = threadIdx.y;
int y = blockIdx.x;
int Height = blockDim.y;
extern __shared__ double* dataShared[];
double *** dataPoint = (double***)&dataShared;
double *** dataPointShifted = (double***)&dataShared[data.V];
double ** dataElements = (double**)&dataShared[2 * data.V];
dataPoint[v] = &dataShared[2 * data.V + Height*v];
}*/
int main()
{
clock_t begin = clock();
time_t start, end;
int elements;
int numberofdimension;
//const char *file = "Bracelet.mat";
//const size_t* dimepointer;
//readmat thismat(file);
//numberofdimension = thismat.getnumbrofdimensions();
//dimepointer = thismat.dimensionpointer();
size_t X, Y, U, V;
X = 17;
Y = 17;
U = 512;
V = 320;
// Dimensions end
DataIn data;
data.U = U;
data.Y = Y;
data.X = X;
data.V = V;
size_t size = X*Y*U*V*sizeof(double);
data.no_of_elements = X*Y*U*V;
cudaError_t status;
double * dataarray = new double[X*Y*U*V];
for (int k = 0; k < U*V*X*Y; k++)
dataarray[k] = 225;
short * Device_data;
cout << "the size is" << sizeof(Device_data) << endl;
status = cudaSetDevice(0);
status = cudaMalloc((void**)&data.elements, size);
status = cudaMemcpy(data.elements, dataarray, size, cudaMemcpyHostToDevice);
if (status != cudaSuccess) {
fprintf(stderr, "Memory copyind original data failed");
cudaFree(data.elements);
//cudaFree(arrangeddata);
}
/*
for (int t = 0; t < 10; t++)
cout << *(thismat.getarraypointer()+t)<<" ";
cout << endl << "original data printed" << endl;;
*/
double* image = new double[U*V];
double *imagedevice;
status = cudaMalloc((void**)&imagedevice, sizeof(double)*U*V);
//__global__ void universaladd(DataIn data,int alpha,double* imagedevice)
dim3 dimBlock(data.X, data.Y);
dim3 dimGrid(data.U, data.V);
universaladd << <dimGrid, dimBlock, sizeof(double)*X*Y >> >(data, 1, imagedevice);
status = cudaGetLastError();
fprintf(stderr, "Launch status: %s\n", cudaGetErrorString(status));
status = cudaDeviceSynchronize();
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching\n", status);
status = cudaMemcpy(image, imagedevice, sizeof(double)*U*V, cudaMemcpyDeviceToHost);
if (status != cudaSuccess) {
fprintf(stderr, "Memory copyind original data failed");
cudaFree(data.elements);
}
//cudaFree(arrangeddata);
for (int t = 0; t < 15; t++)
cout << image[t] << " ";
cout << endl << "data printed device";
int k;
std::cin >> k;
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
在这里,从设备到主机的内存复制失败,我认为这是由于内核在完成任务之前停止了。然而,当我对照matlab检查内核中打印的值时,它们是正确的
[更新]这里增加WDDM TDR延迟后,循环运行良好,数值正确,但第2节仍然没有给出所需的结果。由于第2节运行良好,仅使用1D块和1xsize线程启动的阵列,我认为问题与启动配置与减少不兼容有关
当使用阵列缩减启动时,第1节将被注释
Launch status: no error
cudaDeviceSynchronize returned error code 0 after launching
65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025 65025
data printed device
Launch status: no error
cudaDeviceSynchronize returned error code 0 after launching
85275 7650 27675 58500 27450 103050 30375 17775 18000 12825 24750 95625 15975 68175 7425
这里没有关于将映像从设备复制到主机的错误
[编辑]
这是带有main的内核。之前我从mat文件加载数据,现在我已经用225值填充了数据。最后我发现了问题。问题是由于缺乏同步。我打电话给uuu syncthreads;以错误的方式。它应该是\uuu syncthreads(),而不是同步线程 没有@talonmies谁也帮不了你谢谢你指出这一点。我已经更新了我的问题。我看不出你的编辑有什么帮助。您需要生成其他人可以编译和运行的最短、最简单的示例。@talomies我已经添加了主函数。问题是我从mat文件导入数据。