CUDA/C矩阵乘法_C_Cuda_Matrix Multiplication

CUDA/C矩阵乘法

c cuda

CUDA/C矩阵乘法,c,cuda,matrix-multiplication,C,Cuda,Matrix Multiplication,有人能告诉我我做错了什么吗？我正在尝试创建一个程序，使用cuda将矩阵返回给电源。似乎cudaMemcpy（ln103）没有返回结果数组。我通过返回矩阵中的第一个元素来检查它，但我总是得到0。也许我的内核有问题？如有任何帮助，将不胜感激：编辑：我应该澄清一下，内核是迭代的（从矩阵乘以各自的单位矩阵开始，然后再乘以之后的每个结果），直到k次，这就给了矩阵幂 i、 e.A是一个矩阵 A^0=I（单位矩阵） A^k=A^（k-1）*A 输入： <n> <power> <

有人能告诉我我做错了什么吗？我正在尝试创建一个程序，使用cuda将矩阵返回给电源。似乎cudaMemcpy（ln103）没有返回结果数组。我通过返回矩阵中的第一个元素来检查它，但我总是得到0。也许我的内核有问题？如有任何帮助，将不胜感激：

编辑：我应该澄清一下，内核是迭代的（从矩阵乘以各自的单位矩阵开始，然后再乘以之后的每个结果），直到k次，这就给了矩阵幂

i、 e.A是一个矩阵 A^0=I（单位矩阵） A^k=A^（k-1）*A

输入：

<n>
<power>
<element>
.....

新代码：

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

#define BLOCK 8
#define SIZE (BLOCK * 64)
#define TILE_SIZE (8)

int n;


float *
create_matrix_h(unsigned int w, unsigned int h) {
  float *m;
  m = (float *) malloc(w * h * sizeof(float));
  if (m == NULL) {
    fprintf(stderr, "Failed to malloc.\n");
    exit(1);
  }
  return m;
}

void
print_matrix(const float *m, const int w, const int h) {
  int x, y;
  for (y = 0; y != h; ++y) {
    for (x = 0; x != w; ++x)
      printf("%.03f ", m[y*w + x]);
    printf("\n");
  }
}


void
cpu_mult(const float *m1, const float *m2, float *m3, unsigned int width) {
  unsigned int i, j, k;
  float result;

  for (i = 0; i != width; ++i) {
    for (j = 0; j != width; ++j) {
      result = 0;
      for (k = 0; k != width; ++k)
        result += m1[i*width + k] * m2[k*width + j];
      m3[i*width + j] = result;
    }
  }
}


__global__ void
kernel3(const float *m1, const float *m2, float *m3, unsigned int width) {
  const unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
  const unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
  unsigned int t, i;
  float result = 0, a, b;

  for (t = 0; t < width / TILE_SIZE; ++t) {
    for (i = 0; i != TILE_SIZE; ++i) {
      a = m1[row*width + t*TILE_SIZE + i];
      b = m2[(t*TILE_SIZE + i)*width + col];
      result += a * b;
    }
    __syncthreads();
  }
  m3[row*width + col] = result;
}

float *
create_matrix_d(int w, int h) {
  float *m;
  if (cudaMalloc(&m, w * h * sizeof(float)) == cudaErrorMemoryAllocation) {
    fprintf(stderr, "Failed to cudaMalloc.\n");
    return NULL;
    //exit(1);
  }
  return m;
}

void
fill_matrix_h(float *const m, int w, int h, float *const values, int nvalues) {
  int i, j = 0;
  for (i = 0; i != w * h; ++i) {
    m[i] = values[j];
    j = (j + 1) % nvalues;
  }
}

int
main(void) {
    int k;
    if (scanf("%d", &n) !=1 || n<1){
        return 0;
    }
    if (scanf(" %d", &k) !=1 || k<0){
        return 0;
    }
    float *hm[3], *dm[3];
    dim3 bdim(TILE_SIZE, TILE_SIZE);
    dim3 gdim(SIZE/TILE_SIZE, SIZE/TILE_SIZE);
    int i;
    for(i=0; i<3; ++i) {
        hm[i] = create_matrix_h(SIZE, SIZE);
        dm[i] = create_matrix_d(SIZE, SIZE);
    }
    float tem[n*n];
    for(i=0; i<n*n; ++i) {
        if (scanf(" %f", &tem[i]) !=1){
            return 0;
        }
    }
    float temid[n*n];
    int j = 0;
    for (i = 0; i != n*n; ++i) {
        if (i==0 || j == n) { // not j + (n+1)
            temid[i] = 1;
            j=0;
        }
        else {
            temid[i] = 0;
            j++;
        }
    }
    fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
    fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
    cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    dm[1] = dm[0]; // For the first iteration Result = A * A;
    int w;
    if (k==0) {
        hm[2] = hm[1];
    }
    else if (k==1) {
        hm[2] = hm[0];
    }
    else {
        for (w=1; w<k; ++w) {
           kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
           cudaThreadSynchronize();
           // No need to copy back to host
           // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
           // Copy between device pointers
           cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice); 
        }
        cudaMemcpy(hm[2], dm[1], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost); 
    }



    print_matrix(hm[2], n, n);

    return 0;

 }

#包括
#包括
#包括
#包括
#定义块8
#定义大小（块*64）
#定义瓷砖大小（8）
int n；
浮动*
创建矩阵（无符号整数w，无符号整数h）{
浮动*m；
m=（浮动*）malloc（w*h*sizeof（浮动））；
如果（m==NULL）{
fprintf（stderr，“malloc失败。\n”）；
出口（1）；
}
返回m；
}
无效的
打印矩阵（常量浮点*m，常量整数w，常量整数h）{
int x，y；
对于（y=0；y！=h；++y）{
对于（x=0；x！=w；++x）
printf（“%.03f”，m[y*w+x]）；
printf（“\n”）；
}
}
无效的
cpu_mult（常数浮点*m1，常数浮点*m2，浮点*m3，无符号整数宽度）{
无符号整数i，j，k；
浮动结果；
对于（i=0；i！=width；++i）{
对于（j=0；j！=宽度；++j）{
结果=0；
对于（k=0；k！=width；++k）
结果+=m1[i*宽度+k]*m2[k*宽度+j]；
m3[i*宽度+j]=结果；
}
}
}
__全局无效
内核3（常量浮点*m1，常量浮点*m2，浮点*m3，无符号整数宽度）{
const unsigned int row=blockIdx.y*blockDim.y+threadIdx.y；
const unsigned int col=blockIdx.x*blockDim.x+threadIdx.x；
无符号整数t，i；
浮动结果=0，a，b；
对于（t=0；t如果（scanf（“%d”，&n）！=1 | | n您创建的单位矩阵错误
for (i = 0; i != n*n; ++i) {
        if (i==0 || i == j + (n)) { // not j + (n+1)
            temid[i] = 1;
            j = i;
        }
        else {
            temid[i] = 0;
        }
    }

事实上，您不需要乘以单位矩阵，因为您知道结果始终是输入
改变这个
fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
int w;
for (w=0; w<k; ++w) {
    cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
    cudaThreadSynchronize();
    cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    hm[1] = hm[2];
}

fill_matrix_h（hm[0]，SIZE，SIZE，tem，sizeof（tem）/sizeof（float））；
填充矩阵（hm[1]，尺寸，尺寸，temid，尺寸（temid）/sizeof（浮动））；
cudaMemcpy（dm[0]，hm[0]，SIZE*SIZE*sizeof（float），cudaMemcpyHostToDevice）；
int w；
因为（w=0；w我相信我们在这方面是一样的.作为对无限循环的响应，您的块大小/tilesize太小。请将块设置为16，然后重试。虽然D:，但我遇到了seg故障。您好，您能评论一下无限循环的含义吗？就像在每个循环开始处打印W的值一样？您好。嗯，当我在gdb中运行时，它可能不是一个循环，只是说“[新螺纹0x7ffff778b700（LWP 9650）]“”[新螺纹0x7ffff778a700（LWP 9651）]”etcuntil Probabley 64在哪里停了下来Shaha！是的，我想很多人都感受到了我们的痛苦，提交页面上有一片红色。哦，好吧，现在要担心考试了。如果这是家庭作业或课程作业，你真的应该给它贴上这样的标签。关于家庭作业问题在网上是否有效，这里有很多不同的意见谢谢，我以后会记住的
for (i = 0; i != n*n; ++i) {
        if (i==0 || i == j + (n)) { // not j + (n+1)
            temid[i] = 1;
            j = i;
        }
        else {
            temid[i] = 0;
        }
    }

fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
fill_matrix_h(hm[1], SIZE, SIZE, temid, sizeof(temid)/sizeof(float));
cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
int w;
for (w=0; w<k; ++w) {
    cudaMemcpy(dm[1], hm[1], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
    kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
    cudaThreadSynchronize();
    cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
    hm[1] = hm[2];
}

fill_matrix_h(hm[0], SIZE, SIZE, tem, sizeof(tem)/sizeof(float));
cudaMemcpy(dm[0], hm[0], SIZE*SIZE*sizeof(float), cudaMemcpyHostToDevice);
dm[1] = dm[0]; // For the first iteration Result = A * A;
int w;
for (w=0; w<k; ++w) {
   kernel3<<<gdim, bdim>>>(dm[0], dm[1], dm[2], SIZE);
   cudaThreadSynchronize();
   // No need to copy back to host
   // cudaMemcpy(hm[2], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToHost);
   // Copy between device pointers
   cudaMemcpy(dm[1], dm[2], SIZE*SIZE*sizeof(float), cudaMemcpyDeviceToDevice); 
}