对CUDA中矩阵的行（按行主顺序或列主顺序存储）求和_Cuda

对CUDA中矩阵的行（按行主顺序或列主顺序存储）求和

cuda

对CUDA中矩阵的行（按行主顺序或列主顺序存储）求和,cuda,Cuda,我正在研究CUDA中矩阵行的求和问题。我举下面的例子假设有以下20*4数组： 1 2 3 4 4 1 2 3 3 4 1 2 . 1 2 3 4 . . . . . . . . 2 1 3 4 将2d数组展平为1d数组（以行主顺序或列主顺序）后，我需要将每个线程分配到不同的行，并计算该行的成本例如 -线程1应计算1234 -线程2应该计算4 1 2 3 在CUDA我怎么能做到谢谢大家的回复#包括 #include <stdio.h> #include <stdlib.

我正在研究CUDA中矩阵行的求和问题。我举下面的例子

假设有以下 20*4 数组：

1 2 3 4
4 1 2 3
3 4 1 2
.
1 2 3 4
. . . .
. . . .
2 1 3 4

将2d数组展平为1d数组（以行主顺序或列主顺序）后，我需要将每个线程分配到不同的行，并计算该行的成本

例如
-线程1应计算

1 2 3 4

-线程2应计算

4 1 2 3

在CUDA我怎么能做到

谢谢大家的回复

#包括
#include <stdio.h>
#include <stdlib.h>
#define MROWS 20
#define NCOLS 4
#define nTPB 256

/*
 * Sums each row of a row-major rows x cols int matrix, one thread per row.
 *
 * costdata : device pointer to rows*cols ints; element (r, c) is at r*cols + c
 * rows     : number of rows; threads whose global index is >= rows do nothing
 * cols     : number of columns in each row
 * results  : device pointer to rows ints; results[r] receives the sum of row r
 *
 * Expects a 1D launch with at least `rows` threads in total.
 */
__global__ void mykernel(int *costdata, int rows, int cols, int *results){
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx >= rows) return;
  // Widen before multiplying: tidx*cols in 32-bit int wraps once the matrix
  // holds more than INT_MAX elements.
  const size_t rowstart = (size_t)tidx * (size_t)cols;
  int mycost = 0;
  for (int i = 0; i < cols; i++)
    mycost += costdata[rowstart + i];
  results[tidx] = mycost;
  }

// Aborts with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t err, const char *what){
  if (err != cudaSuccess){
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
    exit(1);
    }
  }

/*
 * Fills an MROWS x NCOLS row-major matrix with random values in [0, 3],
 * sums every row on the device with mykernel, and prints each device
 * result next to a host-computed reference sum.
 */
int main(){
  //define and initialize host and device storage for cost and results
  int *d_costdata = NULL, *h_costdata = NULL, *d_results = NULL, *h_results = NULL;
  h_results = (int *)malloc(MROWS*sizeof(int));
  h_costdata = (int *)malloc(MROWS*NCOLS*sizeof(int));
  if (h_results == NULL || h_costdata == NULL){
    fprintf(stderr, "host allocation failed\n");
    free(h_results);
    free(h_costdata);
    return 1;
    }
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = rand()%4;
  checkCuda(cudaMalloc((void **)&d_results, MROWS*sizeof(int)), "cudaMalloc results");
  checkCuda(cudaMalloc((void **)&d_costdata, MROWS*NCOLS*sizeof(int)), "cudaMalloc costdata");
  //copy cost data from host to device
  checkCuda(cudaMemcpy(d_costdata, h_costdata, MROWS*NCOLS*sizeof(int), cudaMemcpyHostToDevice), "copy costdata to device");
  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results);
  // a launch does not return a status; a bad configuration shows up here
  checkCuda(cudaGetLastError(), "mykernel launch");
  // copy results back from device to host; this blocking copy also reports
  // any fault raised while the kernel was running
  checkCuda(cudaMemcpy(h_results, d_results, MROWS*sizeof(int), cudaMemcpyDeviceToHost), "copy results to host");
  for (int i=0; i<MROWS; i++){
    int loc_cost = 0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    printf("cost[%d]: host= %d, device = %d\n", i, loc_cost, h_results[i]);
    }
  // release device and host storage
  checkCuda(cudaFree(d_costdata), "cudaFree costdata");
  checkCuda(cudaFree(d_results), "cudaFree results");
  free(h_costdata);
  free(h_results);
  return 0;
  }

#包括
#定义MROWS 20
#定义NCOL4
#定义nTPB 256
__全局无效mykernel（int*costdata、int行、int列、int*results）{
int tidx=threadIdx.x+blockDim.x*blockIdx.x；
如果（tidx<行）{
int-mycost=0；
for（int i=0；i#包括
#包括
#定义MROWS 20
#定义NCOL4
#定义nTPB 256
__全局无效mykernel（int*costdata、int行、int列、int*results）{
int tidx=threadIdx.x+blockDim.x*blockIdx.x；
如果（tidx<行）{
int-mycost=0；
for（int i=0；i对于（int i=0；i）是否按行主顺序或列主顺序存储展平数组？成本是多少计算？是求和吗？
行主顺序扁平化数组
成本计算是两个其他扁平化2d数组的乘积部门和位置之间的距离和流量。其他方法请参见此处。您是否按行主顺序或列主顺序存储扁平化数组？什么是“成本”计算？它是一个总和吗？
按行主顺序展平（flattened）的数组
成本计算是两个其他扁平2d数组的乘积部门和地点之间的距离和流量。其他方法请参见此处。我想我看到了代码的“第一”版本（在您发布后，您仍在5分钟的编辑窗口内）。内核调用缺失，内核本身看起来也与现在大不相同。对于cols=4
的特殊情况，您可以使用int4
加载来代替单个int
加载并释放循环。内存吞吐量也应该大大提高。经过一些修改，我可以使用16字节的加载来int
，float
，或double
，用于任意数量的列。在cols=4
的特殊情况下（对于int
或float
，或cols=2
对于double
）它将提供完美的合并。在其他情况下，它可能会稍微提高效率。在这种方法中，我仍然需要一个循环来处理任意数量的列。或者我可以只按列的主要顺序存储东西，享受100%的合并任意数量的列和代码的简单性。@RobertCrovella极好的回答。我只是好奇。您使用的是1d
网格。当计算能力>=3.5
可用时，是否可以使用2d
网格，并利用洗牌操作来总结元素？您认为这样做有什么好处吗？我想我看到了代码的“第一个”版本（当你还在5分钟的编辑窗口内发布后）。内核调用缺失，内核本身看起来也与现在大不相同。对于cols=4
的特殊情况，您可以使用int4
加载来代替单个int
加载并释放循环。内存吞吐量也应该大大提高。经过一些修改，我可以使用16字节的加载来int
，float
，或double
，用于任意数量的列。在cols=4
的特殊情况下（对于int
或float
，或cols=2
对于double
）它将提供完美的合并。在其他情况下，它可能会稍微提高效率。在这种方法中，我仍然需要一个循环来处理任意数量的列。或者我可以只按列的主要顺序存储东西，享受100%的合并任意数量的列和代码的简单性。@RobertCrovella极好的回答。我只是好奇。您正在使用1d
网格。当计算能力>=3.5
可用时，是否可以使用2d网格，并利用shuffle（洗牌）操作来对元素求和？您认为这样做有什么好处吗？
// Column-major form of the per-row summation loop: element (row tidx,
// column i) is read from index i*rows + tidx, so for a fixed i threads with
// consecutive tidx values read consecutive array elements.
for (int i = 0; i < cols; i++)
  mycost += costdata[(i*rows)+tidx];

#include <iostream>
#include <typeinfo>
#include <cstdlib>
#include <vector_types.h>

#define MROWS 1742
#define NCOLS 801
#define nTPB 256

typedef double mytype;

// Reports the byte width of the build-time element type `mytype`:
// 8 for double, 4 for float/int/unsigned int, 1 for char/unsigned char.
// Returns 0 when `mytype` is none of those.
__host__ int sizetype(){
  const std::type_info &elem = typeid(mytype);
  if (elem == typeid(double))
    return 8;
  if (elem == typeid(float) || elem == typeid(int) || elem == typeid(unsigned int))
    return 4;
  if (elem == typeid(char) || elem == typeid(unsigned char))
    return 1;
  return 0;
  }


/*
 * Sums each row of a pitched, row-major matrix of T, one thread per row.
 *
 * costdata : device pointer to the first row
 * rows     : number of rows; threads whose global index is >= rows do nothing
 * cols     : number of valid elements per row
 * results  : device pointer to rows elements; results[r] receives the sum of row r
 * size     : element size in bytes, expected to equal sizeof(T)
 * pitch    : distance in bytes between the starts of consecutive rows
 *
 * Rows are read 16 bytes at a time while a full 16-byte chunk remains, then
 * element by element. The 16-byte path is taken only when `size` matches
 * sizeof(T), divides 16, and both the base pointer and the pitch are
 * multiples of 16; otherwise every element is read individually, so the
 * result does not depend on those conditions.
 *
 * Expects a 1D launch with at least `rows` threads in total.
 */
template<typename T>
__global__ void mykernel(const T *costdata, int rows, int cols, T *results, int size, size_t pitch){
  int tidx = threadIdx.x + blockDim.x*blockIdx.x;
  if (tidx >= rows) return;

  // An int4 load needs a 16-byte aligned address and a whole number of
  // elements per 16 bytes; a zero or mismatched size previously divided by
  // zero or never terminated the inner loop.
  const bool vectorok = (size == (int)sizeof(T)) && ((16 % size) == 0)
                        && ((pitch % 16) == 0) && ((((size_t)costdata) % 16) == 0);
  const int chunk = vectorok ? 16/size : 0;

  const T *myrowptr = (const T *)(((const unsigned char *)costdata) + (size_t)tidx*pitch);
  T mycost = (T)0;
  int count = 0;
  if (vectorok){
    while ((cols-count) >= chunk){
      // read 16 bytes
      const int4 temp = *((const int4 *)(myrowptr + count));
      const T *vals = (const T *)(&temp);
      for (int j = 0; j < chunk; j++)
        mycost += vals[j];
      count += chunk;
      }
    }
  // read one quantity at a time for whatever is left
  for (; count < cols; count++)
    mycost += myrowptr[count];
  // Single store after the loops: the original stored inside the loop, which
  // wrote on every pass and wrote nothing at all when cols was 0.
  results[tidx] = mycost;
}

// Exits with a diagnostic if a CUDA runtime call did not succeed.
static void requireCudaSuccess(cudaError_t status, const char *what){
  if (status != cudaSuccess){
    std::cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << std::endl;
    std::exit(1);
    }
  }

/*
 * Fills an MROWS x NCOLS matrix of `mytype` with random values in [0, 3],
 * copies it into a pitched device allocation, sums every row with mykernel,
 * and compares each device result with a host-computed reference.
 * Returns 0 when all rows match, 1 on an unsupported type, a failed host
 * allocation, or the first mismatch.
 */
int main(){
  int typesize = sizetype();
  if (typesize == 0) {std::cout << "invalid type selected" << std::endl; return 1;}
  //define and initialize host and device storage for cost and results
  mytype *d_costdata = NULL, *h_costdata = NULL, *d_results = NULL, *h_results = NULL;
  h_results = (mytype *)malloc(MROWS*sizeof(mytype));
  h_costdata = (mytype *)malloc(MROWS*NCOLS*sizeof(mytype));
  if (h_results == NULL || h_costdata == NULL){
    std::cerr << "host allocation failed" << std::endl;
    free(h_results);
    free(h_costdata);
    return 1;
    }
  for (int i=0; i<(MROWS*NCOLS); i++)
    h_costdata[i] = (mytype)(rand()%4);
  size_t pitch = 0;
  requireCudaSuccess(cudaMalloc((void **)&d_results, MROWS*sizeof(mytype)), "cudaMalloc results");
  requireCudaSuccess(cudaMallocPitch((void **)&d_costdata, &pitch, NCOLS*sizeof(mytype), MROWS), "cudaMallocPitch costdata");
  //copy cost data from host to device
  requireCudaSuccess(cudaMemcpy2D(d_costdata, pitch, h_costdata, NCOLS*sizeof(mytype), NCOLS*sizeof(mytype),  MROWS, cudaMemcpyHostToDevice), "copy costdata to device");

  mykernel<<<(MROWS + nTPB - 1)/nTPB, nTPB>>>(d_costdata, MROWS, NCOLS, d_results, typesize, pitch);
  // a launch does not return a status; a bad configuration shows up here
  requireCudaSuccess(cudaGetLastError(), "mykernel launch");
  // copy results back from device to host; this blocking copy also reports
  // any fault raised while the kernel was running
  requireCudaSuccess(cudaMemcpy(h_results, d_results, MROWS*sizeof(mytype), cudaMemcpyDeviceToHost), "copy results to host");

  int exitcode = 0;
  for (int i=0; i<MROWS; i++){
    mytype loc_cost = (mytype)0;
    for (int j=0; j<NCOLS; j++) loc_cost += h_costdata[(i*NCOLS)+j];
    if ((i < 10) && (typesize > 1))
      std::cout <<"cost[" << i << "]: host= " << loc_cost << ", device = " << h_results[i] << std::endl;
    if (loc_cost != h_results[i]){
      std::cout << "mismatch at index" << i << "should be:" << loc_cost << "was:" << h_results[i] << std::endl;
      exitcode = 1;
      break;  // stop at the first mismatch, but still release the buffers below
      }
    }
  if (exitcode == 0)
    std::cout << "Results are correct!" << std::endl;

  // release device and host storage on both the success and mismatch paths
  requireCudaSuccess(cudaFree(d_costdata), "cudaFree costdata");
  requireCudaSuccess(cudaFree(d_results), "cudaFree results");
  free(h_costdata);
  free(h_results);
  return exitcode;
  }