Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/c/55.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C 有效地转置大(密集)二元矩阵_C_Matrix_Cuda_Linear Algebra_Transpose - Fatal编程技术网

C 有效地转置大(密集)二元矩阵

C 有效地转置大(密集)二元矩阵,c,matrix,cuda,linear-algebra,transpose,C,Matrix,Cuda,Linear Algebra,Transpose,有一个二进制矩阵(一个由0和1组成的矩阵),我想转置它。矩阵的每一行是一个由32位整数组成的一维数组,整个矩阵是一个由行组成的一维数组 下面是一个128 x 128二进制矩阵的示例,由128行128/3232位整数组成。(实际上,矩阵是一个nxn矩阵,因为N有成千上万个。) //gcc示例0.c-std=c99&&./a.out #包括 #包括 #包括 类型定义uint32\u t uint32; #定义N(32*4)//假设它可以被32整除。该矩阵是一个nxn矩阵 #每行定义整数(N/32)

有一个二进制矩阵(一个由0和1组成的矩阵),我想转置它。矩阵的每一行是一个由32位整数组成的一维数组,整个矩阵是一个由行组成的一维数组

下面是一个 128 x 128 二进制矩阵的示例,它由 128 行组成,每行包含 128/32 个 32 位整数。(实际上,矩阵是一个 N x N 矩阵,其中 N 为数万量级。)

//gcc示例0.c-std=c99&&./a.out
#包括
#包括
#包括
类型定义uint32\u t uint32;
#定义N(32*4)//假设它可以被32整除。该矩阵是一个nxn矩阵
#每行定义整数(N/32)
int main(int argc,字符**argv){
uint32**矩阵=malloc(N*sizeof(uint32*);

对于(int i=0;i …(问题中的示例代码在此处被截断)

这里有一种可能的方法,使用 @tera 建议的 __ballot():

  • 以合并的方式,将数据按分片加载到共享内存中
  • 在加载过程中,在共享内存中进行初始转置,将列排列成行,以便于后续的 __ballot() 操作
  • 执行投票,从列数据(已存储在行中)中构建“最终”行数据
  • 把数据写出来
  • 步骤 1-3 在 GPU 上应该相当高效,至少据我所知是这样。步骤 4 则受制于这样一个事实:对位块做逐位转置后,得到的一组 32 位量在内存中并不相邻。因此,在一般情况下,写回全局内存时的访问模式是分散的

    输出索引的计算是其中最难直观想象和安排的部分

    下面是一个包含4个测试用例的完整示例。对于这些测试用例,我还编写了一个朴素的CPU函数来执行“黄金”测试,借用了下面的问题

    $ cat t435.cu
    #include <stdio.h>
    #include <stdlib.h>
    
    // Element (x, y) of the row-major array d whose rows are ld elements long.
    // Every argument is parenthesized so that expression arguments (for example
    // ld = idim/32) expand with the intended precedence instead of being merged
    // into the surrounding multiplication.
    #define IDX(d,x,y,ld) ((d)[(y)*(ld)+(x)])
    
    // Reads (and clears) the CUDA runtime's last recorded error via
    // cudaGetLastError(). If it is anything other than cudaSuccess, prints msg,
    // the runtime's error string and the file/line of the macro use to stderr,
    // then terminates the process with exit status 1.
    // Intended to be placed directly after the CUDA API call or kernel launch
    // that should be checked. The do { ... } while (0) wrapper makes the macro
    // behave as a single statement.
    #define cudaCheckErrors(msg) \
      do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
      } while (0)
    
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL

    // Wall-clock helper built on gettimeofday().
    // Returns the current time in microseconds minus `start`; call it with 0 to
    // take a timestamp, then pass that timestamp back in to get the elapsed time.
    unsigned long dtime_usec(unsigned long start){
      struct timeval now;
      gettimeofday(&now, 0);
      const unsigned long long stamp_usec = (now.tv_sec*USECPSEC)+now.tv_usec;
      return stamp_usec-start;
    }
    // Bit-level transpose of a square binary matrix stored row-major, 32 bits per
    // unsigned word, most significant bit first within each word.
    //
    //   in   - input matrix, idim rows of idim/32 words each
    //   out  - output (transposed) matrix, same layout and size as in
    //   idim - matrix dimension in bits
    //
    // assumes "square" binary matrix transpose
    // assumes kernel is launched with 1024 threads per block, 2D, 32x32
    // assumes input matrix dimension idim is evenly divisible by 32 bits
    // assumes idim is specified in bits
    //
    // Each block handles a tile of 32 words (x) by 32 rows (y). The grid must
    // cover idim/32 words in x and idim rows in y; surplus blocks are harmless.
    __global__ void bt(const unsigned * __restrict__ in, unsigned * __restrict__ out, const unsigned idim){

      // 33 columns: only 0..31 are indexed, the extra one is the usual padding
      // against shared-memory bank conflicts on column-wise access.
      __shared__ unsigned smem[32][33];
      const unsigned wpr = idim/32;                               // words per matrix row
      const unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;     // word column
      const unsigned idy = threadIdx.y+blockDim.y*blockIdx.y;     // bit row
      const bool in_range = (idx < wpr)&&(idy < idim);

      // Load the tile transposed at word granularity; out-of-range threads supply
      // zeros. The index is formed in size_t so idy*wpr cannot wrap in 32 bits.
      smem[threadIdx.x][threadIdx.y] = in_range ? in[(size_t)idy*wpr+idx] : 0;
      __syncthreads();
      // Lane x of warp y picks up the word from tile column y, tile row 31-x, so
      // that ballot bit 31 corresponds to tile row 0 (MSB-first output words).
      const unsigned myval = smem[threadIdx.y][31-threadIdx.x];
      __syncthreads();   // every thread has read its word before smem is overwritten
      // One warp vote per bit position: row b of smem receives the transposed word
      // built from bit (31-b) of the 32 words held by this warp. All lanes store
      // the same ballot result to the same location.
      #pragma unroll
      for (int b = 0; b < 32; b++)
        smem[b][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<(31-b))&myval));
      __syncthreads();
      // Output row is idx*32 + threadIdx.y, output word column is blockIdx.y.
      // The row stride comes from idim rather than gridDim.y, so the result does
      // not depend on the grid being exactly idim/32 blocks tall.
      const size_t indx = ((size_t)idx*idim)+((size_t)wpr*threadIdx.y)+blockIdx.y;
      if (in_range) out[indx] = smem[threadIdx.y][threadIdx.x];
    }
    
    // Reference (CPU) bit-level transpose of a square binary matrix.
    //
    //   in   - input matrix: idim rows, each idim/32 unsigned words, MSB-first
    //   out  - receives the transposed matrix in the same layout; fully overwritten
    //   idim - matrix dimension in bits (expected to be a multiple of 32)
    //
    // For every set bit at (row j, column c) of the input, sets bit (row c,
    // column j) of the output.
    void naive_cpu_bt(const unsigned *in, unsigned *out, const unsigned idim){
      const size_t wpr = idim/32;                       // words per row
      memset(out, 0, (size_t)idim*wpr*sizeof(unsigned));
      for (size_t j = 0; j < idim; j++){                // input row
        // Output bit for this input row; unsigned literal so that shifting into
        // bit 31 is well defined.
        const unsigned rowmask = 1U<<(31 - (j%32));
        for (size_t i = 0; i < wpr; i++){               // input word within the row
          const unsigned word = in[(j*wpr) + i];
          for (unsigned bit = 0; bit < 32; bit++){
            // Bit `bit` of word i is column i*32 + (31-bit) in MSB-first order.
            if ((word>>bit)&1U) out[(((i*32)+(31-bit))*wpr)+(j/32)] |= rowmask;
          }
        }
      }
    }
    
    // Allocates host storage for an idim x idim bit matrix (idim/32 words per
    // row). Prints a message and exits if the allocation fails.
    static unsigned *alloc_bit_matrix(unsigned idim){
      unsigned *p = (unsigned *)malloc((size_t)idim*(idim/32)*sizeof(unsigned));
      if (p == NULL) {
        fprintf(stderr, "host allocation failed for idim = %u\n", idim);
        exit(1);
      }
      return p;
    }

    // Prints the matrix one row per line, every word as "0x%8x", words separated
    // by a single space.
    static void print_bit_matrix(const unsigned *m, unsigned idim){
      const unsigned wpr = idim/32;
      for (unsigned r = 0; r < idim; r++)
        for (unsigned w = 0; w < wpr; w++)
          printf("0x%8x%c", m[(size_t)r*wpr + w], (w+1 < wpr) ? ' ' : '\n');
    }

    // Transposes h_idata on the GPU into h_odata and compares the result with
    // naive_cpu_bt(). Every CUDA call and the kernel launch are checked with
    // cudaCheckErrors, which aborts the process on failure.
    //
    //   gpu_us    - if non-NULL, receives H2D copy + kernel + D2H copy time (usec)
    //   kernel_us - if non-NULL, receives kernel time including the sync (usec)
    //   cpu_us    - if non-NULL, receives the CPU reference time (usec)
    //
    // Returns 0 when the results match, -1 after reporting the first mismatch.
    static int transpose_and_verify(const unsigned *h_idata, unsigned *h_odata, unsigned idim,
                                    unsigned long *gpu_us, unsigned long *kernel_us, unsigned long *cpu_us){
      const size_t words = (size_t)idim*(idim/32);
      const size_t bytes = words*sizeof(unsigned);
      unsigned *c_odata = alloc_bit_matrix(idim);
      unsigned *d_idata = NULL, *d_odata = NULL;
      cudaMalloc(&d_idata, bytes);
      cudaCheckErrors("cudaMalloc of d_idata failed");
      cudaMalloc(&d_odata, bytes);
      cudaCheckErrors("cudaMalloc of d_odata failed");

      // One block covers 32 words (1024 bit columns) by 32 rows.
      const dim3 block(32,32);
      const dim3 grid((idim/1024)+((idim%1024)?1:0), idim/32);

      unsigned long gt = dtime_usec(0);
      cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
      cudaCheckErrors("host to device copy failed");
      unsigned long gkt = dtime_usec(0);
      bt<<<grid,block>>>(d_idata, d_odata, idim);
      cudaCheckErrors("bt kernel launch failed");
      cudaDeviceSynchronize();
      cudaCheckErrors("bt kernel execution failed");
      gkt = dtime_usec(gkt);
      cudaMemcpy(h_odata, d_odata, bytes, cudaMemcpyDeviceToHost);
      cudaCheckErrors("device to host copy failed");
      gt = dtime_usec(gt);
      cudaFree(d_idata);
      cudaFree(d_odata);
      cudaCheckErrors("cudaFree failed");

      unsigned long ct = dtime_usec(0);
      naive_cpu_bt(h_idata, c_odata, idim);
      ct = dtime_usec(ct);

      int rc = 0;
      for (size_t i = 0; i < words; i++)
        if (c_odata[i] != h_odata[i]) {
          printf("mismatch at %u, was: %u, should be: %u\n", (unsigned)i, h_odata[i], c_odata[i]);
          rc = -1;
          break;
        }
      free(c_odata);
      if (gpu_us) *gpu_us = gt;
      if (kernel_us) *kernel_us = gkt;
      if (cpu_us) *cpu_us = ct;
      return rc;
    }

    // Runs four transpose test cases (32x32, 64x64, 96x96 with fixed patterns and
    // 8192x8192 random), verifying each GPU result against the CPU reference.
    // The three small results are printed; the large case prints timings.
    // Returns 0 on success, -1 on the first mismatch.
    int main(){

      unsigned *h_idata, *h_odata;
      unsigned idim, data;
      int rc;
    // test case 1, 32x32, upper triangular
      idim = 32;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      data = 0x0FFFFFFFFU;
      for (int i = 0; i < 32; i++){
        h_idata[i] = data;
        data>>=1;}
      rc = transpose_and_verify(h_idata, h_odata, idim, NULL, NULL, NULL);
      if (rc == 0) print_bit_matrix(h_odata, idim);
      free(h_idata);
      free(h_odata);
      if (rc != 0) return rc;
    // test case 2, 64x64, opposite diagonal
      idim = 64;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      data = 0x01;
      for (int i = 0; i < 32; i++){
        h_idata[2*i] = 0; h_idata[(2*i)+1] = data;
        h_idata[64+(2*i)] = data; h_idata[65+(2*i)] = 0xFFFFFFFFU;
        data<<=1; data++;}
      rc = transpose_and_verify(h_idata, h_odata, idim, NULL, NULL, NULL);
      if (rc == 0) print_bit_matrix(h_odata, idim);
      free(h_idata);
      free(h_odata);
      if (rc != 0) return rc;
    // test case 3, 96x96, ones in alternating columns
      idim = 96;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      data = 0x55555555U;
      for (size_t i = 0; i < (size_t)idim*(idim/32); i++)
        h_idata[i] = data;
      rc = transpose_and_verify(h_idata, h_odata, idim, NULL, NULL, NULL);
      if (rc == 0) print_bit_matrix(h_odata, idim);
      free(h_idata);
      free(h_odata);
      if (rc != 0) return rc;
    // test case 4, 8kx8k random
      idim = 8192;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      for (size_t i = 0; i < (size_t)idim*(idim/32); i++)
        h_idata[i] = rand();
      unsigned long gt = 0, gkt = 0, ct = 0;
      rc = transpose_and_verify(h_idata, h_odata, idim, &gt, &gkt, &ct);
      if (rc == 0) {
        printf("gputime: %fms, kerneltime: %fms, cputime: %fms\n", (gt*1000)/(float)USECPSEC, (gkt*1000)/(float)USECPSEC, (ct*1000)/(float)USECPSEC);
        // bytes read + bytes written, divided by microseconds, i.e. MB/s
        printf("kernel bandwidth = %fMB/s\n", (idim*(idim/32)*4*2)/(float)gkt);
        printf("Success!\n");
      }
      free(h_idata);
      free(h_odata);

      return rc;
    }
    $ nvcc -arch=sm_61 -o t435 t435.cu
    $ ./t435
    0x80000000
    0xc0000000
    0xe0000000
    0xf0000000
    0xf8000000
    0xfc000000
    0xfe000000
    0xff000000
    0xff800000
    0xffc00000
    0xffe00000
    0xfff00000
    0xfff80000
    0xfffc0000
    0xfffe0000
    0xffff0000
    0xffff8000
    0xffffc000
    0xffffe000
    0xfffff000
    0xfffff800
    0xfffffc00
    0xfffffe00
    0xffffff00
    0xffffff80
    0xffffffc0
    0xffffffe0
    0xfffffff0
    0xfffffff8
    0xfffffffc
    0xfffffffe
    0xffffffff
    0x       0 0x       1
    0x       0 0x       3
    0x       0 0x       7
    0x       0 0x       f
    0x       0 0x      1f
    0x       0 0x      3f
    0x       0 0x      7f
    0x       0 0x      ff
    0x       0 0x     1ff
    0x       0 0x     3ff
    0x       0 0x     7ff
    0x       0 0x     fff
    0x       0 0x    1fff
    0x       0 0x    3fff
    0x       0 0x    7fff
    0x       0 0x    ffff
    0x       0 0x   1ffff
    0x       0 0x   3ffff
    0x       0 0x   7ffff
    0x       0 0x   fffff
    0x       0 0x  1fffff
    0x       0 0x  3fffff
    0x       0 0x  7fffff
    0x       0 0x  ffffff
    0x       0 0x 1ffffff
    0x       0 0x 3ffffff
    0x       0 0x 7ffffff
    0x       0 0x fffffff
    0x       0 0x1fffffff
    0x       0 0x3fffffff
    0x       0 0x7fffffff
    0x       0 0xffffffff
    0x       1 0xffffffff
    0x       3 0xffffffff
    0x       7 0xffffffff
    0x       f 0xffffffff
    0x      1f 0xffffffff
    0x      3f 0xffffffff
    0x      7f 0xffffffff
    0x      ff 0xffffffff
    0x     1ff 0xffffffff
    0x     3ff 0xffffffff
    0x     7ff 0xffffffff
    0x     fff 0xffffffff
    0x    1fff 0xffffffff
    0x    3fff 0xffffffff
    0x    7fff 0xffffffff
    0x    ffff 0xffffffff
    0x   1ffff 0xffffffff
    0x   3ffff 0xffffffff
    0x   7ffff 0xffffffff
    0x   fffff 0xffffffff
    0x  1fffff 0xffffffff
    0x  3fffff 0xffffffff
    0x  7fffff 0xffffffff
    0x  ffffff 0xffffffff
    0x 1ffffff 0xffffffff
    0x 3ffffff 0xffffffff
    0x 7ffffff 0xffffffff
    0x fffffff 0xffffffff
    0x1fffffff 0xffffffff
    0x3fffffff 0xffffffff
    0x7fffffff 0xffffffff
    0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    gputime: 5.530000ms, kerneltime: 0.301000ms, cputime: 659.205994ms
    kernel bandwidth = 55738.257812MB/s
    Success!
    $
    
    $cat t435.cu
    #包括
    #包括
    #定义IDX(d,x,y,ld)d[y*ld+x]
    #定义cudaCheckErrors(msg)\
    做{\
    cudaError\u t\u err=cudaGetLastError()\
    如果(_err!=cudaSuccess){\
    fprintf(标准,“致命错误:%s(%s位于%s:%d)\n”\
    msg,cudaGetErrorString(_err)\
    __文件(行)\
    fprintf(stderr,“***失败-中止\n”)\
    出口(1)\
    } \
    }而(0)
    #包括
    #包括
    #定义USECPSEC 10000000ull
    无符号长dtime\u usec(无符号长启动){
    蒂梅瓦尔电视;
    gettimeofday(&tv,0);
    返回((tv.tv_sec*USECPSEC)+tv.tv_usec)-开始;
    }
    __全局无效bt(常量无符号*\uuuu限制\uuuuu输入,无符号*\uu限制\uuuu输出,常量无符号idim){
    //假设“平方”二进制矩阵转置
    //假设内核启动时每个块有1024个线程,2D,32x32
    //假设输入矩阵维数idim可被32位整除
    //假设idim是以位为单位指定的
    __共享的_uuu未签名的smem[32][33];
    int idx=threadIdx.x+blockDim.x*blockIdx.x;
    int-idy=threadIdx.y+blockDim.y*blockIdx.y;
    smem[threadIdx.x][threadIdx.y]=((idxsmem[0][threadIdx.y]=\uuuuuuu ballot\u sync(0xFFFFFFFFU,((1U这里是一种可能的方法,使用@tera建议的
    __ballot()

  • 以合并的方式,将数据按分片加载到共享内存中
  • 在加载过程中,在共享内存中进行初始转置,将列排列成行,以便于后续的 __ballot() 操作
  • 执行投票,从列数据(已存储在行中)中构建“最终”行数据
  • 把数据写出来
  • 步骤 1-3 在 GPU 上应该相当高效,至少据我所知是这样。步骤 4 则受制于这样一个事实:对位块做逐位转置后,得到的一组 32 位量在内存中并不相邻。因此,在一般情况下,写回全局内存时的访问模式是分散的

    输出索引的计算是其中最难直观想象和安排的部分

    下面是一个包含4个测试用例的完整示例。对于这些测试用例,我还编写了一个朴素的CPU函数来执行“黄金”测试,借用了下面的问题

    $ cat t435.cu
    #include <stdio.h>
    #include <stdlib.h>
    
    // Element (x, y) of the row-major array d whose rows are ld elements long.
    // Every argument is parenthesized so that expression arguments (for example
    // ld = idim/32) expand with the intended precedence instead of being merged
    // into the surrounding multiplication.
    #define IDX(d,x,y,ld) ((d)[(y)*(ld)+(x)])
    
    // Reads (and clears) the CUDA runtime's last recorded error via
    // cudaGetLastError(). If it is anything other than cudaSuccess, prints msg,
    // the runtime's error string and the file/line of the macro use to stderr,
    // then terminates the process with exit status 1.
    // Intended to be placed directly after the CUDA API call or kernel launch
    // that should be checked. The do { ... } while (0) wrapper makes the macro
    // behave as a single statement.
    #define cudaCheckErrors(msg) \
      do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
      } while (0)
    
    
    #include <time.h>
    #include <sys/time.h>
    #define USECPSEC 1000000ULL

    // Wall-clock helper built on gettimeofday().
    // Returns the current time in microseconds minus `start`; call it with 0 to
    // take a timestamp, then pass that timestamp back in to get the elapsed time.
    unsigned long dtime_usec(unsigned long start){
      struct timeval now;
      gettimeofday(&now, 0);
      const unsigned long long stamp_usec = (now.tv_sec*USECPSEC)+now.tv_usec;
      return stamp_usec-start;
    }
    // Bit-level transpose of a square binary matrix stored row-major, 32 bits per
    // unsigned word, most significant bit first within each word.
    //
    //   in   - input matrix, idim rows of idim/32 words each
    //   out  - output (transposed) matrix, same layout and size as in
    //   idim - matrix dimension in bits
    //
    // assumes "square" binary matrix transpose
    // assumes kernel is launched with 1024 threads per block, 2D, 32x32
    // assumes input matrix dimension idim is evenly divisible by 32 bits
    // assumes idim is specified in bits
    //
    // Each block handles a tile of 32 words (x) by 32 rows (y). The grid must
    // cover idim/32 words in x and idim rows in y; surplus blocks are harmless.
    __global__ void bt(const unsigned * __restrict__ in, unsigned * __restrict__ out, const unsigned idim){

      // 33 columns: only 0..31 are indexed, the extra one is the usual padding
      // against shared-memory bank conflicts on column-wise access.
      __shared__ unsigned smem[32][33];
      const unsigned wpr = idim/32;                               // words per matrix row
      const unsigned idx = threadIdx.x+blockDim.x*blockIdx.x;     // word column
      const unsigned idy = threadIdx.y+blockDim.y*blockIdx.y;     // bit row
      const bool in_range = (idx < wpr)&&(idy < idim);

      // Load the tile transposed at word granularity; out-of-range threads supply
      // zeros. The index is formed in size_t so idy*wpr cannot wrap in 32 bits.
      smem[threadIdx.x][threadIdx.y] = in_range ? in[(size_t)idy*wpr+idx] : 0;
      __syncthreads();
      // Lane x of warp y picks up the word from tile column y, tile row 31-x, so
      // that ballot bit 31 corresponds to tile row 0 (MSB-first output words).
      const unsigned myval = smem[threadIdx.y][31-threadIdx.x];
      __syncthreads();   // every thread has read its word before smem is overwritten
      // One warp vote per bit position: row b of smem receives the transposed word
      // built from bit (31-b) of the 32 words held by this warp. All lanes store
      // the same ballot result to the same location.
      #pragma unroll
      for (int b = 0; b < 32; b++)
        smem[b][threadIdx.y] = __ballot_sync(0xFFFFFFFFU, ((1U<<(31-b))&myval));
      __syncthreads();
      // Output row is idx*32 + threadIdx.y, output word column is blockIdx.y.
      // The row stride comes from idim rather than gridDim.y, so the result does
      // not depend on the grid being exactly idim/32 blocks tall.
      const size_t indx = ((size_t)idx*idim)+((size_t)wpr*threadIdx.y)+blockIdx.y;
      if (in_range) out[indx] = smem[threadIdx.y][threadIdx.x];
    }
    
    // Reference (CPU) bit-level transpose of a square binary matrix.
    //
    //   in   - input matrix: idim rows, each idim/32 unsigned words, MSB-first
    //   out  - receives the transposed matrix in the same layout; fully overwritten
    //   idim - matrix dimension in bits (expected to be a multiple of 32)
    //
    // For every set bit at (row j, column c) of the input, sets bit (row c,
    // column j) of the output.
    void naive_cpu_bt(const unsigned *in, unsigned *out, const unsigned idim){
      const size_t wpr = idim/32;                       // words per row
      memset(out, 0, (size_t)idim*wpr*sizeof(unsigned));
      for (size_t j = 0; j < idim; j++){                // input row
        // Output bit for this input row; unsigned literal so that shifting into
        // bit 31 is well defined.
        const unsigned rowmask = 1U<<(31 - (j%32));
        for (size_t i = 0; i < wpr; i++){               // input word within the row
          const unsigned word = in[(j*wpr) + i];
          for (unsigned bit = 0; bit < 32; bit++){
            // Bit `bit` of word i is column i*32 + (31-bit) in MSB-first order.
            if ((word>>bit)&1U) out[(((i*32)+(31-bit))*wpr)+(j/32)] |= rowmask;
          }
        }
      }
    }
    
    // Allocates host storage for an idim x idim bit matrix (idim/32 words per
    // row). Prints a message and exits if the allocation fails.
    static unsigned *alloc_bit_matrix(unsigned idim){
      unsigned *p = (unsigned *)malloc((size_t)idim*(idim/32)*sizeof(unsigned));
      if (p == NULL) {
        fprintf(stderr, "host allocation failed for idim = %u\n", idim);
        exit(1);
      }
      return p;
    }

    // Prints the matrix one row per line, every word as "0x%8x", words separated
    // by a single space.
    static void print_bit_matrix(const unsigned *m, unsigned idim){
      const unsigned wpr = idim/32;
      for (unsigned r = 0; r < idim; r++)
        for (unsigned w = 0; w < wpr; w++)
          printf("0x%8x%c", m[(size_t)r*wpr + w], (w+1 < wpr) ? ' ' : '\n');
    }

    // Transposes h_idata on the GPU into h_odata and compares the result with
    // naive_cpu_bt(). Every CUDA call and the kernel launch are checked with
    // cudaCheckErrors, which aborts the process on failure.
    //
    //   gpu_us    - if non-NULL, receives H2D copy + kernel + D2H copy time (usec)
    //   kernel_us - if non-NULL, receives kernel time including the sync (usec)
    //   cpu_us    - if non-NULL, receives the CPU reference time (usec)
    //
    // Returns 0 when the results match, -1 after reporting the first mismatch.
    static int transpose_and_verify(const unsigned *h_idata, unsigned *h_odata, unsigned idim,
                                    unsigned long *gpu_us, unsigned long *kernel_us, unsigned long *cpu_us){
      const size_t words = (size_t)idim*(idim/32);
      const size_t bytes = words*sizeof(unsigned);
      unsigned *c_odata = alloc_bit_matrix(idim);
      unsigned *d_idata = NULL, *d_odata = NULL;
      cudaMalloc(&d_idata, bytes);
      cudaCheckErrors("cudaMalloc of d_idata failed");
      cudaMalloc(&d_odata, bytes);
      cudaCheckErrors("cudaMalloc of d_odata failed");

      // One block covers 32 words (1024 bit columns) by 32 rows.
      const dim3 block(32,32);
      const dim3 grid((idim/1024)+((idim%1024)?1:0), idim/32);

      unsigned long gt = dtime_usec(0);
      cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice);
      cudaCheckErrors("host to device copy failed");
      unsigned long gkt = dtime_usec(0);
      bt<<<grid,block>>>(d_idata, d_odata, idim);
      cudaCheckErrors("bt kernel launch failed");
      cudaDeviceSynchronize();
      cudaCheckErrors("bt kernel execution failed");
      gkt = dtime_usec(gkt);
      cudaMemcpy(h_odata, d_odata, bytes, cudaMemcpyDeviceToHost);
      cudaCheckErrors("device to host copy failed");
      gt = dtime_usec(gt);
      cudaFree(d_idata);
      cudaFree(d_odata);
      cudaCheckErrors("cudaFree failed");

      unsigned long ct = dtime_usec(0);
      naive_cpu_bt(h_idata, c_odata, idim);
      ct = dtime_usec(ct);

      int rc = 0;
      for (size_t i = 0; i < words; i++)
        if (c_odata[i] != h_odata[i]) {
          printf("mismatch at %u, was: %u, should be: %u\n", (unsigned)i, h_odata[i], c_odata[i]);
          rc = -1;
          break;
        }
      free(c_odata);
      if (gpu_us) *gpu_us = gt;
      if (kernel_us) *kernel_us = gkt;
      if (cpu_us) *cpu_us = ct;
      return rc;
    }

    // Runs four transpose test cases (32x32, 64x64, 96x96 with fixed patterns and
    // 8192x8192 random), verifying each GPU result against the CPU reference.
    // The three small results are printed; the large case prints timings.
    // Returns 0 on success, -1 on the first mismatch.
    int main(){

      unsigned *h_idata, *h_odata;
      unsigned idim, data;
      int rc;
    // test case 1, 32x32, upper triangular
      idim = 32;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      data = 0x0FFFFFFFFU;
      for (int i = 0; i < 32; i++){
        h_idata[i] = data;
        data>>=1;}
      rc = transpose_and_verify(h_idata, h_odata, idim, NULL, NULL, NULL);
      if (rc == 0) print_bit_matrix(h_odata, idim);
      free(h_idata);
      free(h_odata);
      if (rc != 0) return rc;
    // test case 2, 64x64, opposite diagonal
      idim = 64;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      data = 0x01;
      for (int i = 0; i < 32; i++){
        h_idata[2*i] = 0; h_idata[(2*i)+1] = data;
        h_idata[64+(2*i)] = data; h_idata[65+(2*i)] = 0xFFFFFFFFU;
        data<<=1; data++;}
      rc = transpose_and_verify(h_idata, h_odata, idim, NULL, NULL, NULL);
      if (rc == 0) print_bit_matrix(h_odata, idim);
      free(h_idata);
      free(h_odata);
      if (rc != 0) return rc;
    // test case 3, 96x96, ones in alternating columns
      idim = 96;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      data = 0x55555555U;
      for (size_t i = 0; i < (size_t)idim*(idim/32); i++)
        h_idata[i] = data;
      rc = transpose_and_verify(h_idata, h_odata, idim, NULL, NULL, NULL);
      if (rc == 0) print_bit_matrix(h_odata, idim);
      free(h_idata);
      free(h_odata);
      if (rc != 0) return rc;
    // test case 4, 8kx8k random
      idim = 8192;
      h_idata = alloc_bit_matrix(idim);
      h_odata = alloc_bit_matrix(idim);
      for (size_t i = 0; i < (size_t)idim*(idim/32); i++)
        h_idata[i] = rand();
      unsigned long gt = 0, gkt = 0, ct = 0;
      rc = transpose_and_verify(h_idata, h_odata, idim, &gt, &gkt, &ct);
      if (rc == 0) {
        printf("gputime: %fms, kerneltime: %fms, cputime: %fms\n", (gt*1000)/(float)USECPSEC, (gkt*1000)/(float)USECPSEC, (ct*1000)/(float)USECPSEC);
        // bytes read + bytes written, divided by microseconds, i.e. MB/s
        printf("kernel bandwidth = %fMB/s\n", (idim*(idim/32)*4*2)/(float)gkt);
        printf("Success!\n");
      }
      free(h_idata);
      free(h_odata);

      return rc;
    }
    $ nvcc -arch=sm_61 -o t435 t435.cu
    $ ./t435
    0x80000000
    0xc0000000
    0xe0000000
    0xf0000000
    0xf8000000
    0xfc000000
    0xfe000000
    0xff000000
    0xff800000
    0xffc00000
    0xffe00000
    0xfff00000
    0xfff80000
    0xfffc0000
    0xfffe0000
    0xffff0000
    0xffff8000
    0xffffc000
    0xffffe000
    0xfffff000
    0xfffff800
    0xfffffc00
    0xfffffe00
    0xffffff00
    0xffffff80
    0xffffffc0
    0xffffffe0
    0xfffffff0
    0xfffffff8
    0xfffffffc
    0xfffffffe
    0xffffffff
    0x       0 0x       1
    0x       0 0x       3
    0x       0 0x       7
    0x       0 0x       f
    0x       0 0x      1f
    0x       0 0x      3f
    0x       0 0x      7f
    0x       0 0x      ff
    0x       0 0x     1ff
    0x       0 0x     3ff
    0x       0 0x     7ff
    0x       0 0x     fff
    0x       0 0x    1fff
    0x       0 0x    3fff
    0x       0 0x    7fff
    0x       0 0x    ffff
    0x       0 0x   1ffff
    0x       0 0x   3ffff
    0x       0 0x   7ffff
    0x       0 0x   fffff
    0x       0 0x  1fffff
    0x       0 0x  3fffff
    0x       0 0x  7fffff
    0x       0 0x  ffffff
    0x       0 0x 1ffffff
    0x       0 0x 3ffffff
    0x       0 0x 7ffffff
    0x       0 0x fffffff
    0x       0 0x1fffffff
    0x       0 0x3fffffff
    0x       0 0x7fffffff
    0x       0 0xffffffff
    0x       1 0xffffffff
    0x       3 0xffffffff
    0x       7 0xffffffff
    0x       f 0xffffffff
    0x      1f 0xffffffff
    0x      3f 0xffffffff
    0x      7f 0xffffffff
    0x      ff 0xffffffff
    0x     1ff 0xffffffff
    0x     3ff 0xffffffff
    0x     7ff 0xffffffff
    0x     fff 0xffffffff
    0x    1fff 0xffffffff
    0x    3fff 0xffffffff
    0x    7fff 0xffffffff
    0x    ffff 0xffffffff
    0x   1ffff 0xffffffff
    0x   3ffff 0xffffffff
    0x   7ffff 0xffffffff
    0x   fffff 0xffffffff
    0x  1fffff 0xffffffff
    0x  3fffff 0xffffffff
    0x  7fffff 0xffffffff
    0x  ffffff 0xffffffff
    0x 1ffffff 0xffffffff
    0x 3ffffff 0xffffffff
    0x 7ffffff 0xffffffff
    0x fffffff 0xffffffff
    0x1fffffff 0xffffffff
    0x3fffffff 0xffffffff
    0x7fffffff 0xffffffff
    0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    0x       0 0x       0 0x       0
    0xffffffff 0xffffffff 0xffffffff
    gputime: 5.530000ms, kerneltime: 0.301000ms, cputime: 659.205994ms
    kernel bandwidth = 55738.257812MB/s
    Success!
    $
    
    $cat t435.cu
    #包括
    #包括
    #定义IDX(d,x,y,ld)d[y*ld+x]
    #定义cudaCheckErrors(msg)\
    做{\
    cudaError\u t\u err=cudaGetLastError()\
    如果(_err!=cudaSuccess){\
    fprintf(标准,“致命错误:%s(%s位于%s:%d)\n”\
    msg,cudaGetErrorString(_err)\
    __文件(行)\
    fprintf(stderr,“***失败-中止\n”)\
    出口(1)\
    } \
    }而(0)
    #包括
    #包括
    #定义USECPSEC 10000000ull
    无符号长dtime\u usec(无符号长启动){
    蒂梅瓦尔电视;
    gettimeofday(&tv,0);
    返回((tv.tv_sec*USECPSEC)+tv.tv_usec)-开始;
    }
    __全局无效bt(常量无符号*\uuuu限制\uuuuu输入,无符号*\uu限制\uuuu输出,常量无符号idim){
    //假设“平方”二进制矩阵转置
    //假设内核启动时每个块有1024个线程,2D,32x32
    //假设输入矩阵维数idim可被32位整除
    //假设idim是以位为单位指定的
    __共享的_uuu未签名的smem[32][33];
    int idx=threadIdx.x+blockDim.x*blockIdx.x;
    int-idy=threadIdx.y+blockDim.y*blockIdx.y;
    smem[threadIdx.x][threadIdx.y]=((idxsmem[0][threadIdx.y]=\uuuuu选票\uu同步(0xFFFFFFFFU((1如果转置是针对正方形矩阵,那么N*N是可以的,但是对于一般的
    N*M
    矩阵,最好假设
    N!=M
    ,并在它们不同的地方使用示例-它们更容易理解。您的示例是
    N*N
    ,但这是针对正方形矩阵还是矩形矩阵?您展示的代码没有转置任何东西.不确定你是把它作为转置的例子还是矩阵设置的例子。好