CUDA: Is there a way to use CUB::BlockScan on oddly-sized data arrays?

All of the examples perform a scan on arrays whose size is a multiple of 32. The fastest examples use 256 or more threads, with 4 or more elements assigned to each thread.

This means that if I have an array of size 450, I would presumably have to pad it to 512 and run 256 threads with 2 elements assigned to each thread.

However, in my particular case, having to pad every array is not feasible.

Is there another solution for handling a number of oddly-sized arrays? Is there some way to specify a width?


OK, let me be a little clearer. Here is a simplified example. Suppose I have 2 arrays, where one array is just a list of integer offsets into the second array, which holds the data. The offsets mark the start of each separate group of data.

The size of each group of data is random. I receive the data as blocks from other processes, so there is no easy way to pad them. I want to run a BlockScan for each of the offsets from the same kernel.
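
To make the goal concrete, the result I am after is an independent inclusive prefix sum within each group. A minimal CPU reference of that (names and values here are purely illustrative):

    #include <vector>
    #include <iostream>

    // CPU reference of the desired result: an independent inclusive prefix sum
    // over each group of data[], where offsets[i] marks where group i begins.
    std::vector<int> segmented_scan(const std::vector<int> &data,
                                    const std::vector<int> &offsets)
    {
        std::vector<int> out(data.size());
        for (size_t s = 0; s < offsets.size(); s++) {
            size_t begin = offsets[s];
            size_t end   = (s + 1 < offsets.size()) ? (size_t)offsets[s + 1] : data.size();
            int sum = 0;
            for (size_t i = begin; i < end; i++) {
                sum += data[i];
                out[i] = sum;            // inclusive: each element includes its own value
            }
        }
        return out;
    }

    int main(){
        std::vector<int> data = {1, 2, 3, 4, 5, 6, 7};
        std::vector<int> offsets = {0, 3, 5};          // group starts
        for (int v : segmented_scan(data, offsets)) std::cout << v << " ";
        std::cout << std::endl;                        // prints: 1 3 6 4 9 6 13
    }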

Let your index (offset) array be idx[]. Let the data array be A[] and the scan result be B[].

  • Scan the entire array A[], storing the output in B[]

  • For each segment start idx[i] (with i > 0), take the value at index idx[i]-1 in B[], subtract from it the value at index idx[i-1]-1 in B[] (use zero when there is no previous segment), and then subtract that difference from the element at the same index idx[i] (not -1) in A[]

  • Re-scan A[] into B[]

  • A simple worked example:

    idx: 0 2 5
    
    0:  1  1  1  1  1  1  1  1
    1:  1  2  3  4  5  6  7  8
    2:  1  1 -1  1  1 -2  1  1
    3:  1  2  1  2  3  1  2  3
    
    In the example above, the -1 in step 2 is computed as the scan value at index (2-1) from step 1 minus the scan value at index (0-1) from step 1 (taken to be zero), and that difference is then subtracted from the original data value. The -2 in step 2 is computed as the scan value at index (5-1) from step 1 minus the scan value at index (2-1) from step 1, again subtracted from the original data value. (A host-side sketch of these three steps follows.)
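
    To make the arithmetic concrete, here is a minimal host-side mirror of the same three steps in plain C++ (a sketch of the trick only, not the CUDA code below; array names follow the description above):

    #include <vector>
    #include <iostream>

    int main(){
        std::vector<int> A(8, 1), B(8);
        std::vector<int> idx = {0, 2, 5};

        auto scan = [&](){                             // inclusive prefix sum of A into B
            int sum = 0;
            for (size_t i = 0; i < A.size(); i++){ sum += A[i]; B[i] = sum; }
        };

        scan();                                        // step 1: B = 1 2 3 4 5 6 7 8
        for (size_t i = 1; i < idx.size(); i++){       // step 2: remove the previous segment's sum
            int prev = (i == 1) ? 0 : B[idx[i-1] - 1];
            A[idx[i]] -= (B[idx[i] - 1] - prev);       // A becomes 1 1 -1 1 1 -2 1 1
        }
        scan();                                        // step 3: B = 1 2 1 2 3 1 2 3
        for (int v : B) std::cout << v << " ";
        std::cout << std::endl;
    }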

    Here is a full worked example:

    $ cat t453.cu
    #include <cub/cub.cuh>
    #include <iostream>
    
    template <int TPB, int IPT, typename T>
    __global__ void k(T *data, int *idx, int n){
    
        // Specialize BlockScan for a 1D block of TPB threads on type T
        __shared__ T sdata[TPB*IPT*2];
        sdata[threadIdx.x*IPT] = 1;
        __syncthreads();
        typedef cub::BlockScan<T, TPB> BlockScan;
        // Allocate shared memory for BlockScan
        __shared__ typename BlockScan::TempStorage temp_storage;
        // Obtain a segment of consecutive items that are blocked across threads
        int thread_data[IPT];
        thread_data[0] = sdata[threadIdx.x*IPT];
        // Collectively compute the block-wide inclusive prefix sum
        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
        __syncthreads();
        sdata[IPT*(threadIdx.x+TPB)] = thread_data[0];
        if ((threadIdx.x < n) && (threadIdx.x > 0)) // assumes the first element of idx points to 0
          sdata[idx[threadIdx.x]*IPT] -= (sdata[((idx[threadIdx.x]-1)+TPB)*IPT] - ((threadIdx.x == 1)?0:sdata[((idx[threadIdx.x-1]-1)+TPB)*IPT]));
        __syncthreads();
        thread_data[0] = sdata[threadIdx.x*IPT];
        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
        __syncthreads();
        data[threadIdx.x] = thread_data[0];
    }
    
    typedef int dtype;
    const int nTPB = 256;
    
    int main(){
      int h_idx[] = {0, 4, 7, 32, 55, 99, 104, 200};
      int n = sizeof(h_idx)/sizeof(h_idx[0]);
      std::cout << "n = " << n << std::endl;
      int *d_idx;
      cudaMalloc(&d_idx, n*sizeof(d_idx[0]));
      cudaMemcpy(d_idx, h_idx, n*sizeof(h_idx[0]), cudaMemcpyHostToDevice);
      dtype *h_data, *d_data;
      h_data = new dtype[nTPB];
      cudaMalloc(&d_data, nTPB*sizeof(dtype));
      k<nTPB, 1><<<1,nTPB>>>(d_data, d_idx, n);
      cudaMemcpy(h_data, d_data, nTPB*sizeof(dtype), cudaMemcpyDeviceToHost);
      dtype sum;
      int idx = 0;
      for (int i = 0; i < nTPB; i++){
        if (i == h_idx[idx]) {sum = 0; idx++;}
        sum++;
        std::cout << "gpu: " << h_data[i] << " cpu: " << sum << std::endl;
      }
    }
    $ nvcc -o t453 t453.cu
    $ cuda-memcheck ./t453
    ========= CUDA-MEMCHECK
    n = 8
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 26 cpu: 26
    gpu: 27 cpu: 27
    gpu: 28 cpu: 28
    gpu: 29 cpu: 29
    gpu: 30 cpu: 30
    gpu: 31 cpu: 31
    gpu: 32 cpu: 32
    gpu: 33 cpu: 33
    gpu: 34 cpu: 34
    gpu: 35 cpu: 35
    gpu: 36 cpu: 36
    gpu: 37 cpu: 37
    gpu: 38 cpu: 38
    gpu: 39 cpu: 39
    gpu: 40 cpu: 40
    gpu: 41 cpu: 41
    gpu: 42 cpu: 42
    gpu: 43 cpu: 43
    gpu: 44 cpu: 44
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 26 cpu: 26
    gpu: 27 cpu: 27
    gpu: 28 cpu: 28
    gpu: 29 cpu: 29
    gpu: 30 cpu: 30
    gpu: 31 cpu: 31
    gpu: 32 cpu: 32
    gpu: 33 cpu: 33
    gpu: 34 cpu: 34
    gpu: 35 cpu: 35
    gpu: 36 cpu: 36
    gpu: 37 cpu: 37
    gpu: 38 cpu: 38
    gpu: 39 cpu: 39
    gpu: 40 cpu: 40
    gpu: 41 cpu: 41
    gpu: 42 cpu: 42
    gpu: 43 cpu: 43
    gpu: 44 cpu: 44
    gpu: 45 cpu: 45
    gpu: 46 cpu: 46
    gpu: 47 cpu: 47
    gpu: 48 cpu: 48
    gpu: 49 cpu: 49
    gpu: 50 cpu: 50
    gpu: 51 cpu: 51
    gpu: 52 cpu: 52
    gpu: 53 cpu: 53
    gpu: 54 cpu: 54
    gpu: 55 cpu: 55
    gpu: 56 cpu: 56
    gpu: 57 cpu: 57
    gpu: 58 cpu: 58
    gpu: 59 cpu: 59
    gpu: 60 cpu: 60
    gpu: 61 cpu: 61
    gpu: 62 cpu: 62
    gpu: 63 cpu: 63
    gpu: 64 cpu: 64
    gpu: 65 cpu: 65
    gpu: 66 cpu: 66
    gpu: 67 cpu: 67
    gpu: 68 cpu: 68
    gpu: 69 cpu: 69
    gpu: 70 cpu: 70
    gpu: 71 cpu: 71
    gpu: 72 cpu: 72
    gpu: 73 cpu: 73
    gpu: 74 cpu: 74
    gpu: 75 cpu: 75
    gpu: 76 cpu: 76
    gpu: 77 cpu: 77
    gpu: 78 cpu: 78
    gpu: 79 cpu: 79
    gpu: 80 cpu: 80
    gpu: 81 cpu: 81
    gpu: 82 cpu: 82
    gpu: 83 cpu: 83
    gpu: 84 cpu: 84
    gpu: 85 cpu: 85
    gpu: 86 cpu: 86
    gpu: 87 cpu: 87
    gpu: 88 cpu: 88
    gpu: 89 cpu: 89
    gpu: 90 cpu: 90
    gpu: 91 cpu: 91
    gpu: 92 cpu: 92
    gpu: 93 cpu: 93
    gpu: 94 cpu: 94
    gpu: 95 cpu: 95
    gpu: 96 cpu: 96
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 26 cpu: 26
    gpu: 27 cpu: 27
    gpu: 28 cpu: 28
    gpu: 29 cpu: 29
    gpu: 30 cpu: 30
    gpu: 31 cpu: 31
    gpu: 32 cpu: 32
    gpu: 33 cpu: 33
    gpu: 34 cpu: 34
    gpu: 35 cpu: 35
    gpu: 36 cpu: 36
    gpu: 37 cpu: 37
    gpu: 38 cpu: 38
    gpu: 39 cpu: 39
    gpu: 40 cpu: 40
    gpu: 41 cpu: 41
    gpu: 42 cpu: 42
    gpu: 43 cpu: 43
    gpu: 44 cpu: 44
    gpu: 45 cpu: 45
    gpu: 46 cpu: 46
    gpu: 47 cpu: 47
    gpu: 48 cpu: 48
    gpu: 49 cpu: 49
    gpu: 50 cpu: 50
    gpu: 51 cpu: 51
    gpu: 52 cpu: 52
    gpu: 53 cpu: 53
    gpu: 54 cpu: 54
    gpu: 55 cpu: 55
    gpu: 56 cpu: 56
    ========= ERROR SUMMARY: 0 errors
    $
    
    A different possible approach, shown in the second listing below, is to assign one threadblock per segment: launch one block per segment and let each block run an ordinary BlockScan over just its own segment, guarded by the segment boundaries.

    $ cat t455.cu
    #include <cub/cub.cuh>
    #include <iostream>
    
    template <int TPB, int IPT, typename T>
    __global__ void k(T *data, int *idx){
        int lidx = threadIdx.x;
        // Specialize BlockScan for a 1D block of TPB threads on type T
        typedef cub::BlockScan<T, TPB> BlockScan;
        // Allocate shared memory for BlockScan
        __shared__ typename BlockScan::TempStorage temp_storage;
        // Obtain a segment of consecutive items that are blocked across threads
        int thread_data[IPT];
        thread_data[0] = ((lidx+idx[blockIdx.x])>=idx[blockIdx.x+1])?0:data[lidx+idx[blockIdx.x]];
        // Collectively compute the block-wide inclusive prefix sum
        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
        __syncthreads();
        if ((lidx+idx[blockIdx.x]) < idx[blockIdx.x+1])
          data[lidx+idx[blockIdx.x]] = thread_data[0];
    }
    
    typedef int dtype;
    const int nTPB = 128; // sized with IPT to handle the largest segment
    const int DS = 256;
    int main(){
      int h_idx[] = {0, 4, 7, 32, 55, 99, 104, 200, 256};
      int n = sizeof(h_idx)/sizeof(h_idx[0]);
      std::cout << "n = " << n << std::endl;
      int *d_idx;
      cudaMalloc(&d_idx, n*sizeof(d_idx[0]));
      cudaMemcpy(d_idx, h_idx, n*sizeof(h_idx[0]), cudaMemcpyHostToDevice);
      dtype *h_data, *d_data;
      h_data = new dtype[DS];
      for (int i = 0; i < DS; i++) h_data[i] = 1;
      cudaMalloc(&d_data, DS*sizeof(dtype));
      cudaMemcpy(d_data, h_data, DS*sizeof(h_data[0]), cudaMemcpyHostToDevice);
      k<nTPB, 1><<<n-1,nTPB>>>(d_data, d_idx);
      cudaMemcpy(h_data, d_data, DS*sizeof(dtype), cudaMemcpyDeviceToHost);
      dtype sum;
      int idx = 0;
      for (int i = 0; i < DS; i++){
        if (i == h_idx[idx]) {sum = 0; idx++;}
        sum++;
        std::cout << "gpu: " << h_data[i] << " cpu: " << sum << std::endl;
      }
    }
    $ nvcc -o t455 t455.cu
    $ cuda-memcheck ./t455
    ========= CUDA-MEMCHECK
    n = 9
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 26 cpu: 26
    gpu: 27 cpu: 27
    gpu: 28 cpu: 28
    gpu: 29 cpu: 29
    gpu: 30 cpu: 30
    gpu: 31 cpu: 31
    gpu: 32 cpu: 32
    gpu: 33 cpu: 33
    gpu: 34 cpu: 34
    gpu: 35 cpu: 35
    gpu: 36 cpu: 36
    gpu: 37 cpu: 37
    gpu: 38 cpu: 38
    gpu: 39 cpu: 39
    gpu: 40 cpu: 40
    gpu: 41 cpu: 41
    gpu: 42 cpu: 42
    gpu: 43 cpu: 43
    gpu: 44 cpu: 44
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 26 cpu: 26
    gpu: 27 cpu: 27
    gpu: 28 cpu: 28
    gpu: 29 cpu: 29
    gpu: 30 cpu: 30
    gpu: 31 cpu: 31
    gpu: 32 cpu: 32
    gpu: 33 cpu: 33
    gpu: 34 cpu: 34
    gpu: 35 cpu: 35
    gpu: 36 cpu: 36
    gpu: 37 cpu: 37
    gpu: 38 cpu: 38
    gpu: 39 cpu: 39
    gpu: 40 cpu: 40
    gpu: 41 cpu: 41
    gpu: 42 cpu: 42
    gpu: 43 cpu: 43
    gpu: 44 cpu: 44
    gpu: 45 cpu: 45
    gpu: 46 cpu: 46
    gpu: 47 cpu: 47
    gpu: 48 cpu: 48
    gpu: 49 cpu: 49
    gpu: 50 cpu: 50
    gpu: 51 cpu: 51
    gpu: 52 cpu: 52
    gpu: 53 cpu: 53
    gpu: 54 cpu: 54
    gpu: 55 cpu: 55
    gpu: 56 cpu: 56
    gpu: 57 cpu: 57
    gpu: 58 cpu: 58
    gpu: 59 cpu: 59
    gpu: 60 cpu: 60
    gpu: 61 cpu: 61
    gpu: 62 cpu: 62
    gpu: 63 cpu: 63
    gpu: 64 cpu: 64
    gpu: 65 cpu: 65
    gpu: 66 cpu: 66
    gpu: 67 cpu: 67
    gpu: 68 cpu: 68
    gpu: 69 cpu: 69
    gpu: 70 cpu: 70
    gpu: 71 cpu: 71
    gpu: 72 cpu: 72
    gpu: 73 cpu: 73
    gpu: 74 cpu: 74
    gpu: 75 cpu: 75
    gpu: 76 cpu: 76
    gpu: 77 cpu: 77
    gpu: 78 cpu: 78
    gpu: 79 cpu: 79
    gpu: 80 cpu: 80
    gpu: 81 cpu: 81
    gpu: 82 cpu: 82
    gpu: 83 cpu: 83
    gpu: 84 cpu: 84
    gpu: 85 cpu: 85
    gpu: 86 cpu: 86
    gpu: 87 cpu: 87
    gpu: 88 cpu: 88
    gpu: 89 cpu: 89
    gpu: 90 cpu: 90
    gpu: 91 cpu: 91
    gpu: 92 cpu: 92
    gpu: 93 cpu: 93
    gpu: 94 cpu: 94
    gpu: 95 cpu: 95
    gpu: 96 cpu: 96
    gpu: 1 cpu: 1
    gpu: 2 cpu: 2
    gpu: 3 cpu: 3
    gpu: 4 cpu: 4
    gpu: 5 cpu: 5
    gpu: 6 cpu: 6
    gpu: 7 cpu: 7
    gpu: 8 cpu: 8
    gpu: 9 cpu: 9
    gpu: 10 cpu: 10
    gpu: 11 cpu: 11
    gpu: 12 cpu: 12
    gpu: 13 cpu: 13
    gpu: 14 cpu: 14
    gpu: 15 cpu: 15
    gpu: 16 cpu: 16
    gpu: 17 cpu: 17
    gpu: 18 cpu: 18
    gpu: 19 cpu: 19
    gpu: 20 cpu: 20
    gpu: 21 cpu: 21
    gpu: 22 cpu: 22
    gpu: 23 cpu: 23
    gpu: 24 cpu: 24
    gpu: 25 cpu: 25
    gpu: 26 cpu: 26
    gpu: 27 cpu: 27
    gpu: 28 cpu: 28
    gpu: 29 cpu: 29
    gpu: 30 cpu: 30
    gpu: 31 cpu: 31
    gpu: 32 cpu: 32
    gpu: 33 cpu: 33
    gpu: 34 cpu: 34
    gpu: 35 cpu: 35
    gpu: 36 cpu: 36
    gpu: 37 cpu: 37
    gpu: 38 cpu: 38
    gpu: 39 cpu: 39
    gpu: 40 cpu: 40
    gpu: 41 cpu: 41
    gpu: 42 cpu: 42
    gpu: 43 cpu: 43
    gpu: 44 cpu: 44
    gpu: 45 cpu: 45
    gpu: 46 cpu: 46
    gpu: 47 cpu: 47
    gpu: 48 cpu: 48
    gpu: 49 cpu: 49
    gpu: 50 cpu: 50
    gpu: 51 cpu: 51
    gpu: 52 cpu: 52
    gpu: 53 cpu: 53
    gpu: 54 cpu: 54
    gpu: 55 cpu: 55
    gpu: 56 cpu: 56
    ========= ERROR SUMMARY: 0 errors
    $
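
    The comment in t455.cu ("sized with IPT to handle the largest segment") hints at how longer segments could be covered: give each thread IPT items in CUB's blocked arrangement, so a block of TPB threads spans up to TPB*IPT elements. Below is a hedged sketch of that variant; the kernel name k_ipt and the loop-based load/store are my own additions, not part of the original listings.

    #include <cub/cub.cuh>

    // Sketch only: one block per segment, IPT items per thread (blocked order),
    // so each block can cover a segment of up to TPB*IPT elements.
    template <int TPB, int IPT, typename T>
    __global__ void k_ipt(T *data, int *idx){
        typedef cub::BlockScan<T, TPB> BlockScan;
        __shared__ typename BlockScan::TempStorage temp_storage;
        int seg_begin = idx[blockIdx.x];
        int seg_end   = idx[blockIdx.x+1];
        T thread_data[IPT];
        for (int i = 0; i < IPT; i++){
            int gidx = seg_begin + threadIdx.x*IPT + i;           // blocked arrangement
            thread_data[i] = (gidx < seg_end) ? data[gidx] : 0;   // zero-pad beyond the segment
        }
        BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
        __syncthreads();
        for (int i = 0; i < IPT; i++){
            int gidx = seg_begin + threadIdx.x*IPT + i;
            if (gidx < seg_end) data[gidx] = thread_data[i];      // write back in-segment items only
        }
    }

    Launched the same way as the kernel in t455.cu, e.g. k_ipt<128, 2><<<n-1, 128>>>(d_data, d_idx), this would cover segments of up to 256 elements; the zero padding past the segment end leaves the inclusive sums inside each segment unchanged.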
    
Thanks for posting a reply. The data I am working with is very large. An approach like this could work, but I would still need to break the data apart somehow. Then again, this may be the only way to do it -- perform the block scan and then fix up the overruns.

You asked specifically about using BlockScan. That only works at the threadblock level. The actual numbers you gave in your description are reasonable for block sizes.