Cuda 希利斯；斯蒂尔：核函数_Cuda_Gpu

Cuda 希利斯；斯蒂尔：核函数

cuda

Cuda 希利斯；斯蒂尔：核函数,cuda,gpu,Cuda,Gpu,有人能帮助理解Hillis&Steele:内核函数如何为每个线程执行工作吗 __global__ void scan(float *g_odata, float *g_idata, int n) { extern __shared__ float temp[]; // allocated on invocation int thid = threadIdx.x; int pout = 0, pin = 1; // load input into shared m

有人能帮助理解Hillis&Steele:内核函数如何为每个线程执行工作吗

/**
 * Exclusive prefix sum (Hillis & Steele scan) over the first n elements of
 * g_idata; element i of g_odata receives g_idata[0] + ... + g_idata[i-1].
 *
 * Launch requirements:
 *   - The kernel indexes by threadIdx.x only, so it is meant to be launched
 *     with a single block of at least n threads (thread i produces output i).
 *   - Dynamic shared memory of 2 * n * sizeof(float) bytes: two buffers of n
 *     floats each, alternated every pass (indices run up to 2*n - 1).
 *
 * @param g_odata  output array, at least n elements
 * @param g_idata  input array, at least n elements
 * @param n        number of elements to scan
 */
__global__ void scan(float *g_odata, float *g_idata, int n)
{
    extern __shared__ float temp[]; // sized by the third launch parameter
    int thid = threadIdx.x;
    int pout = 0, pin = 1;

    // Threads beyond n must not touch memory, but they still have to reach
    // every __syncthreads(), so they are masked instead of returning early.
    const bool active = thid < n;

    // Load input into shared memory.
    // This is an exclusive scan, so shift right by one and set the first
    // element to 0.
    if (active)
        temp[pout * n + thid] = (thid > 0) ? g_idata[thid - 1] : 0.0f;
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin = 1 - pout;
        if (active)
        {
            // Always read from the "in" buffer and overwrite the "out"
            // buffer: the "out" buffer holds stale (or, on the first pass,
            // uninitialised) data, so it must never be accumulated into.
            if (thid >= offset)
                temp[pout * n + thid] = temp[pin * n + thid] + temp[pin * n + thid - offset];
            else
                temp[pout * n + thid] = temp[pin * n + thid];
        }
        __syncthreads(); // all writes of this pass visible before the next
    }

    if (active)
        g_odata[thid] = temp[pout * n + thid]; // write output
}

/**
 * Exclusive prefix sum (Hillis & Steele scan) over the first n elements of
 * g_idata; element i of g_odata receives g_idata[0] + ... + g_idata[i-1].
 *
 * Launch requirements:
 *   - The kernel indexes by threadIdx.x only, so it is meant to be launched
 *     with a single block of at least n threads (thread i produces output i).
 *   - Dynamic shared memory of 2 * n * sizeof(float) bytes: two buffers of n
 *     floats each, alternated every pass (indices run up to 2*n - 1).
 *
 * @param g_odata  output array, at least n elements
 * @param g_idata  input array, at least n elements
 * @param n        number of elements to scan
 */
__global__ void scan(float *g_odata, float *g_idata, int n)
{
    extern __shared__ float temp[]; // allocated on invocation
    int thid = threadIdx.x;
    int pout = 0, pin = 1;

    // Threads with thid >= n are masked rather than returned early so that
    // every thread of the block still reaches each __syncthreads().
    const bool active = thid < n;

    // Load input into shared memory.
    // This is an exclusive scan, so shift right by one and set the first
    // element to 0.
    if (active)
        temp[pout * n + thid] = (thid > 0) ? g_idata[thid - 1] : 0.0f;
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2)
    {
        pout = 1 - pout; // swap double buffer indices
        pin = 1 - pout;
        if (active)
        {
            // Read only from the "in" buffer; the "out" buffer contains
            // stale or uninitialised values and is fully overwritten.
            if (thid >= offset)
                temp[pout * n + thid] = temp[pin * n + thid] + temp[pin * n + thid - offset];
            else
                temp[pout * n + thid] = temp[pin * n + thid];
        }
        __syncthreads();
    }

    if (active)
        g_odata[thid] = temp[pout * n + thid]; // write output
}

从现在起，我了解了以下内容：首先，我们有

pout=0，pin=1，thid 的取值范围为 [0，blockDim.x-1]

。因此，在第一次同步之前，我们有一个简单的右移，例如，如果我们有数组

[1 | 2 | 5 | 7]

，那么新数组就是

[0 | 1 | 2 | 5 | 7]

我把 `for` 循环看作多个实例，每个 `thid` 对应一个实例。例如，如果

thId=0

我们将执行以下操作：

thid=0

```
offset=1
```
pout=1-0=1（使用函数开头初始化的pout）
pin=1-1=0；（使用刚刚计算出的pout，即1）
temp[4]=temp[0]（else语句）
[0 | 1 | 2 | 5 | 0]
```
offset=2
```
pout=1-1=0（使用循环中上一步的pout）
pin=1-0=1；（使用刚刚计算出的pout）
temp[0]=temp[4]（else语句）
[0 | 1 | 2 | 5 | 0]

也就是说，pout和pin的取值由for循环内部的赋值决定，而不是由函数开头对这些变量的初始化决定。
我用同样的方式设想下面这个线程的执行过程：

thid=1

thid=1

```
offset=1
```
pout=1-0=1（使用函数开头初始化的pout）
pin=1-1=0
temp[4+1]=temp[0+1-1]（if语句），对temp的访问超出了范围

有人能给出一个直观的例子来说明它是如何执行的吗？另外，在执行最后一条代码语句时，将使用哪个pout值
如果你指的是并行扫描算法，你可以在这里看到一个直观的解释

我相信这些链接也有帮助

如果您指的是并行扫描算法，您可以在这里看到直观的解释

我相信这些链接也有帮助

好了，现在已经5年了。尽管如此，我还是想回答这个问题，因为我自己也花了一些时间来解决这个问题。也许这对某人还是有帮助的

关于“temp中的内存超出范围”：您需要分配的共享内存大小应为输入数组 g_idata 的两倍。

在我看来，代码需要稍微修改才能正常工作。我附上了一个可运行的示例，使用 `nvcc -std=c++11` 编译。

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cstdlib>
#include <iostream>

// Abort with a readable message when a CUDA runtime call fails; without this
// an earlier failure silently turns every later result into garbage.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            std::abort();                                                  \
        }                                                                  \
    } while (0)

/**
 * Exclusive prefix sum (Hillis & Steele scan) with operator + over the first
 * n elements of d_in; d_out[i] receives d_in[0] + ... + d_in[i-1].
 *
 * Launch requirements:
 *   - The kernel indexes by threadIdx.x only, so it is meant for a single
 *     block of at least n threads (thread i produces output i).
 *   - Dynamic shared memory of 2 * n * sizeof(int) bytes: two buffers of n
 *     ints, swapped on every pass.
 *
 * @param d_out  output array, at least n elements
 * @param d_in   input array, at least n elements
 * @param n      number of elements to scan
 */
__global__ void scanHillisSteele(int *d_out, int *d_in, int n) {
    int idx = threadIdx.x;
    extern __shared__ int temp[];
    int pout = 0, pin = 1;

    // Extra threads are masked (not returned early) so that every thread in
    // the block still reaches each __syncthreads().
    const bool active = idx < n;

    // Shift right by one: an exclusive scan starts with the identity (0).
    if (active) {
        temp[idx] = (idx > 0) ? d_in[idx - 1] : 0;
    }
    __syncthreads();

    for (int offset = 1; offset < n; offset *= 2) {
        // swap double buffer indices
        pout = 1 - pout;
        pin = 1 - pout;
        if (active) {
            if (idx >= offset) {
                // Read both operands from the "in" buffer; the "out" buffer
                // holds stale data and is fully overwritten.
                temp[pout * n + idx] = temp[pin * n + idx - offset] + temp[pin * n + idx];
            } else {
                temp[pout * n + idx] = temp[pin * n + idx];
            }
        }
        __syncthreads();
    }

    if (active) {
        d_out[idx] = temp[pout * n + idx];
    }
}

// Runs the scan on a fixed 10-element array and prints input and result.
int main() {
    const int ARRAY_SIZE = 10;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // generate the input array on the host
    int h_in[ARRAY_SIZE]{1, 2, 5, 7, 8, 10, 11, 12, 15, 19};
    int h_out[ARRAY_SIZE];

    // declare GPU memory pointers
    int *d_in = nullptr;
    int *d_out = nullptr;

    // allocate GPU memory
    CUDA_CHECK(cudaMalloc((void **) &d_in, ARRAY_BYTES));
    CUDA_CHECK(cudaMalloc((void **) &d_out, ARRAY_BYTES));

    // transfer the array to the GPU
    CUDA_CHECK(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    // launch the kernel: one block, one thread per element, and twice the
    // array size in shared memory for the two ping-pong buffers
    scanHillisSteele<<<1, ARRAY_SIZE, 2 * ARRAY_BYTES>>>(d_out, d_in, ARRAY_SIZE);
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // errors raised while the kernel ran

    // transfer the resulting array to the cpu
    CUDA_CHECK(cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost));

    // print out the input and resulting array
    std::cout << "Input:" << std::endl;
    for (int i = 0; i < ARRAY_SIZE; ++i) {
        std::cout << h_in[i] << " " << std::flush;
    }
    std::cout << std::endl << "Exclusive scan with operation +:" << std::endl;
    for (int i = 0; i < ARRAY_SIZE; ++i) {
        std::cout << h_out[i] << " " << std::flush;
    }
    std::cout << std::endl;

    // free GPU memory allocation
    CUDA_CHECK(cudaFree(d_in));
    CUDA_CHECK(cudaFree(d_out));

    return 0;
}