标签: c++, cuda

C++ CUDA归约——竞态条件?

请考虑我从教程中获得的以下代码(及附带的解释性图像),其目的是演示CUDA的并行归约:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <numeric>

using namespace std;

__global__ void sumSingleBlock(int* d)
{
  int tid = threadIdx.x;

  // Number of participating threads (tc) halves on each iteration
  for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
  {
    // Thread must be allowed to write
    if (tid < tc)
    {
      // We need to do A + B, where B is the element following A, so first we
      // need to find the position of element A and of element B
      int posA = tid * stepSize * 2;
      int posB = posA + stepSize;

      // Update the value at posA by adding the value at posB to it
      d[posA] += d[posB];
    }
  }
}

int main()
{
  cudaError_t status;
  const int count = 8;
  const int size = count * sizeof(int);
  int* h = new int[count];
  for (int i = 0; i < count; ++i)
    h[i] = i+1;
  int* d;
  status = cudaMalloc(&d, size);
  status = cudaMemcpy(d, h, size, cudaMemcpyHostToDevice);
  sumSingleBlock<<<1,count/2>>>(d);
  int result;
  status = cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost);
  cout << "Sum is " << result << endl;
  getchar();
  cudaFree(d);
  delete [] h;
  return 0;
}


代码错误,需要调用
__syncthreads()
,如下所示

// Tree reduction over 2*blockDim.x ints in global memory; the total ends
// up in d[0]. Launch with a single block whose blockDim.x is half the
// element count (assumes a power-of-two element count -- TODO confirm).
__global__ void sumSingleBlock(int* d)
{
  int tid = threadIdx.x;

  // Number of participating threads (tc) halves on each iteration
  for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
  {
    // Thread must be allowed to write
    if (tid < tc)
    {
      // We need to do A + B, where B is the element following A, so first we 
      // need to find the position of element A and of element B      
      int posA = tid * stepSize * 2;
      int posB = posA + stepSize;

      // Update the value at posA by adding the value at posB to it
      d[posA] += d[posB];
    }
    // Barrier sits OUTSIDE the divergent if, so every thread in the block
    // reaches it; it makes this round's writes visible before any thread
    // reads them in the next round, fixing the race in the original code.
     __syncthreads();
  }
}
是的,代码是错误的。在一般情况下需要同步。即使您希望针对warp同步(warp-synchronous)行为,所提供的代码也有问题。您可能应该使用更好的教程(NVIDIA有一个)。然而,此版本的教程缺少warp同步部分所需的
volatile
处理。感谢Robert Crovella;也许你可以把它作为答案发布?好奇是谁投了反对票……我应该说,我已经翻阅了教程下发布的所有评论,发现其他人也指出了这一点;作者回复说
__syncthreads();
调用应添加在内核中的
if
语句之后。