标签: c++, cuda

C++ CUDA归约——竞态条件?

请考虑我从教程中获得的以下代码(及附带的解释性图像),其目的是演示CUDA的并行归约:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <numeric>

using namespace std;

__global__ void sumSingleBlock(int* d)
{
  int tid = threadIdx.x;

  // Number of participating threads (tc) halves on each iteration
  for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
  {
    // Thread must be allowed to write
    if (tid < tc)
    {
      // We need to do A + B, where B is the element following A, so first we
      // need to find the position of element A and of element B
      int posA = tid * stepSize * 2;
      int posB = posA + stepSize;

      // Update the value at posA by adding the value at posB to it
      d[posA] += d[posB];
    }
  }
}

int main()
{
  cudaError_t status;
  const int count = 8;
  const int size = count * sizeof(int);
  int* h = new int[count];
  for (int i = 0; i < count; ++i)
    h[i] = i+1;
  int* d;
  status = cudaMalloc(&d, size);
  status = cudaMemcpy(d, h, size, cudaMemcpyHostToDevice);
  sumSingleBlock<<<1,count/2>>>(d);
  int result;
  status = cudaMemcpy(&result, d, sizeof(int), cudaMemcpyDeviceToHost);
  cout << "Sum is " << result << endl;
  getchar();
  cudaFree(d);
  delete [] h;
  return 0;
}


代码错误,需要调用
__syncthreads()
,如下所示

// Tree reduction over 2*blockDim.x ints in global memory; the total ends
// up in d[0]. Launch with a single block whose blockDim.x is half the
// element count (assumes a power-of-two element count -- TODO confirm).
__global__ void sumSingleBlock(int* d)
{
  int tid = threadIdx.x;

  // Number of participating threads (tc) halves on each iteration
  for (int tc = blockDim.x, stepSize = 1; tc > 0; tc >>= 1, stepSize <<= 1)
  {
    // Thread must be allowed to write
    if (tid < tc)
    {
      // We need to do A + B, where B is the element following A, so first we 
      // need to find the position of element A and of element B      
      int posA = tid * stepSize * 2;
      int posB = posA + stepSize;

      // Update the value at posA by adding the value at posB to it
      d[posA] += d[posB];
    }
    // Barrier sits OUTSIDE the divergent if, so every thread in the block
    // reaches it; it makes this round's writes visible before any thread
    // reads them in the next round, fixing the race in the original code.
     __syncthreads();
  }
}
是的,代码是错误的。在一般情况下需要同步。即使您希望针对warp同步(warp-synchronous)行为,所提供的代码也有问题。您可能应该使用更好的教程(NVIDIA有一个)。然而,此版本的教程缺少warp同步部分所需的
volatile
处理。感谢Robert Crovella;也许你可以把它作为答案发布?好奇是谁投了反对票……我应该说,我已经翻阅了教程下发布的所有评论,发现其他人也指出了这一点;作者回复说
__syncthreads();
调用应添加在内核中的
if
语句之后。