C++ CUDA：在继续之前，有没有办法强制完成每一行？_C++_Cuda

C++ CUDA：在继续之前，有没有办法强制完成每一行？

c++ cuda

C++ CUDA：在继续之前，有没有办法强制完成每一行？,c++,cuda,C++,Cuda,我是并行编程新手，非常感谢您帮助我理解并行编程的工作原理。这是一个人为的例子，我希望矩阵的每个单元格中的运算结果为50 结果取决于[index+1]处数组中的值。这在并行编程中效果不太好，因为值不是按顺序计算的，而且每隔几个单元格就会得到不正确的结果。我使用的创可贴是将函数拆分为多个函数，但我认为应该有更好的解决方案，尽管我不确定要搜索什么。多谢各位 CUDA代码： #include "cuda_runtime.h" #include "device_launch_parameters.h" #

我是并行编程新手，非常感谢您帮助我理解并行编程的工作原理。这是一个人为的例子，我希望矩阵的每个单元格中的运算结果为50

结果取决于[index+1]处数组中的值。这在并行编程中效果不太好，因为值不是按顺序计算的，而且每隔几个单元格就会得到不正确的结果。我使用的创可贴是将函数拆分为多个函数，但我认为应该有更好的解决方案，尽管我不确定要搜索什么。多谢各位

CUDA代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>

#include <iostream>

#define TILE_WIDTH 16

using namespace std;

__global__ void cuda_arithmetic(int height, int width, float *B, float *C, float *initial_array, float *result_array){

    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;

    if ((w < width) && h < (height))                    //initial=20, B=2, C=10, result = 17;
        initial_array[index] = powf(C[index],2);

    if ((w < (width-1)) && h < (height))
        result_array[index] = initial_array[index+1] / B[index];
}

__global__ void cuda_arithmetic_step_1(int height, int width, float *B, float *C, float *initial_array, float *result_array){

    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;

    if ((w < width) && h < (height))
        initial_array[index] = powf(C[index],2);
}

__global__ void cuda_arithmetic_step_2(int height, int width, float *B, float *C, float *initial_array, float *result_array){

    int             w                   =   blockIdx.x * blockDim.x + threadIdx.x; // Col // width
    int             h                   =   blockIdx.y * blockDim.y + threadIdx.y; // Row // height
    int             index               =   h * width + w;

    if ((w < (width-1)) && h < (height))
        result_array[index] = initial_array[index+1] / B[index];
}

int main(){

    int             height              =   800;
    int             width               =   8192;

    float           *A                  =   new float[height * width];
    float           *B                  =   new float[height * width];
    float           *C                  =   new float[height * width];
    float           *result             =   new float[height * width];

    for (int i = 0; i < height; i++){
        for (int j = 0; j < width; j++){
            A[i*width+j] = 20;
            B[i*width+j] = 2;
            C[i*width+j] = 10;
            result[i*width+j] = 17;
        }
    }

    float           *gpu_A;
    float           *gpu_B;
    float           *gpu_C;
    float           *gpu_result;

    cudaMalloc((void **)&gpu_A,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_B,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_C,         (height * width * sizeof(float)));
    cudaMalloc((void **)&gpu_result,    (height * width * sizeof(float)));

    cudaMemcpy(gpu_A,       A,          (height * width * sizeof(float)), cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_B,       B,          (height * width * sizeof(float)), cudaMemcpyHostToDevice); 
    cudaMemcpy(gpu_C,       C,          (height * width * sizeof(float)), cudaMemcpyHostToDevice); 
    cudaMemcpy(gpu_result,  result,     (height * width * sizeof(float)), cudaMemcpyHostToDevice);

    dim3            dimGrid((width - 1) / TILE_WIDTH + 1, (height - 1)/TILE_WIDTH + 1, 1);
    dim3            dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    // CODE OPTION

    // incorrect result
    cuda_arithmetic<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);

    // correct result
    //cuda_arithmetic_step_1<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);
    //cuda_arithmetic_step_2<<<dimGrid,dimBlock>>>(height, width, gpu_B, gpu_C, gpu_A, gpu_result);

    cudaMemcpy(result, gpu_result, (height * width * sizeof(float)), cudaMemcpyDeviceToHost);

    for (int i = 0; i < height; i++){
        for (int j = 0; j < (width-1); j++){
            if (abs((result[i*(width-1)+j] - 50)) > 0.001){
                cout << "error: ";
                cout << i << " * " << width-1 << " + " << j << ": " << result[i*(width-1)+j] << endl;
                system("pause");
            }
        }
        cout << endl;
    }
    cout << endl;

    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);
    cudaFree(gpu_result);

    delete[] A;
    delete[] B;
    delete[] C;
    delete[] result;

    system("pause");
}

#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
#包括
#包括
#定义瓷砖宽度16
使用名称空间std；
__全局无效cuda算法（整数高度、整数宽度、浮点*B、浮点*C、浮点*初始数组、浮点*结果数组）{
int w=blockIdx.x*blockDim.x+threadIdx.x；//列//宽度
int h=blockIdx.y*blockDim.y+threadIdx.y；//行//高度
int index=h*宽度+w；
如果（（w<宽度）和&h<（高度））//初始值=20，B=2，C=10，结果=17；
初始_数组[index]=powf（C[index]，2）；
如果（（宽<（宽-1））&高<（高））
结果数组[index]=初始数组[index+1]/B[index]；
}
__全局\uuuuuvoid cuda\u算术\u步长\u 1（整数高度、整数宽度、浮点*B、浮点*C、浮点*初始数组、浮点*结果数组）{
int w=blockIdx.x*blockDim.x+threadIdx.x；//列//宽度
int h=blockIdx.y*blockDim.y+threadIdx.y；//行//高度
int index=h*宽度+w；
如果（（宽度<宽度）和高度<（高度））
初始_数组[index]=powf（C[index]，2）；
}
__全局\uuuuuvoid cuda\u算术\u步长\u2（整数高度、整数宽度、浮点*B、浮点*C、浮点*初始数组、浮点*结果数组）{
int w=blockIdx.x*blockDim.x+threadIdx.x；//列//宽度
int h=blockIdx.y*blockDim.y+threadIdx.y；//行//高度
int index=h*宽度+w；
如果（（宽<（宽-1））&高<（高））
结果数组[index]=初始数组[index+1]/B[index]；
}
int main（）{
内部高度=800；
整数宽度=8192；
浮动*A=新浮动[高度*宽度]；
浮动*B=新浮动[高度*宽度]；
浮动*C=新浮动[高度*宽度]；
浮动*结果=新浮动[高度*宽度]；
对于（int i=0；i0.001）{
不能使用\uuuu syncthreads（）；
。\uuu syncthreads
之前的所有代码将在执行之后的任何代码之前由该内核启动的所有线程中执行
还要记住，最好将读写操作分开，以避免冲突（当多个线程从同一地址读写时）
array[i] = array2[i]

应按要求返工
   temp = array2[i]; 
   __syncthreads();
   array[i] = temp;
   __syncthreads();

由于你的例子是人为的，我的回答会有点笼统
通常，您要处理的是全局同步问题
正如您所发现的，唯一干净的全局同步点是内核启动，因此在必要的同步点之前和之后将代码分成几部分将插入一个全局同步，这是由于内核启动
 >P>另一种方法是考虑必要的同步是否可以本地化。如果是这样，您可以查看安排算法/数据，以便必要的同步可以在线程块中处理（共享内存和给我们内置的协调/同步能力。这可能会对数据边界（例如，线程块间边界）带来一些挑战。处理边界数据的一种方法是让相邻的线程块在边界区域执行冗余计算，从而确保每个线程块在生成任何最终结果之前生成所有必要的中间结果
initial_array[index+1] = powf(C[index+1],2);

result_array[index] = initial_array[index+1] / B[index];

        if (abs((result[i*(width-1)+j] - 50)) > 0.001){

        if (abs((result[i*(width)+j] - 50)) > 0.001){