Synchronization: implementing a mutex for the OpenCL matrix multiplication example

I am trying to implement mutual exclusion for the OpenCL matrix multiplication example. The idea is to insert, for each multiplication, an instruction that adds to a counter, so that the total number of multiplications can be updated and printed atomically. Currently I have implemented this as a spin lock. The code for the kernel is:

#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif

__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
             __global float *restrict C,
             __global float *A,
             __global float *B,
             __global double *num,
             // Widths of matrices.
             int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];

// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);

// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);

// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end   = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;

float running_sum = 0.0f;

// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
    // Load the matrices to local memory. Note that the (x, y) indices
    // are swapped for A_local and B_local. This affects the reads from
    // A_local and B_local below and result in more efficient hardware.
    //
    // This is actually an optimization that the compiler can perform,
    // but is shown here for illustration purposes.
    A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
    B_local[local_x][local_y] = B[b + B_width * local_y + local_x];

    // Wait for the entire block to be loaded.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Do the dot product accumulation within this block. Fully unroll the loop.
    // As a result of the swap of indices above, memory accesses to
    // A_local and B_local are very efficient because each loop iteration
    // accesses consecutive elements. This can be seen by unrolling the
    // loop and analyzing the regions that are loaded:
    //  A_local[local_y][0..BLOCK_SIZE-1] and
    //  B_local[local_x][0..BLOCK_SIZE-1]

    #pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k)
    {
        running_sum += A_local[local_y][k] * B_local[local_x][k];
    }

    // Wait for the block to be fully consumed before loading the next
    // block.
    barrier(CLK_LOCAL_MEM_FENCE);
}

// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}

I would like to know what is wrong with my code and how mutual exclusion should be implemented here. Is my mutex implementation wrong?

I tried to post this as a comment, but I don't have enough reputation yet. This is not an exact solution (or a full answer), but it should give you the idea: in an FPGA context, atomics and mutexes are problematic; on the Xilinx implementation they are simply not implemented.
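To make the counting idea concrete: standard OpenCL has no atomic operations on double at all, so the __global double *num counter can never be updated atomically, lock or no lock. If the target does support the ordinary 32-bit integer atomics (core since OpenCL 1.1), the count can be kept without any mutex. The kernel below is only a stand-alone sketch of that pattern, not the Altera example kernel; the kernel name, its arguments, and the assumption that the host allocates count as a zero-initialised cl_uint buffer are all assumptions, not part of the original code.

// Stand-alone sketch (hypothetical kernel, not the Altera example):
// every multiplication bumps a shared 32-bit counter with a built-in atomic.
__kernel void count_multiplies(__global const float *a,
                               __global const float *b,
                               __global float *out,
                               __global volatile unsigned int *count, // assumed cl_uint buffer, zeroed by the host
                               int n)
{
    int gid = get_global_id(0);
    float sum = 0.0f;
    for (int k = 0; k < n; ++k) {
        sum += a[gid * n + k] * b[k];  // one multiplication...
        atomic_inc(count);             // ...one built-in 32-bit atomic, no spin lock
    }
    out[gid] = sum;
}

Since every work-item in the original kernel performs A_width multiply-accumulates (when A_width is a multiple of BLOCK_SIZE), the same total can also be obtained with a single atomic_add(count, (unsigned int)A_width) per work-item after the loops, which is far cheaper than one atomic per multiplication.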


Can you use barriers? Yes, I can. Is there a way to do this with barriers? I am using an Altera board, not Xilinx. Sorry, I mixed up the vendors. (Also, I intended this as a comment, not an answer.) A sketch of the barrier-based counting idea follows the compiler output at the end of this post.

The kernel with the spin lock added, and the aoc output it produces:
#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif

__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
             __global float *restrict C,
             __global float *A,
             __global float *B,
             __global double *num,
             // Widths of matrices.
             int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// ///////////////////////
__local mutex;
mutex = 0;

// ////////////////////////

// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);

// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);

// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end   = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;

float running_sum = 0.0f;

// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
    // Load the matrices to local memory. Note that the (x, y) indices
    // are swapped for A_local and B_local. This affects the reads from
    // A_local and B_local below and result in more efficient hardware.
    //
    // This is actually an optimization that the compiler can perform,
    // but is shown here for illustration purposes.
    A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
    B_local[local_x][local_y] = B[b + B_width * local_y + local_x];

    // Wait for the entire block to be loaded.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Do the dot product accumulation within this block. Fully unroll the loop.
    // As a result of the swap of indices above, memory accesses to
    // A_local and B_local are very efficient because each loop iteration
    // accesses consecutive elements. This can be seen by unrolling the
    // loop and analyzing the regions that are loaded:
    //  A_local[local_y][0..BLOCK_SIZE-1] and
    //  B_local[local_x][0..BLOCK_SIZE-1]

    #pragma unroll
    for (int k = 0; k < BLOCK_SIZE; ++k)
    {
        running_sum += A_local[local_y][k] * B_local[local_x][k];

        // Mutex Implementation
        while (atomic_cmpxchg(&mutex, 0, 1) == 1);
        *num = *num + 1; // = num_mul;
        atomic_xchg(&mutex, 0);
        // End Mutex
    }

    // Wait for the block to be fully consumed before loading the next
    // block.
    barrier(CLK_LOCAL_MEM_FENCE);
}

// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
aoc: OpenCL parser completed successfully.
aoc: Optimizing and doing static analysis of code...
aoc: Linking with IP library ...
Checking if memory usage is larger than 100%
Compiler Warning: Vectorized kernel contains loads/stores that cannot be vectorized. This might reduce performance.
aoc: First stage compilation completed successfully.
Compiling for FPGA. This process may take a long time, please be patient.
Error (293007): Current module quartus_map ended unexpectedly. Verify that you have sufficient memory available to compile your design.
Error: Flow compile (for project /home/tanash/Music/matrix_mult_mutex_board_fp_no/bin/matrix_mult/top) was not successful
Error: ERROR: Error(s) found while running an executable. See report file(s) for error message(s). Message log indicates which executable was run last.
Error (23031): Evaluation of Tcl script /home/tanash/Build/intelFPGA/17.1/quartus/common/tcl/internal/qsh_flow.tcl unsuccessful
Error: Quartus Prime Shell was unsuccessful. 4 errors, 2965 warnings
Error: Compiler Error, not able to generate hardware
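Regarding the barrier question above: instead of a mutex, each work-group can combine its work-items' counts in local memory and issue a single global atomic per group. The following is a stand-alone sketch of that pattern only (the kernel name, the count buffer, and the mults_per_item argument are assumptions), not a drop-in replacement for the matrix kernel.

// Hypothetical kernel: one global atomic per work-group instead of a mutex.
// "count" is assumed to be a zero-initialised cl_uint buffer on the host.
__kernel void count_per_group(__global volatile unsigned int *count,
                              int mults_per_item)
{
    __local unsigned int group_count;

    // One work-item initialises the per-group counter.
    if (get_local_id(0) == 0 && get_local_id(1) == 0)
        group_count = 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Each work-item adds its own total with a cheap local atomic.
    atomic_add(&group_count, (unsigned int)mults_per_item);
    barrier(CLK_LOCAL_MEM_FENCE);

    // Only one work-item per group touches global memory.
    if (get_local_id(0) == 0 && get_local_id(1) == 0)
        atomic_add(count, group_count);
}

This also avoids replicating a spin lock inside the fully unrolled, SIMD-vectorized loop, which is one plausible contributor to the "cannot be vectorized" warning and to the very heavy quartus_map stage in the log above, though I have not verified that on hardware.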