Synchronization 在OpenCL中为矩阵乘法示例实现互斥
我正在尝试为OpenCL矩阵乘法示例实现互斥。其思想是为每一个人实现一个指令插入,添加一个条目,以便能够以原子方式打印乘法的数目。目前,我实现了一个自旋锁。内核的代码是Synchronization 在OpenCL中为矩阵乘法示例实现互斥,synchronization,opencl,mutex,fpga,Synchronization,Opencl,Mutex,Fpga,我正在尝试为OpenCL矩阵乘法示例实现互斥。其思想是为每一个人实现一个指令插入,添加一个条目,以便能够以原子方式打印乘法的数目。目前,我实现了一个自旋锁。内核的代码是 #include "../host/inc/matrixMult.h" #ifndef SIMD_WORK_ITEMS #define SIMD_WORK_ITEMS 4 // default value #endif __kernel __attribute((reqd_work_group_size(BLOCK_SIZE,
#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
__global double *num,
// Widths of matrices.
int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);
// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);
// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;
float running_sum = 0.0f;
// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
// Load the matrices to local memory. Note that the (x, y) indices
// are swapped for A_local and B_local. This affects the reads from
// A_local and B_local below and result in more efficient hardware.
//
// This is actually an optimization that the compiler can perform,
// but is shown here for illustration purposes.
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
// Wait for the entire block to be loaded.
barrier(CLK_LOCAL_MEM_FENCE);
// Do the dot product accumulation within this block. Fully unroll the loop.
// As a result of the swap of indices above, memory accesses to
// A_local and B_local are very efficient because each loop iteration
// accesses consecutive elements. This can be seen by unrolling the
// loop and analyzing the regions that are loaded:
// A_local[local_y][0..BLOCK_SIZE-1] and
// B_local[local_x][0..BLOCK_SIZE-1]
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
// Wait for the block to be fully consumed before loading the next
// block.
barrier(CLK_LOCAL_MEM_FENCE);
}
// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
我想知道我的代码有什么问题,如何实现互斥?我的互斥实现是否错误?我试图发表评论,但我的代表还不够。 不是精确的解决方案——或者答案——但是,你可以得到这个想法。 在fpga上下文原子中,互斥体在Xilinx实现上存在问题——没有实现
你能使用屏障吗?是的,我能。有没有办法使用屏障?我使用的是Altera board而不是XilinxSorry,我混淆了品牌。(同时,我的意图是评论,而不是回答)
#include "../host/inc/matrixMult.h"
#ifndef SIMD_WORK_ITEMS
#define SIMD_WORK_ITEMS 4 // default value
#endif
__kernel
__attribute((reqd_work_group_size(BLOCK_SIZE,BLOCK_SIZE,1)))
__attribute((num_simd_work_items(SIMD_WORK_ITEMS)))
void matrixMult( // Input and output matrices
__global float *restrict C,
__global float *A,
__global float *B,
__global double *num,
// Widths of matrices.
int A_width, int B_width){
// Local storage for a block of input matrices A and B
__local float A_local[BLOCK_SIZE][BLOCK_SIZE];
__local float B_local[BLOCK_SIZE][BLOCK_SIZE];
// ///////////////////////
__local mutex;
mutex = 0;
// ////////////////////////
// Block index
int block_x = get_group_id(0);
int block_y = get_group_id(1);
// Local ID index (offset within a block)
int local_x = get_local_id(0);
int local_y = get_local_id(1);
// Compute loop bounds
int a_start = A_width * BLOCK_SIZE * block_y;
int a_end = a_start + A_width - 1;
int b_start = BLOCK_SIZE * block_x;
float running_sum = 0.0f;
// Compute the matrix multiplication result for this output element. Each
// loop iteration processes one block of the matrix.
for (int a = a_start, b = b_start; a <= a_end; a += BLOCK_SIZE, b += (BLOCK_SIZE * B_width))
{
// Load the matrices to local memory. Note that the (x, y) indices
// are swapped for A_local and B_local. This affects the reads from
// A_local and B_local below and result in more efficient hardware.
//
// This is actually an optimization that the compiler can perform,
// but is shown here for illustration purposes.
A_local[local_y][local_x] = A[a + A_width * local_y + local_x];
B_local[local_x][local_y] = B[b + B_width * local_y + local_x];
// Wait for the entire block to be loaded.
barrier(CLK_LOCAL_MEM_FENCE);
// Do the dot product accumulation within this block. Fully unroll the loop.
// As a result of the swap of indices above, memory accesses to
// A_local and B_local are very efficient because each loop iteration
// accesses consecutive elements. This can be seen by unrolling the
// loop and analyzing the regions that are loaded:
// A_local[local_y][0..BLOCK_SIZE-1] and
// B_local[local_x][0..BLOCK_SIZE-1]
#pragma unroll
for (int k = 0; k < BLOCK_SIZE; ++k)
{
running_sum += A_local[local_y][k] * B_local[local_x][k];
// Mutex Implementation
while(atomic_cmpxchg(&mutex, 0, 1) == 1);
*num = *num+1; // = num_mul;
atomic_xchg(&mutex, 0);
}
// End Mutex
// Wait for the block to be fully consumed before loading the next
// block.
barrier(CLK_LOCAL_MEM_FENCE);
}
// Store result in matrix C
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = running_sum;
}
aoc: OpenCL parser completed successfully.
aoc: Optimizing and doing static analysis of code...
aoc: Linking with IP library ...
Checking if memory usage is larger than 100%
Compiler Warning: Vectorized kernel contains loads/stores that cannot be vectorized. This might reduce performance.
aoc: First stage compilation completed successfully.
Compiling for FPGA. This process may take a long time, please be patient.
Error (293007): Current module quartus_map ended unexpectedly. Verify that you have sufficient memory available to compile your design.
Error: Flow compile (for project /home/tanash/Music/matrix_mult_mutex_board_fp_no/bin/matrix_mult/top) was not successful
Error: ERROR: Error(s) found while running an executable. See report file(s) for error message(s). Message log indicates which executable was run last.
Error (23031): Evaluation of Tcl script /home/tanash/Build/intelFPGA/17.1/quartus/common/tcl/internal/qsh_flow.tcl unsuccessful
Error: Quartus Prime Shell was unsuccessful. 4 errors, 2965 warnings
Error: Compiler Error, not able to generate hardware