英特尔FPGA的OpenCL中本地内存阵列的Ram消耗如此之大
我用OpenCL为FPGA板编写了一个简单的代码。我使用DE10 nano仅共享板和Intel SDK 18.1。主要问题是Ram消耗过多。HTML报告主要显示本地内存数组中的问题。在ND范围的内核中,这个问题变得更糟 另一个问题是:所有本地数组都有一个编译器警告,即: (积极的编译器优化:删除本地内存中不必要的存储) 顺便说一下,在循环分析选项卡中有II:~1,在详细信息窗格中提到: (II是一个近似值,由以下可失速指令决定:加载操作#否,存储操作#否)。我怎样才能解决它并达到II的精确1英特尔FPGA的OpenCL中本地内存阵列的Ram消耗如此之大,opencl,intel-fpga,Opencl,Intel Fpga,我用OpenCL为FPGA板编写了一个简单的代码。我使用DE10 nano仅共享板和Intel SDK 18.1。主要问题是Ram消耗过多。HTML报告主要显示本地内存数组中的问题。在ND范围的内核中,这个问题变得更糟 另一个问题是:所有本地数组都有一个编译器警告,即: (积极的编译器优化:删除本地内存中不必要的存储) 顺便说一下,在循环分析选项卡中有II:~1,在详细信息窗格中提到: (II是一个近似值,由以下可失速指令决定:加载操作#否,存储操作#否)。我怎样才能解决它并达到II的精确1 守
守则:
#define IDX(i, j, n) ((i) * (n) + (j))
//#include<stdlib.h>
__kernel void PushKernel( uint column,__global int * restrict height,
__global int * restrict excessFlow,__global int * restrict netFlowOutS,
__global int * restrict netFlowInT,uint s,uint t,uint row,
__global int * restrict residualFlow_up,__global int * restrict residualFlow_down,
__global int * restrict residualFlow_right,__global int * restrict residualFlow_left)
{
const uint num_column=6;
const uint num_row=4;
int FlowOutS=*netFlowOutS;
int FlowInT=*netFlowInT;
uint source=s;
uint destination=t;
uint index;
__local int heights_horizontal_cache[6];
__local int excessFlow_horizontal_cache[6];
__local int excessFlow_horizontal_cache_temp[6];
__local int residualFlow_right_cache[6];
__local int residualFlow_left_cache[6];
__local int outS_cache;
//#pragma unroll
//#pragma loop_coalesce
#pragma ivdep
//#pragma ii 1
for(int i=0; i<num_row; i++){index=IDX(i, 0, num_column);
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){//index=IDX(i, 0, num_column);
heights_horizontal_cache[j]=height[index+j];
excessFlow_horizontal_cache[j]=excessFlow[index+j];
excessFlow_horizontal_cache_temp[j]=0;
residualFlow_right_cache[j]=residualFlow_right[index+j];
residualFlow_left_cache[j]=residualFlow_left[index+j];
outS_cache=0;
}
//mem_fence(CLK_GLOBAL_MEM_FENCE);
///////////////////////////////////////////////////////////////////////push to right
//#pragma ivdep array (residualFlow_right_cache)
#pragma ivdep
#pragma unroll
for(int j=0; j<num_column-1; j++){
//index=IDX(i, j, num_column);
if(index+j != source && index+j != destination && excessFlow_horizontal_cache[j]>0 && residualFlow_right_cache[j]>0 && heights_horizontal_cache[j]==heights_horizontal_cache[j+1]+1){
int delta = min(excessFlow_horizontal_cache[j], residualFlow_right_cache[j]);
residualFlow_right_cache[j]-=delta;
residualFlow_left_cache[j+1]+=delta;
excessFlow_horizontal_cache[j]-=delta;
//excessFlow_horizontal_cache[j+1]+=delta;
excessFlow_horizontal_cache_temp[j+1]=delta;
if (IDX(i, j+1, num_column) == s) {
//FlowOutS-=delta;
outS_cache=delta;
}
else if (IDX(i, j+1, num_column) == t) {
FlowInT+=delta;}
}
///////////////////////////////////////////////////////////////////////results back to global
//mem_fence(CLK_GLOBAL_MEM_FENCE);
}
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){
excessFlow_horizontal_cache[j]+=excessFlow_horizontal_cache_temp[j];
}
#pragma unroll
#pragma ivdep
for(int j=0; j<num_column; j++){
//index=IDX(i, 0, num_column);
excessFlow[index+j]=excessFlow_horizontal_cache[j];
residualFlow_right[index+j]=residualFlow_right_cache[j];
residualFlow_left[index+j]=residualFlow_left_cache[j];
}
}
FlowOutS-=outS_cache;
*netFlowOutS=FlowOutS;
*netFlowInT=FlowInT;
}
定义IDX(i,j,n)((i)*(n)+(j))
//#包括
__内核无效PushKernel(uint列,u全局int*限制高度,
__全局整数*限制溢出,u全局整数*限制网络溢出,
__全局整数*限制netFlowInT、uint s、uint t、uint行,
__全局整数*限制剩余流量\向上,\全局整数*限制剩余流量\向下,
__全局整数*限制剩余流量(右,\全局整数*限制剩余流量(左)
{
consuint num_column=6;
consuint num_row=4;
int流出=*净流出;
int-FlowInT=*netFlowInT;
uint源=s;
uint目的地=t;
uint指数;
__本地整数高度\水平\缓存[6];
__本地int-excessFlow_水平_缓存[6];
__本地int-excessFlow_水平_-cache_-temp[6];
__本地int residualFlow_right_缓存[6];
__本地int residualFlow_left_缓存[6];
__本地int/u缓存;
//#布拉格展开
//#pragma环聚结
#布拉格马伊夫代普
//#布拉格马ii 1
对于(int i=0;i