英特尔FPGA的OpenCL中本地内存阵列的Ram消耗如此之大

英特尔FPGA的OpenCL中本地内存阵列的Ram消耗如此之大,opencl,intel-fpga,Opencl,Intel Fpga,我用OpenCL为FPGA板编写了一个简单的代码。我使用DE10 nano仅共享板和Intel SDK 18.1。主要问题是Ram消耗过多。HTML报告主要显示本地内存数组中的问题。在ND范围的内核中,这个问题变得更糟 另一个问题是:所有本地数组都有一个编译器警告,即: (积极的编译器优化:删除本地内存中不必要的存储) 顺便说一下,在循环分析选项卡中有II:~1,在详细信息窗格中提到: (II是一个近似值,由以下可失速指令决定:加载操作#否,存储操作#否)。我怎样才能解决它并达到II的精确1 守

我用OpenCL为FPGA板编写了一个简单的代码。我使用DE10 nano仅共享板和Intel SDK 18.1。主要问题是Ram消耗过多。HTML报告主要显示本地内存数组中的问题。在ND范围的内核中,这个问题变得更糟

另一个问题是:所有本地数组都有一个编译器警告,即:

(积极的编译器优化:删除本地内存中不必要的存储)

顺便说一下,在循环分析选项卡中有II:~1,在详细信息窗格中提到:

(II是一个近似值,由以下可失速指令决定:加载操作#否,存储操作#否)。我怎样才能解决它并达到II的精确1
守则:

#define IDX(i, j, n) ((i) * (n) + (j))
//#include<stdlib.h>

__kernel void PushKernel( uint column,__global int * restrict height,
__global int * restrict excessFlow,__global int * restrict netFlowOutS,
__global int * restrict netFlowInT,uint s,uint t,uint row,
__global int * restrict residualFlow_up,__global int * restrict residualFlow_down,
__global int * restrict residualFlow_right,__global int * restrict residualFlow_left)
{
    const uint num_column=6;
    const uint num_row=4;
    int FlowOutS=*netFlowOutS;
    int FlowInT=*netFlowInT;
    uint source=s;
    uint destination=t;
    uint index;
    __local int heights_horizontal_cache[6];
    __local int excessFlow_horizontal_cache[6];
    __local int excessFlow_horizontal_cache_temp[6];
    __local int residualFlow_right_cache[6];
    __local int residualFlow_left_cache[6];
    __local int outS_cache;
    //#pragma unroll
    //#pragma loop_coalesce
    #pragma ivdep
    //#pragma ii 1
    for(int i=0; i<num_row; i++){index=IDX(i, 0, num_column);
        #pragma unroll
        #pragma ivdep
        for(int j=0; j<num_column; j++){//index=IDX(i, 0, num_column);
            heights_horizontal_cache[j]=height[index+j];
            excessFlow_horizontal_cache[j]=excessFlow[index+j];
            excessFlow_horizontal_cache_temp[j]=0;
            residualFlow_right_cache[j]=residualFlow_right[index+j];
            residualFlow_left_cache[j]=residualFlow_left[index+j];
            outS_cache=0;
        }
    
//mem_fence(CLK_GLOBAL_MEM_FENCE);
///////////////////////////////////////////////////////////////////////push to right
     
        //#pragma ivdep array (residualFlow_right_cache)  
        #pragma ivdep
        #pragma unroll
        for(int j=0; j<num_column-1; j++){
            //index=IDX(i, j, num_column);
            
            if(index+j != source && index+j != destination && excessFlow_horizontal_cache[j]>0 && residualFlow_right_cache[j]>0 && heights_horizontal_cache[j]==heights_horizontal_cache[j+1]+1){
                int delta = min(excessFlow_horizontal_cache[j], residualFlow_right_cache[j]);
                residualFlow_right_cache[j]-=delta; 
                residualFlow_left_cache[j+1]+=delta;    
                excessFlow_horizontal_cache[j]-=delta;
                
                //excessFlow_horizontal_cache[j+1]+=delta;
                excessFlow_horizontal_cache_temp[j+1]=delta;

                if (IDX(i, j+1, num_column) == s) {
                    //FlowOutS-=delta;
                    outS_cache=delta;
                } 
                else if (IDX(i, j+1, num_column) == t) {
                    FlowInT+=delta;}
            }
        
///////////////////////////////////////////////////////////////////////results back to global
//mem_fence(CLK_GLOBAL_MEM_FENCE);
        }
        #pragma unroll
        #pragma ivdep
        for(int j=0; j<num_column; j++){
            excessFlow_horizontal_cache[j]+=excessFlow_horizontal_cache_temp[j];
        }
        #pragma unroll
        #pragma ivdep
        for(int j=0; j<num_column; j++){
            //index=IDX(i, 0, num_column);
            excessFlow[index+j]=excessFlow_horizontal_cache[j];
            
            residualFlow_right[index+j]=residualFlow_right_cache[j];
            residualFlow_left[index+j]=residualFlow_left_cache[j];  
        }
    }
    FlowOutS-=outS_cache;
    *netFlowOutS=FlowOutS;
    *netFlowInT=FlowInT;
}
定义IDX(i,j,n)((i)*(n)+(j)) //#包括 __内核无效PushKernel(uint列,u全局int*限制高度, __全局整数*限制溢出,u全局整数*限制网络溢出, __全局整数*限制netFlowInT、uint s、uint t、uint行, __全局整数*限制剩余流量\向上,\全局整数*限制剩余流量\向下, __全局整数*限制剩余流量(右,\全局整数*限制剩余流量(左) { consuint num_column=6; consuint num_row=4; int流出=*净流出; int-FlowInT=*netFlowInT; uint源=s; uint目的地=t; uint指数; __本地整数高度\水平\缓存[6]; __本地int-excessFlow_水平_缓存[6]; __本地int-excessFlow_水平_-cache_-temp[6]; __本地int residualFlow_right_缓存[6]; __本地int residualFlow_left_缓存[6]; __本地int/u缓存; //#布拉格展开 //#pragma环聚结 #布拉格马伊夫代普 //#布拉格马ii 1 对于(int i=0;i