优化内核混洗密钥代码-OpenCL

优化内核混洗密钥代码-OpenCL,opencl,Opencl,我刚刚开始接触OpenCL,学习编写内核代码的基础知识。我已经编写了一个内核代码,用于计算点数组的无序键。因此,对于许多点N,洗牌键以3位方式计算,其中x位在深度d(0)处 下面给出了编写的内核代码。该点以列主格式输入 __constant float3 boundsOffsetTable[8] = { {-0.5,-0.5,-0.5}, {+0.5,-0.5,-0.5}, {-0.5,+0.5,-0.5},

我刚刚开始接触OpenCL,学习编写内核代码的基础知识。我已经编写了一个内核代码,用于计算点数组的无序键。因此,对于许多点N,洗牌键以3位方式计算,其中x位在深度d(0)处 下面给出了编写的内核代码。该点以列主格式输入

__constant float3 boundsOffsetTable[8] = {
              {-0.5,-0.5,-0.5},
              {+0.5,-0.5,-0.5},
              {-0.5,+0.5,-0.5},
              {-0.5,-0.5,+0.5},
              {+0.5,+0.5,-0.5},
              {+0.5,-0.5,+0.5},
              {-0.5,+0.5,+0.5},
              {+0.5,+0.5,+0.5}
};
uint setBit(uint x,unsigned char position)
{
uint mask = 1<<position;
return x|mask;
}

__kernel void morton_code(__global float* point,__global uint*code,int level, float3          center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt; 
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
code[i] = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) code = setBit(code,0);
if(pt.y>center.y) code = setBit(code,1);
if(pt.z>center.z) code = setBit(code,2);
for(int l = 1;l<level;l++)
{
    for(int i=0;i<8;i++)
    {
        newRadius = radius *0.5;
        newCenter = center + boundOffsetTable[i]*radius;
        if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
        {
            if(pt.x>newCenter.x) code = setBit(code,3*l);
            if(pt.y>newCenter.y) code = setBit(code,3*l+1);
            if(pt.z>newCenter.z) code = setBit(code,3*l+2);
        }
    }
}
}
\uuuu常量float3边界可设置[8]={
{-0.5,-0.5,-0.5},
{+0.5,-0.5,-0.5},
{-0.5,+0.5,-0.5},
{-0.5,-0.5,+0.5},
{+0.5,+0.5,-0.5},
{+0.5,-0.5,+0.5},
{-0.5,+0.5,+0.5},
{+0.5,+0.5,+0.5}
};
uint setBit(uint x,无符号字符位置)
{
uint mask=1center.y)代码=setBit(代码,1);
如果(pt.z>center.z)代码=setBit(代码,2);
对于(int l=1;lnewCenter.y)代码=setBit(代码,3*l+1);
如果(pt.z>newCenter.z)代码=setBit(代码,3*l+2);
}
}
}
}
它可以工作,但我只是想问一下代码中是否缺少一些东西,以及是否有办法优化代码。

试试这个内核:

__kernel void morton_code(__global float* point,__global uint*code,int level, float3          center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt; 
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
uint res;
res = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) res = setBit(res,0);
if(pt.y>center.y) res = setBit(res,1);
if(pt.z>center.z) res = setBit(res,2);
for(int l = 1;l<level;l++)
{
    for(int i=0;i<8;i++)
    {
        newRadius = radius *0.5;
        newCenter = center + boundOffsetTable[i]*radius;
        if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
        {
            if(pt.x>newCenter.x) res = setBit(res,3*l);
            if(pt.y>newCenter.y) res = setBit(res,3*l+1);
            if(pt.z>newCenter.z) res = setBit(res,3*l+2);
        }
    }
}
//Save the result
code[i] = res;
}
\uuuuuuu内核无效morton\u代码(\uuuuu全局浮点*点,\uuuuu全局uint*代码,整数级,浮点3中心,浮点半径,整数大小){
//获取要处理的当前元素的索引
int i=获取全局id(0);
3pt;
点x=点[i];点y=点[size+i];点z=点[2*size+i];
uint res;
res=0;
新中心3号;
浮动半径;
如果(pt.x>center.x)res=setBit(res,0);
如果(pt.y>center.y)res=setBit(res,1);
如果(pt.z>center.z)res=setBit(res,2);
对于(int l=1;lnewCenter.y)res=setBit(res,3*l+1);
如果(pt.z>newCenter.z)res=setBit(res,3*l+2);
}
}
}
//保存结果
代码[i]=res;
}
要优化的规则:

  • 避免使用全局内存(您直接从全局内存使用“代码”,我改变了这一点),您现在应该看到性能提高了3倍
  • 避免使用Ifs,如果可能,请使用“选择”。(参见OpenCL文档)
  • 在内核中使用更多内存。您不需要在位级别进行操作。int级别的操作会更好,并且可以避免大量的“setBit”调用。然后你可以在最后构建你的结果

  • 另一件有趣的事。如果您是在3D级别操作,您可以使用float3变量并使用OpenCL操作符计算距离。这可以大大提高您的性能。但是也需要完全重写内核。

    当前的执行时间是多少,您希望代码的速度快多少,以及您是否希望代码平台特定?
    __constant float3 boundsOffsetTable[8] = {
                  {-0.5,-0.5,-0.5},
                  {+0.5,-0.5,-0.5},
                  {-0.5,+0.5,-0.5},
                  {-0.5,-0.5,+0.5},
                  {+0.5,+0.5,-0.5},
                  {+0.5,-0.5,+0.5},
                  {-0.5,+0.5,+0.5},
                  {+0.5,+0.5,+0.5}
    };
    uint setBit(uint x,unsigned char position)
    {
    uint mask = 1<<position;
    return x|mask;
    }
    
    __kernel void morton_code(__global float* point,__global uint*code,int level, float3          center,float radius,int size){
    // Get the index of the current element to be processed
    int i = get_global_id(0);
    float3 pt; 
    pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
    code[i] = 0;
    float3 newCenter;
    float newRadius;
    if(pt.x>center.x) code = setBit(code,0);
    if(pt.y>center.y) code = setBit(code,1);
    if(pt.z>center.z) code = setBit(code,2);
    for(int l = 1;l<level;l++)
    {
        for(int i=0;i<8;i++)
        {
            newRadius = radius *0.5;
            newCenter = center + boundOffsetTable[i]*radius;
            if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
            {
                if(pt.x>newCenter.x) code = setBit(code,3*l);
                if(pt.y>newCenter.y) code = setBit(code,3*l+1);
                if(pt.z>newCenter.z) code = setBit(code,3*l+2);
            }
        }
    }
    }
    
    __kernel void morton_code(__global float* point,__global uint*code,int level, float3          center,float radius,int size){
    // Get the index of the current element to be processed
    int i = get_global_id(0);
    float3 pt; 
    pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
    uint res;
    res = 0;
    float3 newCenter;
    float newRadius;
    if(pt.x>center.x) res = setBit(res,0);
    if(pt.y>center.y) res = setBit(res,1);
    if(pt.z>center.z) res = setBit(res,2);
    for(int l = 1;l<level;l++)
    {
        for(int i=0;i<8;i++)
        {
            newRadius = radius *0.5;
            newCenter = center + boundOffsetTable[i]*radius;
            if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
            {
                if(pt.x>newCenter.x) res = setBit(res,3*l);
                if(pt.y>newCenter.y) res = setBit(res,3*l+1);
                if(pt.z>newCenter.z) res = setBit(res,3*l+2);
            }
        }
    }
    //Save the result
    code[i] = res;
    }