优化内核混洗密钥代码-OpenCL
我刚刚开始接触OpenCL,学习编写内核代码的基础知识。我已经编写了一个内核代码,用于计算点数组的无序键。因此,对于许多点N,洗牌键以3位方式计算,其中x位在深度d(0)处 下面给出了编写的内核代码。该点以列主格式输入优化内核混洗密钥代码-OpenCL,opencl,Opencl,我刚刚开始接触OpenCL,学习编写内核代码的基础知识。我已经编写了一个内核代码,用于计算点数组的无序键。因此,对于许多点N,洗牌键以3位方式计算,其中x位在深度d(0)处 下面给出了编写的内核代码。该点以列主格式输入 __constant float3 boundsOffsetTable[8] = { {-0.5,-0.5,-0.5}, {+0.5,-0.5,-0.5}, {-0.5,+0.5,-0.5},
__constant float3 boundsOffsetTable[8] = {
{-0.5,-0.5,-0.5},
{+0.5,-0.5,-0.5},
{-0.5,+0.5,-0.5},
{-0.5,-0.5,+0.5},
{+0.5,+0.5,-0.5},
{+0.5,-0.5,+0.5},
{-0.5,+0.5,+0.5},
{+0.5,+0.5,+0.5}
};
uint setBit(uint x,unsigned char position)
{
uint mask = 1<<position;
return x|mask;
}
__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt;
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
code[i] = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) code = setBit(code,0);
if(pt.y>center.y) code = setBit(code,1);
if(pt.z>center.z) code = setBit(code,2);
for(int l = 1;l<level;l++)
{
for(int i=0;i<8;i++)
{
newRadius = radius *0.5;
newCenter = center + boundOffsetTable[i]*radius;
if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
{
if(pt.x>newCenter.x) code = setBit(code,3*l);
if(pt.y>newCenter.y) code = setBit(code,3*l+1);
if(pt.z>newCenter.z) code = setBit(code,3*l+2);
}
}
}
}
\uuuu常量float3边界可设置[8]={
{-0.5,-0.5,-0.5},
{+0.5,-0.5,-0.5},
{-0.5,+0.5,-0.5},
{-0.5,-0.5,+0.5},
{+0.5,+0.5,-0.5},
{+0.5,-0.5,+0.5},
{-0.5,+0.5,+0.5},
{+0.5,+0.5,+0.5}
};
uint setBit(uint x,无符号字符位置)
{
uint mask=1center.y)代码=setBit(代码,1);
如果(pt.z>center.z)代码=setBit(代码,2);
对于(int l=1;lnewCenter.y)代码=setBit(代码,3*l+1);
如果(pt.z>newCenter.z)代码=setBit(代码,3*l+2);
}
}
}
}
它可以工作,但我只是想问一下代码中是否缺少一些东西,以及是否有办法优化代码。试试这个内核:
__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt;
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
uint res;
res = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) res = setBit(res,0);
if(pt.y>center.y) res = setBit(res,1);
if(pt.z>center.z) res = setBit(res,2);
for(int l = 1;l<level;l++)
{
for(int i=0;i<8;i++)
{
newRadius = radius *0.5;
newCenter = center + boundOffsetTable[i]*radius;
if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
{
if(pt.x>newCenter.x) res = setBit(res,3*l);
if(pt.y>newCenter.y) res = setBit(res,3*l+1);
if(pt.z>newCenter.z) res = setBit(res,3*l+2);
}
}
}
//Save the result
code[i] = res;
}
\uuuuuuu内核无效morton\u代码(\uuuuu全局浮点*点,\uuuuu全局uint*代码,整数级,浮点3中心,浮点半径,整数大小){
//获取要处理的当前元素的索引
int i=获取全局id(0);
3pt;
点x=点[i];点y=点[size+i];点z=点[2*size+i];
uint res;
res=0;
新中心3号;
浮动半径;
如果(pt.x>center.x)res=setBit(res,0);
如果(pt.y>center.y)res=setBit(res,1);
如果(pt.z>center.z)res=setBit(res,2);
对于(int l=1;lnewCenter.y)res=setBit(res,3*l+1);
如果(pt.z>newCenter.z)res=setBit(res,3*l+2);
}
}
}
//保存结果
代码[i]=res;
}
要优化的规则:
另一件有趣的事。如果您是在3D级别操作,您可以使用float3变量并使用OpenCL操作符计算距离。这可以大大提高您的性能。但是也需要完全重写内核。当前的执行时间是多少,您希望代码的速度快多少,以及您是否希望代码平台特定?
__constant float3 boundsOffsetTable[8] = {
{-0.5,-0.5,-0.5},
{+0.5,-0.5,-0.5},
{-0.5,+0.5,-0.5},
{-0.5,-0.5,+0.5},
{+0.5,+0.5,-0.5},
{+0.5,-0.5,+0.5},
{-0.5,+0.5,+0.5},
{+0.5,+0.5,+0.5}
};
uint setBit(uint x,unsigned char position)
{
uint mask = 1<<position;
return x|mask;
}
__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt;
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
code[i] = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) code = setBit(code,0);
if(pt.y>center.y) code = setBit(code,1);
if(pt.z>center.z) code = setBit(code,2);
for(int l = 1;l<level;l++)
{
for(int i=0;i<8;i++)
{
newRadius = radius *0.5;
newCenter = center + boundOffsetTable[i]*radius;
if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
{
if(pt.x>newCenter.x) code = setBit(code,3*l);
if(pt.y>newCenter.y) code = setBit(code,3*l+1);
if(pt.z>newCenter.z) code = setBit(code,3*l+2);
}
}
}
}
__kernel void morton_code(__global float* point,__global uint*code,int level, float3 center,float radius,int size){
// Get the index of the current element to be processed
int i = get_global_id(0);
float3 pt;
pt.x = point[i];pt.y = point[size+i]; pt.z = point[2*size+i];
uint res;
res = 0;
float3 newCenter;
float newRadius;
if(pt.x>center.x) res = setBit(res,0);
if(pt.y>center.y) res = setBit(res,1);
if(pt.z>center.z) res = setBit(res,2);
for(int l = 1;l<level;l++)
{
for(int i=0;i<8;i++)
{
newRadius = radius *0.5;
newCenter = center + boundOffsetTable[i]*radius;
if(newCenter.x-newRadius<pt.x && newCenter.x+newRadius>pt.x && newCenter.y-newRadius<pt.y && newCenter.y+newRadius>pt.y && newCenter.z-newRadius<pt.z && newCenter.z+newRadius>pt.z)
{
if(pt.x>newCenter.x) res = setBit(res,3*l);
if(pt.y>newCenter.y) res = setBit(res,3*l+1);
if(pt.z>newCenter.z) res = setBit(res,3*l+2);
}
}
}
//Save the result
code[i] = res;
}