OpenCL/尝试理解内核代码
我正在学习OpenCL代码,该代码模拟了以下教程中的N体问题: 我的主要问题在于内核代码:OpenCL/尝试理解内核代码,opencl,Opencl,我正在学习OpenCL代码,该代码模拟了以下教程中的N体问题: 我的主要问题在于内核代码: for(int jb=0; jb < nb; jb++) { /* Foreach block ... */ 19 pblock[ti] = pos_old[jb*nt+ti]; /* Cache ONE particle position */ 20 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in
for(int jb=0; jb < nb; jb++) { /* Foreach block ... */
19 pblock[ti] = pos_old[jb*nt+ti]; /* Cache ONE particle position */
20 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in the work-group */
21 for(int j=0; j<nt; j++) { /* For ALL cached particle positions ... */
22 float4 p2 = pblock[j]; /* Read a cached particle position */
23 float4 d = p2 - p;
24 float invr = rsqrt(d.x*d.x + d.y*d.y + d.z*d.z + eps);
25 float f = p2.w*invr*invr*invr;
26 a += f*d; /* Accumulate acceleration */
27 }
28 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in work-group */
29 }
/*
 * nbody_sim - advance each body of an N-body system by one time step.
 *
 * One work-item handles one body (index get_global_id(0)).  The work-group
 * walks over all bodies in tiles of get_local_size(0) elements: every
 * work-item copies one position into local memory, the group synchronizes,
 * and then every work-item accumulates the acceleration caused by each body
 * of the cached tile:
 *
 *     a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
 *
 * Parameters:
 *   pos          positions of all bodies; .xyz is the position, .w is used
 *                as the mass m[j] in the formula above
 *   vel          velocities of all bodies
 *   numBodies    number of bodies in pos/vel; it does not have to be a
 *                multiple of the work-group size
 *   deltaTime    integration time step
 *   epsSqr       softening term added to the squared distance
 *   localPos     local scratch buffer, indexed by local id, so it must hold
 *                at least get_local_size(0) float4 elements
 *   newPosition  output: updated positions (.w copied from the old position)
 *   newVelocity  output: updated velocities
 *
 * Work-items whose global id is >= numBodies (padded global size) still take
 * part in the tile loads and barriers but read and write no body data.
 *
 * NOTE(review): r.w is the mass difference between the two bodies, so acc.w
 * (and therefore newVelocity[gid].w) accumulates a value with no physical
 * meaning.  This is kept unchanged from the original kernel.
 */
__kernel
void
nbody_sim(
__global float4* pos ,
__global float4* vel,
int numBodies,
float deltaTime,
float epsSqr,
__local float4* localPos,
__global float4* newPosition,
__global float4* newVelocity)
{
    const unsigned int tid = get_local_id(0);
    const unsigned int gid = get_global_id(0);
    const unsigned int localSize = get_local_size(0);

    // Treat a non-positive body count as "nothing to do".
    const unsigned int bodyCount = numBodies > 0 ? (unsigned int)numBodies : 0u;

    // False for padding work-items that have no body of their own.
    const bool active = gid < bodyCount;

    // Number of tiles to iterate, rounded up so that a trailing partial tile
    // is processed too.  The value depends only on bodyCount and localSize,
    // so every work-item of the group runs the same number of iterations and
    // reaches the same barriers.
    const unsigned int numTiles = (bodyCount + localSize - 1u) / localSize;

    // Position of this work-item's body (left at zero for padding work-items;
    // an if-statement is used so that pos[gid] is never read out of range).
    float4 myPos = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
    if (active)
    {
        myPos = pos[gid];
    }

    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    for (unsigned int tile = 0; tile < numTiles; ++tile)
    {
        // Load one tile into local memory: each work-item copies one element.
        const unsigned int tileBase = tile * localSize;
        const unsigned int idx = tileBase + tid;
        if (idx < bodyCount)
        {
            localPos[tid] = pos[idx];
        }

        // Synchronize to make sure the whole tile is available for processing.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Only the first tileCount slots of the last tile hold valid bodies.
        const unsigned int tileCount = min(localSize, bodyCount - tileBase);

        if (active)
        {
            for (unsigned int j = 0; j < tileCount; ++j)
            {
                // Acceleration caused by cached body j on this body.
                const float4 other = localPos[j];
                const float4 r = other - myPos;
                const float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
                const float invDist = 1.0f / sqrt(distSqr + epsSqr);
                const float invDistCube = invDist * invDist * invDist;
                const float s = other.w * invDistCube;

                // Accumulate the effect of all bodies.
                acc += s * r;
            }
        }

        // Synchronize so that nobody is still reading when the next tile is loaded.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // All barriers are behind us, so padding work-items may leave now.
    if (!active)
    {
        return;
    }

    const float4 oldVel = vel[gid];

    // Updated position and velocity.
    float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
    newPos.w = myPos.w;
    const float4 newVel = oldVel + acc * deltaTime;

    // Write to global memory.
    newPosition[gid] = newPos;
    newVelocity[gid] = newVel;
}
for(int jb=0; jb < nb; jb++) { … 21 for(int j=0; j<nt; j++) { … } … }(此处的代码片段在提取时被截断)
你能发布整个内核代码吗?否则我只能猜测其中的参数和私有变量。
组中似乎有nt个工作项,ti表示当前工作项。循环执行时,组中的每个项将只复制单个元素。通常,此副本来自全局数据源。第一个屏障强制工作项等待其他项完成其副本。这是必要的,因为组中的每个工作项都需要读取从每个其他工作项复制的数据。值不应该相同,因为每个工作项的ti应该不同。(但是对于第一个循环,jb*nt仍然等于零)
以下是整个内核代码:
for(int jb=0; jb < nb; jb++) { /* Foreach block ... */
19 pblock[ti] = pos_old[jb*nt+ti]; /* Cache ONE particle position */
20 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in the work-group */
21 for(int j=0; j<nt; j++) { /* For ALL cached particle positions ... */
22 float4 p2 = pblock[j]; /* Read a cached particle position */
23 float4 d = p2 - p;
24 float invr = rsqrt(d.x*d.x + d.y*d.y + d.z*d.z + eps);
25 float f = p2.w*invr*invr*invr;
26 a += f*d; /* Accumulate acceleration */
27 }
28 barrier(CLK_LOCAL_MEM_FENCE); /* Wait for others in work-group */
29 }
/*
 * nbody_sim - advance each body of an N-body system by one time step.
 *
 * One work-item handles one body (index get_global_id(0)).  The work-group
 * walks over all bodies in tiles of get_local_size(0) elements: every
 * work-item copies one position into local memory, the group synchronizes,
 * and then every work-item accumulates the acceleration caused by each body
 * of the cached tile:
 *
 *     a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
 *
 * Parameters:
 *   pos          positions of all bodies; .xyz is the position, .w is used
 *                as the mass m[j] in the formula above
 *   vel          velocities of all bodies
 *   numBodies    number of bodies in pos/vel; it does not have to be a
 *                multiple of the work-group size
 *   deltaTime    integration time step
 *   epsSqr       softening term added to the squared distance
 *   localPos     local scratch buffer, indexed by local id, so it must hold
 *                at least get_local_size(0) float4 elements
 *   newPosition  output: updated positions (.w copied from the old position)
 *   newVelocity  output: updated velocities
 *
 * Work-items whose global id is >= numBodies (padded global size) still take
 * part in the tile loads and barriers but read and write no body data.
 *
 * NOTE(review): r.w is the mass difference between the two bodies, so acc.w
 * (and therefore newVelocity[gid].w) accumulates a value with no physical
 * meaning.  This is kept unchanged from the original kernel.
 */
__kernel
void
nbody_sim(
__global float4* pos ,
__global float4* vel,
int numBodies,
float deltaTime,
float epsSqr,
__local float4* localPos,
__global float4* newPosition,
__global float4* newVelocity)
{
    const unsigned int tid = get_local_id(0);
    const unsigned int gid = get_global_id(0);
    const unsigned int localSize = get_local_size(0);

    // Treat a non-positive body count as "nothing to do".
    const unsigned int bodyCount = numBodies > 0 ? (unsigned int)numBodies : 0u;

    // False for padding work-items that have no body of their own.
    const bool active = gid < bodyCount;

    // Number of tiles to iterate, rounded up so that a trailing partial tile
    // is processed too.  The value depends only on bodyCount and localSize,
    // so every work-item of the group runs the same number of iterations and
    // reaches the same barriers.
    const unsigned int numTiles = (bodyCount + localSize - 1u) / localSize;

    // Position of this work-item's body (left at zero for padding work-items;
    // an if-statement is used so that pos[gid] is never read out of range).
    float4 myPos = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
    if (active)
    {
        myPos = pos[gid];
    }

    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    for (unsigned int tile = 0; tile < numTiles; ++tile)
    {
        // Load one tile into local memory: each work-item copies one element.
        const unsigned int tileBase = tile * localSize;
        const unsigned int idx = tileBase + tid;
        if (idx < bodyCount)
        {
            localPos[tid] = pos[idx];
        }

        // Synchronize to make sure the whole tile is available for processing.
        barrier(CLK_LOCAL_MEM_FENCE);

        // Only the first tileCount slots of the last tile hold valid bodies.
        const unsigned int tileCount = min(localSize, bodyCount - tileBase);

        if (active)
        {
            for (unsigned int j = 0; j < tileCount; ++j)
            {
                // Acceleration caused by cached body j on this body.
                const float4 other = localPos[j];
                const float4 r = other - myPos;
                const float distSqr = r.x * r.x + r.y * r.y + r.z * r.z;
                const float invDist = 1.0f / sqrt(distSqr + epsSqr);
                const float invDistCube = invDist * invDist * invDist;
                const float s = other.w * invDistCube;

                // Accumulate the effect of all bodies.
                acc += s * r;
            }
        }

        // Synchronize so that nobody is still reading when the next tile is loaded.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // All barriers are behind us, so padding work-items may leave now.
    if (!active)
    {
        return;
    }

    const float4 oldVel = vel[gid];

    // Updated position and velocity.
    float4 newPos = myPos + oldVel * deltaTime + acc * 0.5f * deltaTime * deltaTime;
    newPos.w = myPos.w;
    const float4 newVel = oldVel + acc * deltaTime;

    // Write to global memory.
    newPosition[gid] = newPos;
    newVelocity[gid] = newVel;
}
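__kernel
void
nbody_sim(
__global float4* pos,
__global float4* vel,
int numBodies,
float deltaTime,
float epsSqr,
__local float4* localPos,
__global float4* newPosition,
__global float4* newVelocity)
{
unsigned int tid = get_local_id(0);
unsigned int gid = get_global_id(0);
unsigned int localSize = get_local_size(0);
// 我们需要迭代的分片数
unsigned int numTiles = numBodies / localSize;
// 此工作项的位置
float4 myPos = pos[gid];
float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
for(int i = 0; i < numTiles; ++i)
{
// 将一个分片加载到局部内存
int idx = i * localSize + tid;
localPos[tid] = pos[idx];
// 同步,确保数据可供处理
barrier(CLK_LOCAL_MEM_FENCE);
// 计算每个物体产生的加速度
// a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
for(int j = 0; j < localSize; ++j)
// ……(内核的其余部分以及其后的部分文字在提取时丢失)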
共有“numTiles”个工作组,每个工作组包含“localSize”个工作项
“gid”是全局索引,“tid”是局部索引
让我们从循环“for(int i = 0; i < numTiles; ++i)”的第一次迭代(i = 0)开始
例如:
numTiles=4,localSize=25,numBodies=100=工作项的数量
然后,在执行时,如果gid=80,那么tid=5,idx=5,第一个赋值将是:localPos[5]=pos[5]
现在,我取gid=5,然后取tid=5和idx=5,我将有相同的赋值:localPos[5]=pos[5]
因此,据我所知,在第一次迭代中,在第一个“屏障”之后,每个工作项都包含相同的局部数组“localPos”,即第一个全局块的子数组,即“pos[0:24]”
这是一个很好的解释发生了什么