C++ 键/值数组的双调排序(bitonic sort)
我正在尝试修改对C++ 键/值数组的双音排序,c++,arrays,algorithm,sorting,opencl,C++,Arrays,Algorithm,Sorting,Opencl,我正在尝试修改对cl\u ints数组进行排序的算法,以对cl\u int2s数组进行排序(基于键–即cl\u int2.x) Intel的示例由一个简单的主机代码和一个OpenCL内核组成,该内核在一次排序操作(多路径)期间被多次调用。 内核一次加载4个数组项作为cl_int4,并对它们进行操作 我没有修改主机代码算法,只有设备代码。内核函数中的更改列表: 将第一个内核的参数类型从int4*修改为int8*(以加载四个键值对) 仅使用。甚至数组元素的组件也可以比较值(我终于解决了这个问题 棘
我正在尝试修改对 `cl_int` 数组进行排序的算法,使其能够对 `cl_int2` 数组进行排序(基于键——即 `cl_int2.x`)。
Intel 的示例由一段简单的主机代码和一个 OpenCL 内核组成,该内核在一次排序操作期间会被多次调用(多趟,multipass)。
内核一次以 `cl_int4` 的形式加载 4 个数组项,并对它们进行操作。
我没有修改主机代码算法,只有设备代码。内核函数中的更改列表:
- 将内核第一个参数的类型从 `int4*` 修改为 `int8*`(以便一次加载四个键值对)
- 仅使用 `theArray` 元素的 `.even` 分量(即键)来比较值(例如 `pseudomask = srcLeft.even < srcRight.even`)
我终于解决了这个问题。棘手的部分在于原始 Intel 代码处理已加载 4 元组中相邻对的相等值的方式——它没有显式地处理这种情况。
错误出现在第一个阶段(`stage = 0`)以及每个阶段的最后一趟(即 `passOfStage = 0`)中。
这些代码部分在一个 4 元组(由 `theArray` 数组中的一个 `cl_int8` 元素表示)内部交换单个 2 元组。
让我们考虑这个节录(例如,对于4元组中的相等相邻的2元组,它不能正常工作):
固定的、功能齐全的版本(我已经对固定部分进行了注释):
int4;你是说int8 psuedomask吗?@SamerTufail我想它应该是\uuuuu内核无效位排序(\uuuu全局int8*数组, 警察阶段, 警察局, 警察局长) { size\u t i=获取全局\u id(0); int8 srcleet,srclright,mask; int4伪任务; int4imask10=(int4)(0,0,-1,-1); int4imask11=(int4)(0,-1,0,-1); 如果(阶段>0) { 如果(passOfStage>0)//上层通行证,则在四人之间交换 { 尺寸(r=1>(第1阶段))>(第1阶段))&1)^dir) { 阵列[左]=imin; 阵列[右]=imax; } 其他的 { 数组[右]=imin; 阵列[左]=imax; } } 否则//最后一次通过,在1-4内排序 { srcleet=阵列[i]; srcRight=srcleeft.s45670123; 伪任务=(srcleet.偶
>阶段)&1)^dir) { srcleet=(srcleet&mask)|(srclright&mask); srcRight=srcleeft.s23016745; 伪任务=(srcleet.偶
,因为我在其中存储 `int4` 值(例如 `pseudomask = srcLeft.even < srcRight.even`,其中 `srcLeft.even` 是 `int4`)。此外,如果我尝试将类型更改为 `int8`,代码将无法编译。
@SamerTufail 没什么可道歉的 :) 我很高兴有人看完了我的问题!
// Copyright (c) 2009-2011 Intel Corporation
// https://software.intel.com/en-us/articles/bitonic-sorting
// Modified to sort int2 key-value array
//
// One pass of a bitonic sorting network over an array of (key, value) pairs.
// Every int8 element of `theArray` holds four pairs laid out as
// (key0, value0, key1, value1, key2, value2, key3, value3); only the keys
// (the `.even` components) are compared, and whole pairs are moved.
//
//   theArray     - buffer of int8 elements, modified in place
//   stage        - index of the current stage; 0 selects the "first stage" branch
//   passOfStage  - index of the current pass inside the stage; 0 selects the
//                  in-vector ("last pass") branch
//   dir          - direction flag that is XOR-ed into every ordering decision
//
// NOTE (known defect of this version): in the two in-vector branches the mask is
// expanded with `.xxyyzzww`, i.e. every pair uses its own comparison lane. When
// the two keys being compared are equal, both strict `<` comparisons are false
// and the XOR mask flips exactly one of them, so the two partner pairs receive
// opposite decisions. Both output slots then receive the same (key, value) pair
// and the value of the other pair is lost.
__kernel void BitonicSort(__global int8* theArray, const uint stage, const uint passOfStage, const uint dir)
{
    size_t i = get_global_id(0);
    int8 srcLeft, srcRight, mask;
    int4 pseudomask; // per-key comparison result; widened to int8 via a swizzle before use
    int4 imask10 = (int4)(0, 0, -1, -1); // XOR mask: flip the comparison result of lanes 2 and 3
    int4 imask11 = (int4)(0, -1, 0, -1); // XOR mask: flip the comparison result of lanes 1 and 3

    if(stage > 0)
    {
        if(passOfStage > 0) // upper level pass, exchange between two fours,
        {
            // Pick the two vectors that are `r` elements apart and form a compare/exchange couple.
            size_t r = 1 << (passOfStage - 1);
            size_t lmask = r - 1;
            size_t left = ((i>>(passOfStage-1)) << passOfStage) + (i & lmask);
            size_t right = left + r;

            srcLeft = theArray[left];
            srcRight = theArray[right];
            // Compare keys only, then duplicate every result so it covers both the key and the value lane.
            pseudomask = srcLeft.even < srcRight.even;
            mask = pseudomask.xxyyzzww;
            // Pair-wise minimum and maximum of the two vectors (by key).
            int8 imin = (srcLeft & mask) | (srcRight & ~mask);
            int8 imax = (srcLeft & ~mask) | (srcRight & mask);

            // The bit of `i` selected by `stage`, XOR-ed with `dir`, decides which side gets the minima.
            if( ((i>>(stage-1)) & 1) ^ dir )
            {
                theArray[left] = imin;
                theArray[right] = imax;
            }
            else
            {
                theArray[right] = imin;
                theArray[left] = imax;
            }
        }
        else // last pass, sort inside one four
        {
            srcLeft = theArray[i];
            srcRight = srcLeft.s45670123; // same vector with its two halves swapped (pairs 2,3,0,1)
            pseudomask = (srcLeft.even < srcRight.even) ^ imask10;
            mask = pseudomask.xxyyzzww; // see the NOTE in the header: unsafe for equal keys

            if(((i >> stage) & 1) ^ dir)
            {
                srcLeft = (srcLeft & mask) | (srcRight & ~mask);
                srcRight = srcLeft.s23016745; // neighbouring pairs swapped (pairs 1,0,3,2)
                pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
                mask = pseudomask.xxyyzzww; // see the NOTE in the header: unsafe for equal keys
                theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
            }
            else
            {
                srcLeft = (srcLeft & ~mask) | (srcRight & mask);
                srcRight = srcLeft.s23016745; // neighbouring pairs swapped (pairs 1,0,3,2)
                pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
                mask = pseudomask.xxyyzzww; // see the NOTE in the header: unsafe for equal keys
                theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
            }
        }
    }
    else // first stage, sort inside one four
    {
        /*
         * To convert this code to int2 sorter, do this:
         * 1. instead of loading int4, load int8 (key,value, key,value, ...)
         * 2. when there is a vector swizzling, replace component index with two consecutive indices:
         *        srcLeft.yxwz -> srcLeft.s23016745
         *    use this rewrite rule:
         *         x  y  z  w
         *        01 23 45 67
         * 3. replace comparison operands with only their keys swizzled:
         *        mask = srcLeft < srcRight;
         *     -> pseudomask = srcLeft.even < srcRight.even; mask = pseudomask.xxyyzzww;
         */

        // make bitonic sequence out of 4.
        int4 imask0 = (int4)(0, -1, -1, 0); // -1 in comparison = true (all bits set - two's complement)
        srcLeft = theArray[i];
        srcRight = srcLeft.s23016745;
        /*
         * This XOR mask flips bits, so that in `mask` are the following
         * results (remember that srcRight is srcLeft with swapped component pairs);
         * this holds only when the compared keys differ:
         *
         *     [ left.x<left.y, left.x<left.y, left.w<left.z, left.w<left.z ]
         * or: [ left.x<left.y, left.x<left.y, left.z>left.w, left.z>left.w ]
         */
        pseudomask = (srcLeft.even < srcRight.even) ^ imask0;
        mask = pseudomask.xxyyzzww; // see the NOTE in the header: unsafe for equal keys
        if( dir )
            srcLeft = (srcLeft & mask) | (srcRight & ~mask); // make sure the numbers are sorted like this:
        else
            srcLeft = (srcLeft & ~mask) | (srcRight & mask);
        /*
         * Now the pairs of numbers in `srcLeft` are sorted according to the specified `dir`ection.
         * If dir == true, then
         *     The components `x` and `y` are swapped so that `x` < `y`. Moreover `z` and `w` are
         *     swapped so that `z` > `w`. This resembles up-hill: /\
         * else
         *     The components `x` and `y` are swapped so that `x` > `y`. Moreover `z` and `w` are
         *     swapped so that `z` < `w`. This resembles down-hill: \/
         *
         * This swapping is achieved by creating `srcLeft`, which is in normal order, and `srcRight`,
         * which has component pairs switched (xyzw -> yxwz).
         * Then the `mask` is created. The mask bits are redundant because it applies to vector
         * component pairs (so in order to implement key-value sorting, the length of the masks has
         * to be increased).
         *
         * The non-ordered component pairs in `srcLeft` are masked out by `mask` while the inverted
         * `mask` is applied to the (pair-wise switched) `srcRight`.
         *
         * This (the previous) first flipping just makes a 4-bitonic sequence.
         */

        /*
         * This second step just sorts the bitonic sequence
         */
        srcRight = srcLeft.s45670123; // inverts the bitonic sequence
        // [ left.a<left.c, left.b<left.d, left.a<left.c, left.b<left.d ]
        pseudomask = (srcLeft.even < srcRight.even) ^ imask10; // imask10 = (noflip, noflip, flip, flip)
        mask = pseudomask.xxyyzzww; // see the NOTE in the header: unsafe for equal keys
        // even or odd (The output of this thread is sorted monotonic sequence. The monotonicity
        // changes and thus preparing bitonic sequence for the next pass.).
        if((i & 1) ^ dir)
        {
            // this sorts the bitonic sequence, hence splitting it
            srcLeft = (srcLeft & mask) | (srcRight & ~mask);
            srcRight = srcLeft.s23016745;
            pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
            mask = pseudomask.xxyyzzww; // see the NOTE in the header: unsafe for equal keys
            theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
        }
        else
        {
            srcLeft = (srcLeft & ~mask) | (srcRight & mask);
            srcRight = srcLeft.s23016745;
            pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
            mask = pseudomask.xxyyzzww; // see the NOTE in the header: unsafe for equal keys
            theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
        }
    }
}
/*
 * Enqueues every pass of the bitonic sorting network on `queue`.
 *
 * kernel          - kernel object whose arguments 0..3 are (buffer, stage,
 *                   passOfStage, direction), in that order
 * queue           - command queue that receives one launch per pass
 * cl_input_buffer - device buffer that is sorted in place
 * arraySize       - item count used to derive the stage count and the launch
 *                   sizes; presumably a power of two, four items per vector
 *                   element -- TODO confirm against the caller
 * sortAscending   - forwarded unchanged as kernel argument 3
 *
 * Return codes of the OpenCL calls are not inspected; the function only
 * enqueues work and does not wait for it to finish.
 */
void ExecuteSortKernel(cl_kernel kernel, cl_command_queue queue, cl_mem cl_input_buffer, cl_int arraySize, cl_uint sortAscending)
{
    /* One stage for every halving of arraySize while the remainder stays above 2. */
    cl_int stageCount = 0;
    cl_int remaining = arraySize;
    while (remaining > 2) {
        ++stageCount;
        remaining >>= 1;
    }

    /*
     * Launch sizes depend only on arraySize: passes with a non-zero index use
     * arraySize / 8 work-items, pass 0 uses twice as many.
     */
    const size_t upperPassItems = arraySize / (2 * 4);
    const size_t lastPassItems = upperPassItems << 1;

    /* Arguments 0 and 3 stay the same for every launch. */
    clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &cl_input_buffer);
    clSetKernelArg(kernel, 3, sizeof(cl_uint), (void *) &sortAscending);

    for (cl_int currentStage = 0; currentStage < stageCount; ++currentStage) {
        clSetKernelArg(kernel, 1, sizeof(cl_uint), (void *) &currentStage);

        /* Passes of a stage run from the highest index down to 0. */
        cl_int currentPass = currentStage;
        while (currentPass >= 0) {
            clSetKernelArg(kernel, 2, sizeof(cl_uint), (void *) &currentPass);

            size_t global_work_size[1];
            if (currentPass != 0) {
                global_work_size[0] = upperPassItems;
            } else {
                global_work_size[0] = lastPassItems;
            }

            clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
            --currentPass;
        }
    }
}
// Excerpt of the first in-vector step; the variables are declared in the enclosing kernel.
// It shows why expanding the mask with `.xxyyzzww` fails for equal neighbouring keys.
imask0 = (int4)(0, -1, -1, 0); // XOR mask: flips the comparison result of lanes 1 and 2
srcLeft = theArray[i]; // int8
srcRight = srcLeft.s23016745; // srcLeft with neighbouring (key, value) pairs swapped
// Lanes: (k0<k1, !(k1<k0), !(k2<k3), k3<k2). For k0 == k1 lanes 0 and 1 become
// (false, true), i.e. the two partner pairs get opposite decisions.
pseudomask = (srcLeft.even < srcRight.even) ^ imask0;
mask = pseudomask.xxyyzzww; // every pair uses its own lane, so the partners may disagree
// With disagreeing partner lanes both slots pick the same pair and the other pair is lost.
result = (srcLeft & mask) | (srcRight & ~mask);
srcLeft:                           x    y    z    w
                                   <    <    <    <
srcRight [relative to srcLeft]:    y    x    w    z
^ imask0:                          0   -1    0    1
------------------------------------------
(srcLeft<srcRight)^imask0:         x    x    z    z
// One pass of a bitonic sorting network over an array of (key, value) pairs
// (fixed version). Every int8 element of `theArray` holds four pairs laid out as
// (key0, value0, key1, value1, key2, value2, key3, value3); only the keys
// (the `.even` components) are compared, and whole pairs are moved.
//
//   theArray     - buffer of int8 elements, modified in place
//   stage        - index of the current stage; 0 selects the "first stage" branch
//   passOfStage  - index of the current pass inside the stage; 0 selects the
//                  in-vector ("last pass") branch
//   dir          - direction flag that is XOR-ed into every ordering decision
//
// In the in-vector branches the two pairs that are compared with each other must
// take their decision from ONE comparison lane. Otherwise equal keys make the two
// partners disagree and one pair gets duplicated while the other is lost. The mask
// swizzles marked below take care of that.
__kernel void BitonicSort(__global int8* theArray, const uint stage, const uint passOfStage, const uint dir)
{
    size_t i = get_global_id(0);
    int8 srcLeft, srcRight, mask;
    int4 pseudomask; // per-key comparison result; widened to int8 via a swizzle before use
    int4 imask10 = (int4)(0, 0, -1, -1); // XOR mask: flip the comparison result of lanes 2 and 3
    int4 imask11 = (int4)(0, -1, 0, -1); // XOR mask: flip the comparison result of lanes 1 and 3

    if(stage > 0)
    {
        if(passOfStage > 0) // upper level pass, exchange between two fours
        {
            // Pick the two vectors that are `r` elements apart and form a compare/exchange couple.
            size_t r = 1 << (passOfStage - 1);
            size_t lmask = r - 1;
            size_t left = ((i>>(passOfStage-1)) << passOfStage) + (i & lmask);
            size_t right = left + r;

            srcLeft = theArray[left];
            srcRight = theArray[right];
            pseudomask = srcLeft.even < srcRight.even;
            mask = pseudomask.xxyyzzww;
            // Here pairs are exchanged between two different vectors and no XOR mask is applied,
            // so every pair keeps its own comparison lane; no two pairs have to share a decision.
            int8 imin = (srcLeft & mask) | (srcRight & ~mask);
            int8 imax = (srcLeft & ~mask) | (srcRight & mask);

            // The bit of `i` selected by `stage`, XOR-ed with `dir`, decides which side gets the minima.
            if( ((i>>(stage-1)) & 1) ^ dir )
            {
                theArray[left] = imin;
                theArray[right] = imax;
            }
            else
            {
                theArray[right] = imin;
                theArray[left] = imax;
            }
        }
        else // last pass, sort inside one four
        {
            srcLeft = theArray[i];
            srcRight = srcLeft.s45670123; // same vector with its two halves swapped (pairs 2,3,0,1)
            pseudomask = (srcLeft.even < srcRight.even) ^ imask10;
            // Pairs 0 and 2 share lane x, pairs 1 and 3 share lane y (both lanes are not flipped by imask10).
            mask = pseudomask.xxyyxxyy;

            if(((i >> stage) & 1) ^ dir)
            {
                srcLeft = (srcLeft & mask) | (srcRight & ~mask);
                srcRight = srcLeft.s23016745; // neighbouring pairs swapped (pairs 1,0,3,2)
                pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
                mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
                theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
            }
            else
            {
                srcLeft = (srcLeft & ~mask) | (srcRight & mask);
                srcRight = srcLeft.s23016745; // neighbouring pairs swapped (pairs 1,0,3,2)
                pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
                mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
                theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
            }
        }
    }
    else // first stage, sort inside one four
    {
        // Step 1: order pairs (0,1) one way and pairs (2,3) the opposite way.
        int4 imask0 = (int4)(0, -1, -1, 0); // XOR mask: flip the comparison result of lanes 1 and 2
        srcLeft = theArray[i];
        srcRight = srcLeft.s23016745; // neighbouring pairs swapped (pairs 1,0,3,2)
        pseudomask = (srcLeft.even < srcRight.even) ^ imask0;
        mask = pseudomask.xxxxwwww; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
        if( dir )
            srcLeft = (srcLeft & mask) | (srcRight & ~mask);
        else
            srcLeft = (srcLeft & ~mask) | (srcRight & mask);

        // Step 2: compare/exchange between the two halves of the vector.
        srcRight = srcLeft.s45670123; // same vector with its two halves swapped (pairs 2,3,0,1)
        pseudomask = (srcLeft.even < srcRight.even) ^ imask10;
        mask = pseudomask.xxyyxxyy; // the 0th and 2nd elements must contain the exact same value (as well as 1st and 3rd)

        // Step 3: final compare/exchange of neighbouring pairs; the lowest bit of `i`,
        // XOR-ed with `dir`, selects which of the two orderings this vector receives.
        if((i & 1) ^ dir)
        {
            srcLeft = (srcLeft & mask) | (srcRight & ~mask);
            srcRight = srcLeft.s23016745;
            pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
            mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
            theArray[i] = (srcLeft & mask) | (srcRight & ~mask);
        }
        else
        {
            srcLeft = (srcLeft & ~mask) | (srcRight & mask);
            srcRight = srcLeft.s23016745;
            pseudomask = (srcLeft.even < srcRight.even) ^ imask11;
            mask = pseudomask.xxxxzzzz; // the 0th and 1st elements must contain the exact same value (as well as 2nd and 3rd)
            theArray[i] = (srcLeft & ~mask) | (srcRight & mask);
        }
    }
}