C uint64中38位的基数排序?
问题:是否有一种“拨号位”基数排序,可用于对数据类型中的位子集进行排序C uint64中38位的基数排序?,c,sorting,radix-sort,C,Sorting,Radix Sort,问题:是否有一种“拨号位”基数排序,可用于对数据类型中的位子集进行排序 uint64 *Radix_Sort_Dial_A_Bit(uint64_t *a, int num_a, int sort_bits); 特别是,是否可以对64位数据进行38位排序,并且速度介于下面所示的32/64和48/64之间 uint64 *Radix_Sort_ui64_38MSB(uint64_t *a, int num_a); 请注意,与在uint64_t[]中对所有64位进行排序相比,对48位和32位排序
uint64 *Radix_Sort_Dial_A_Bit(uint64_t *a, int num_a, int sort_bits);
特别是,是否可以对64位数据进行38位排序,并且速度介于下面所示的32/64和48/64之间
uint64 *Radix_Sort_ui64_38MSB(uint64_t *a, int num_a);
请注意,与在uint64_t[]中对所有64位进行排序相比,对48位和32位排序的研究验证了速度和正确性的提高
看起来,只针对数据中需要的那一部分位进行排序的 Radix_Sort() 通常是有用且高效的:只对需要的内容进行排序
在某些情况下,会为每个像素计算结果,并需要对结果进行排序。uint64_t[]用于保存计算结果和XY位置
保存像素XY坐标总共需要26位(X为13位,Y为13位,最大分辨率为8192),剩下38位用于数据排序
整个64位数据包可以使用 Radix_Sort_Uint64() 进行排序
一种更快的方法是使用 Radix_Sort_Uint48()(见下文),这样排序时不考虑最低的16位。这将正确地对所有数据进行排序,但同时也对不需要的13个X坐标位中的10个进行了排序
由于性能几乎与排序的位成线性比例,因此最佳情况下,排序中只考虑38个最高有效位
即使是40位基数排序也比使用48位要好。我试图将工作的48位基数排序推广到40位,但排序不正确
QSort_uint64_38_msb():
将64、48和32位结果线性化为38位:
lsf 64 0.938 48 0.736 32 0.534 38 -> 0.6500
38位基数排序比64位排序快35%,比48位排序快17%
即使是40位也会更快,每个uint64处理5个字节,而不是6个字节
=========
速度最快的uint64[]的8字节排序中的6字节,由以下内容概括:
//#############################################################################
//发件人:http://ideone.com/JHI0d9
//RadixSort---用于uint64的48 MSB
typedef联合{
结构{
uint32_t c6[256];
uint32_t c5[256];
uint32_t c4[256];
uint32_t c3[256];
uint32_t c2[256];
uint32_t c1[256];
};
uint32_t计数[256*6];
}rscounts6_t;
// #############################################################################
//以基数排序64为模式,但只查看48个最大的整数位
//0XFFFF-FFFF-FFFF-0000>16)和0xff;
t5=(数组[x]>>24)&0xff;
t4=(数组[x]>>32)&0xff;
t3=(数组[x]>>40)&0xff;
t2=(数组[x]>>48)&0xff;
t1=(数组[x]>>56)&0xff;
计数。c6[t6]++;
计数。c5[t5]++;
计数。c4[t4]++;
计数。c3[t3]++;
计数。c2[t2]++;
计数.c1[t1]++;
}
//将计数转换为偏移量
对于(x=0;x<256;x++){
t6=o6+计数。c6[x];
t5=o5+计数。c5[x];
t4=o4+计数。c4[x];
t3=o3+计数。c3[x];
t2=o2+计数。c2[x];
t1=o1+计数。c1[x];
计数.c6[x]=o6;
计数c5[x]=o5;
计数.c4[x]=o4;
计数c3[x]=o3;
计数。c2[x]=o2;
计数c1[x]=o1;
o6=t6;
o5=t5;
o4=t4;
o3=t3;
o2=t2;
o1=t1;
}
//基数
对于(x=0;x>16)&0xff;
cpy[counts.c6[t6]]=数组[x];
计数.c6[t6]++;}
对于(x=0;x>24)和0xff;
数组[counts.c5[t5]]=cpy[x];
计数.c5[t5]++;}
对于(x=0;x>32)&0xff;
cpy[counts.c4[t4]=数组[x];
计数.c4[t4]++;}
对于(x=0;x>40)&0xff;
数组[counts.c3[t3]=cpy[x];
计数.c3[t3]++;}
对于(x=0;x>48)&0xff;
cpy[counts.c2[t2]=数组[x];
计数.c2[t2]++;}
对于(x=0;x>56)&0xff;
数组[counts.c1[t1]=cpy[x];
计数.c1[t1]+;}
免费(cpy);
返回数组;
}//结束基数\排序\ 48 \ msb()。
==================================
再次感谢Rcgldr的创新编程建议!
我没有使用10,10,9,9,而是使用[4][10]的快速32位模式
它可以工作,但比48 MSB排序慢很多:40 MSB为737毫秒,48 MSB为588毫秒 :(
也许我编码得不好
Time= 6.108 sec = 33.668%, QSORT_UINT64_ARRAY , hits=1
Time= 3.060 sec = 16.866%, RADIX_SORT_UINT64_REG, hits=4, 0.765 sec each
Time= 2.947 sec = 16.241%, RADIX_SORT_UINT64_40R, hits=4, 0.737 sec each < SLOW
Time= 2.354 sec = 12.973%, RADIX_SORT_UINT64_48R, hits=4, 0.588 sec each
Time= 1.542 sec = 8.498%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each
Time= 0.769 sec = 4.236%, RADIX_SORT_64 , hits=1
时间=6.108秒=33.668%,QSORT_UINT64_ARRAY,命中次数=1
时间=3.060秒=16.866%,RADIX_SORT_UINT64_REG,命中次数=4,每次0.765秒
时间=2.947秒=16.241%,RADIX_SORT_UINT64_40R,命中次数=4,每次0.737秒 < 慢
时间=2.354秒=12.973%,RADIX_SORT_UINT64_48R,命中次数=4,每次0.588秒
时间=1.542秒=8.498%,RADIX_SORT_UINT64_32R,命中次数=4,每次0.385秒
时间=0.769秒=4.236%,RADIX_SORT_64,命中次数=1
测试:
- 创建一个随机的uint64_t[36M]主阵列
- 使用qsort和已知良好的基数排序radixSort()对其进行排序,以创建标准数组
- 比较Qsort和radixSort()结果的一致性
- 使用32、40、48和64 MSB基数排序对主机副本进行排序
- 屏蔽忽略的LSB后,将每个测试排序与标准进行比较
//=============================================================================
// From code submitted by rcgldr, Feb 8 2020
// Optimized to use Registers and to sort on 40 MSBs, ignoring 24 LSBs
/*
 * LSD radix sort of uint64_t keys on their 40 most significant bits, taken
 * as four 10-bit digits (bits 24..33, 34..43, 44..53, 54..63). The low 24
 * bits never take part in the ordering.
 *
 * pData  - keys to sort; holds the sorted result on return (four passes,
 *          so the data ends up back where it started).
 * pTemp  - scratch buffer with room for at least `count` elements.
 * count  - number of elements.
 * tsa    - when non-NULL it is handed to time_event() before sorting.
 */
void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
                       EV_TIME_STR *tsa)
{
    size_t bins[4][1024] = { 0 };   /* row 0 = most significant digit */
    uint64_t *from = pData;
    uint64_t *to = pTemp;
    size_t pos;
    int row;

    if(tsa) time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);

    /* Count every digit of every key in a single sweep. */
    for (pos = 0; pos < count; pos++) {
        const uint64_t key = pData[pos];
        bins[0][(key >> 54) & 0x3ff]++;
        bins[1][(key >> 44) & 0x3ff]++;
        bins[2][(key >> 34) & 0x3ff]++;
        bins[3][(key >> 24) & 0x3ff]++;
    }

    /* Turn each histogram into starting offsets (exclusive prefix sum). */
    for (row = 0; row < 4; row++) {
        size_t running = 0;
        size_t digit;
        for (digit = 0; digit < 1024; digit++) {
            const size_t occurrences = bins[row][digit];
            bins[row][digit] = running;
            running += occurrences;
        }
    }

    /* Scatter, least significant digit first, ping-ponging the buffers. */
    for (row = 3; row >= 0; row--) {
        size_t *cursor = bins[row];
        const unsigned shift = 24u + 10u * (unsigned)(3 - row);
        uint64_t *swap;

        for (pos = 0; pos < count; pos++) {
            const uint64_t key = from[pos];
            to[cursor[(key >> shift) & 0x3ff]++] = key;
        }
        swap = from;
        from = to;
        to = swap;
    }
} // End Radix_Sort_R64_40().
//=============================================================================
//根据rcgldr提交的代码,2020年2月8日
//优化为使用寄存器并在40个MSB上排序,忽略24个LSB
无效基数排序(uint64*pData、uint64*pTemp、大小计数、,
EV_时间街*tsa)
{
size_t mIndex[4][1024]={0};/*索引矩阵*/
矩阵行的大小\u t*pmIndex;/*ptr*/
尺寸i,j,m,n;
uint64_t u;
国际单项体育联合会(tsa)
// #############################################################################
// From: http://ideone.com/JHI0d9
// RadixSort--- for 48 MSB of uint64
/*
 * Six 256-entry counter tables, one per key byte, overlaid with a single
 * flat array so all of them can be addressed (or cleared) as one block.
 */
typedef union {
/* Named view: one table per byte (anonymous struct member). */
struct {
uint32_t c6[256];
uint32_t c5[256];
uint32_t c4[256];
uint32_t c3[256];
uint32_t c2[256];
uint32_t c1[256];
};
/* Flat view over the same 6 * 256 counters. */
uint32_t counts[256 * 6];
} rscounts6_t;
// #############################################################################
// Patterned off of Radix_Sort_64 but looks only at the 48 MostSigBits
// 0XFFFF-FFFF-FFFF-0000 << Ignore the zeros, sort on 3 MostSigBytes
// Made for RGB48 stuffed into uint64 with 2 LeastSig bytes zero
// Get rid of the 7 and 8 level comps
/*
 * Sort an array of uint64_t in place by its 48 most significant bits
 * (bytes 2..7); the two least significant bytes are ignored. The sort is a
 * stable byte-wise LSD radix sort, so elements whose upper 48 bits are
 * equal keep their input order.
 *
 * arrayA - elements to sort (sorted in place).
 * asize  - number of elements.
 *
 * Returns arrayA on success (also for an empty or NULL input, which is left
 * untouched). Returns NULL, with arrayA unmodified, when the temporary
 * buffer cannot be allocated.
 */
uint64_t *radix_sort_48_msb(uint64_t *arrayA, uint32_t asize)
{
    enum { NUM_PASSES = 6, NUM_BUCKETS = 256, FIRST_SHIFT = 16 };
    uint32_t offsets[NUM_PASSES][NUM_BUCKETS] = { { 0 } };
    uint64_t *src = arrayA;
    uint64_t *dst;
    uint64_t *scratch;
    uint32_t x;
    int pass;

    if (arrayA == NULL || asize == 0)
        return arrayA;

    /* Guard the size computation on targets where size_t is narrow. */
    if (asize > SIZE_MAX / sizeof *scratch)
        return NULL;
    scratch = malloc(asize * sizeof *scratch);
    if (scratch == NULL)
        return NULL;

    /* Histogram all six bytes in one sweep over the input. */
    for (x = 0; x < asize; x++) {
        const uint64_t key = arrayA[x];
        for (pass = 0; pass < NUM_PASSES; pass++)
            offsets[pass][(key >> (FIRST_SHIFT + 8 * pass)) & 0xff]++;
    }

    /* Convert counts to starting offsets (exclusive prefix sum). */
    for (pass = 0; pass < NUM_PASSES; pass++) {
        uint32_t running = 0;
        int bucket;
        for (bucket = 0; bucket < NUM_BUCKETS; bucket++) {
            const uint32_t occurrences = offsets[pass][bucket];
            offsets[pass][bucket] = running;
            running += occurrences;
        }
    }

    /*
     * Six stable scatter passes, least significant byte first. The buffers
     * swap roles after every pass; an even pass count leaves the result in
     * arrayA.
     */
    dst = scratch;
    for (pass = 0; pass < NUM_PASSES; pass++) {
        uint32_t *next = offsets[pass];
        const int shift = FIRST_SHIFT + 8 * pass;
        uint64_t *swap;

        for (x = 0; x < asize; x++) {
            const uint64_t key = src[x];
            dst[next[(key >> shift) & 0xff]++] = key;
        }
        swap = src;
        src = dst;
        dst = swap;
    }

    free(scratch);
    return arrayA;
} // End radix_sort_48_msb().
Time= 6.108 sec = 33.668%, QSORT_UINT64_ARRAY , hits=1
Time= 3.060 sec = 16.866%, RADIX_SORT_UINT64_REG, hits=4, 0.765 sec each
Time= 2.947 sec = 16.241%, RADIX_SORT_UINT64_40R, hits=4, 0.737 sec each < SLOW
Time= 2.354 sec = 12.973%, RADIX_SORT_UINT64_48R, hits=4, 0.588 sec each
Time= 1.542 sec = 8.498%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each
Time= 0.769 sec = 4.236%, RADIX_SORT_64 , hits=1
//=============================================================================
// From code submitted by rcgldr, Feb 8 2020
// Optimized to use Registers and to sort on 40 MSBs, ignoring 24 LSBs
/*
 * LSD radix sort of uint64_t keys on their 40 most significant bits, taken
 * as four 10-bit digits (bits 24..33, 34..43, 44..53, 54..63). The low 24
 * bits never take part in the ordering.
 *
 * pData  - keys to sort; holds the sorted result on return (four passes,
 *          so the data ends up back where it started).
 * pTemp  - scratch buffer with room for at least `count` elements.
 * count  - number of elements.
 * tsa    - when non-NULL it is handed to time_event() before sorting.
 */
void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
                       EV_TIME_STR *tsa)
{
    size_t bins[4][1024] = { 0 };   /* row 0 = most significant digit */
    uint64_t *from = pData;
    uint64_t *to = pTemp;
    size_t pos;
    int row;

    if(tsa) time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);

    /* Count every digit of every key in a single sweep. */
    for (pos = 0; pos < count; pos++) {
        const uint64_t key = pData[pos];
        bins[0][(key >> 54) & 0x3ff]++;
        bins[1][(key >> 44) & 0x3ff]++;
        bins[2][(key >> 34) & 0x3ff]++;
        bins[3][(key >> 24) & 0x3ff]++;
    }

    /* Turn each histogram into starting offsets (exclusive prefix sum). */
    for (row = 0; row < 4; row++) {
        size_t running = 0;
        size_t digit;
        for (digit = 0; digit < 1024; digit++) {
            const size_t occurrences = bins[row][digit];
            bins[row][digit] = running;
            running += occurrences;
        }
    }

    /* Scatter, least significant digit first, ping-ponging the buffers. */
    for (row = 3; row >= 0; row--) {
        size_t *cursor = bins[row];
        const unsigned shift = 24u + 10u * (unsigned)(3 - row);
        uint64_t *swap;

        for (pos = 0; pos < count; pos++) {
            const uint64_t key = from[pos];
            to[cursor[(key >> shift) & 0x3ff]++] = key;
        }
        swap = from;
        from = to;
        to = swap;
    }
} // End Radix_Sort_R64_40().
Unique lines from "~/tmp/radix.sort.32.c":
02) void radix_sort_r64_32(uint64_t *pData, uint64_t *pTemp, size_t count,
05) size_t mIndex[4][256] = { 0 }; /* index matrix */
09) if(tsa) time_event(E_RADIX_SORT_UINT64_32R, tsa, E_TIME_EVENT, 1, 0);
13) mIndex[3][(u >> 32) & 0xff]++; // B4
14) mIndex[2][(u >> 40) & 0xff]++; // B5
15) mIndex[1][(u >> 48) & 0xff]++; // B6
16) mIndex[0][(u >> 56) & 0xff]++; // B7
22) for (i = 0; i < 256; i++) {
31) pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
35) pData[mIndex[2][(u >> 40) & 0xff]++] = u;
39) pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
43) pData[mIndex[0][(u >> 56) & 0xff]++] = u;
Unique lines from "~/tmp/radix.sort.40.c":
01) void radix_sort_r64_40(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[4][1024] = { 0 }; /* index matrix */
08) if(tsa) time_event(E_RADIX_SORT_UINT64_40R, tsa, E_TIME_EVENT, 1, 0);
12) mIndex[3][(u >> 24) & 0x3ff]++;
13) mIndex[2][(u >> 34) & 0x3ff]++;
14) mIndex[1][(u >> 44) & 0x3ff]++;
15) mIndex[0][(u >> 54) & 0x3ff]++;
21) for (i = 0; i < 1024; i++) {
30) pTemp[mIndex[3][(u >> 24) & 0x3ff]++] = u;
34) pData[mIndex[2][(u >> 34) & 0x3ff]++] = u;
38) pTemp[mIndex[1][(u >> 44) & 0x3ff]++] = u;
42) pData[mIndex[0][(u >> 54) & 0x3ff]++] = u;
/*
 * Stable LSD radix sort of uint64_t keys on their 38 most significant bits
 * (bits 26..63), split into digits of 9, 9, 10 and 10 bits. The low 26 bits
 * are ignored, so keys that agree in the upper 38 bits keep their input
 * order.
 *
 * a     - keys to sort; holds the sorted result on return.
 * b     - scratch buffer with room for at least `count` elements.
 * count - number of elements.
 *
 * All indices and offsets are size_t so that arrays with more than
 * UINT32_MAX elements are handled; a 32-bit loop counter compared against
 * a size_t count would never terminate for such inputs.
 */
void RadixSort(uint64_t *a, uint64_t *b, size_t count)
{
    size_t idx54[1024] = { 0 };     /* digit at bits 54..63 */
    size_t idx44[1024] = { 0 };     /* digit at bits 44..53 */
    size_t idx35[512] = { 0 };      /* digit at bits 35..43 */
    size_t idx26[512] = { 0 };      /* digit at bits 26..34 */
    size_t i, m, n54, n44, n35, n26;
    uint64_t u;

    for (i = 0; i < count; i++) {           /* generate histograms */
        u = a[i];
        idx26[(u >> 26) & 0x1ff]++;
        idx35[(u >> 35) & 0x1ff]++;
        idx44[(u >> 44) & 0x3ff]++;
        idx54[(u >> 54) & 0x3ff]++;
    }

    /* convert counts to starting offsets (exclusive prefix sums) */
    n54 = n44 = 0;
    for (i = 0; i < 1024; i++) {
        m = idx54[i]; idx54[i] = n54; n54 += m;
        m = idx44[i]; idx44[i] = n44; n44 += m;
    }
    n35 = n26 = 0;
    for (i = 0; i < 512; i++) {
        m = idx35[i]; idx35[i] = n35; n35 += m;
        m = idx26[i]; idx26[i] = n26; n26 += m;
    }

    /* four stable scatter passes: a -> b -> a -> b -> a */
    for (i = 0; i < count; i++) {
        u = a[i];
        b[idx26[(u >> 26) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = b[i];
        a[idx35[(u >> 35) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = a[i];
        b[idx44[(u >> 44) & 0x3ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = b[i];
        a[idx54[(u >> 54) & 0x3ff]++] = u;
    }
}
void RadixSort3(uint64_t *, uint64_t *, size_t);

/*
 * Sort uint64_t keys on their 38 most significant bits (bits 26..63).
 *
 * The array is first split into 1024 bins by the top 10 bits (a -> b); each
 * non-empty bin is then sorted on bits 26..53 by RadixSort3(), which writes
 * its result back into the matching range of `a`.
 *
 * a     - keys to sort; holds the sorted result on return.
 * b     - scratch buffer with room for at least `count` elements.
 * count - number of elements.
 *
 * Indices are size_t so inputs larger than UINT32_MAX elements work, and
 * empty bins are skipped instead of paying for a RadixSort3() call each.
 */
void RadixSort(uint64_t *a, uint64_t *b, size_t count)
{
    size_t binEnd[1024] = { 0 };    /* counts, then cursors, then bin ends */
    size_t i, start, total;

    for (i = 0; i < count; i++)             /* generate histogram */
        binEnd[a[i] >> 54]++;

    total = 0;                              /* convert to start offsets */
    for (i = 0; i < 1024; i++) {
        const size_t occurrences = binEnd[i];
        binEnd[i] = total;
        total += occurrences;
    }

    for (i = 0; i < count; i++)             /* sort by ms 10 bits */
        b[binEnd[a[i] >> 54]++] = a[i];

    /* After the scatter each cursor sits at the end of its own bin, which
     * is the start of the next one. */
    start = 0;
    for (i = 0; i < 1024; i++) {            /* radix sort the 1024 bins */
        const size_t end = binEnd[i];
        if (end != start)
            RadixSort3(&b[start], &a[start], end - start);
        start = end;
    }
}
/*
 * Stable LSD radix sort of uint64_t keys on bits 26..53 only, using digits
 * of 9, 9 and 10 bits. Bits 0..25 and 54..63 do not affect the ordering.
 *
 * a     - input keys; overwritten with an intermediate pass result.
 * b     - receives the sorted output (three passes: a -> b -> a -> b).
 * count - number of elements in both buffers.
 *
 * Indices and offsets are size_t so that `count` values above UINT32_MAX
 * are handled; a 32-bit loop counter would never reach such a count.
 */
void RadixSort3(uint64_t *a, uint64_t *b, size_t count)
{
    size_t idx44[1024] = { 0 };     /* digit at bits 44..53 */
    size_t idx35[512] = { 0 };      /* digit at bits 35..43 */
    size_t idx26[512] = { 0 };      /* digit at bits 26..34 */
    size_t i, m, n;
    uint64_t u;

    for (i = 0; i < count; i++) {           /* generate histograms */
        u = a[i];
        idx26[(u >> 26) & 0x1ff]++;
        idx35[(u >> 35) & 0x1ff]++;
        idx44[(u >> 44) & 0x3ff]++;
    }

    /* convert counts to starting offsets (exclusive prefix sums) */
    n = 0;
    for (i = 0; i < 1024; i++) {
        m = idx44[i];
        idx44[i] = n;
        n += m;
    }
    n = 0;
    for (i = 0; i < 512; i++) {
        m = idx35[i];
        idx35[i] = n;
        n += m;
    }
    n = 0;
    for (i = 0; i < 512; i++) {
        m = idx26[i];
        idx26[i] = n;
        n += m;
    }

    for (i = 0; i < count; i++) {           /* radix sort */
        u = a[i];
        b[idx26[(u >> 26) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = b[i];
        a[idx35[(u >> 35) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = a[i];
        b[idx44[(u >> 44) & 0x3ff]++] = u;
    }
}
Time= 6.104 sec = 30.673%, QSORT_UINT64_ARRAY , hits=1
Time= 3.117 sec = 15.663%, RADIX_SORT_UINT64_REG, hits=4, 0.779 sec each
Time= 2.931 sec = 14.731%, RADIX_SORT_UINT64_40R, hits=4, 0.733 sec each
Time= 2.269 sec = 11.401%, RADIX_SORT_UINT64_48R, hits=4, 0.567 sec each
Time= 1.663 sec = 8.359%, RADIX_SORT_UINT64_36R, hits=4, 0.416 sec each < FAST
Time= 1.516 sec = 7.620%, RADIX_SORT_UINT64_32R, hits=4, 0.379 sec each
Time= 0.734 sec = 3.689%, RADIX_SORT_64 , hits=1
/*
 * LSD radix sort of uint64_t keys on their 36 most significant bits, taken
 * as six 6-bit digits (bits 28..33 up to 58..63). The low 28 bits are not
 * looked at.
 *
 * pData  - keys to sort; holds the sorted result on return (six passes).
 * pTemp  - scratch buffer with room for at least `count` elements.
 * count  - number of elements.
 * tsa    - when non-NULL it is handed to time_event() before sorting.
 */
void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
                       EV_TIME_STR *tsa)
{
    /* Bit position of the digit handled by each table row; row 0 is the
     * most significant digit. */
    static const unsigned digit_shift[6] = { 58, 52, 46, 40, 34, 28 };
    size_t table[6][64] = { 0 };
    uint64_t *reader = pData;
    uint64_t *writer = pTemp;
    size_t k;
    int row;

    if(tsa) time_event(E_RADIX_SORT_UINT64_36R, tsa, E_TIME_EVENT, 1, 0);

    /* Build all six histograms while reading the input once. */
    for (k = 0; k < count; k++) {
        const uint64_t key = pData[k];
        for (row = 0; row < 6; row++)
            table[row][(key >> digit_shift[row]) & 0x3F]++;
    }

    /* Replace each count by the index where that digit's run begins. */
    for (row = 0; row < 6; row++) {
        size_t begin = 0;
        size_t d;
        for (d = 0; d < 64; d++) {
            const size_t run = table[row][d];
            table[row][d] = begin;
            begin += run;
        }
    }

    /* Distribute from the least significant digit (row 5) upward,
     * exchanging source and destination after every pass. */
    for (row = 5; row >= 0; row--) {
        size_t *next = table[row];
        const unsigned shift = digit_shift[row];
        uint64_t *tmp;

        for (k = 0; k < count; k++) {
            const uint64_t key = reader[k];
            writer[next[(key >> shift) & 0x3F]++] = key;
        }
        tmp = reader;
        reader = writer;
        writer = tmp;
    }
} // End Radix_Sort_R64_36().
Unique lines from "/home/brianp/tmp/radix.sort.36.c":
01) void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][64] = { 0 }; /* index matrix */
08) if(tsa) time_event(E_RADIX_SORT_UINT64_36R, tsa, E_TIME_EVENT, 1, 0);
11) mIndex[5][(u >> 28) & 0x3F]++; // N2
12) mIndex[4][(u >> 34) & 0x3F]++; // N3
13) mIndex[3][(u >> 40) & 0x3F]++; // N4
14) mIndex[2][(u >> 46) & 0x3F]++; // N5
15) mIndex[1][(u >> 52) & 0x3F]++; // N6
16) mIndex[0][(u >> 58) & 0x3F]++; // N7
22) for (i = 0; i < 64; i++) {
31) pTemp[mIndex[5][(u >> 28) & 0x3F]++] = u;
35) pData[mIndex[4][(u >> 34) & 0x3F]++] = u;
39) pTemp[mIndex[3][(u >> 40) & 0x3F]++] = u;
43) pData[mIndex[2][(u >> 46) & 0x3F]++] = u;
47) pTemp[mIndex[1][(u >> 52) & 0x3F]++] = u;
51) pData[mIndex[0][(u >> 58) & 0x3F]++] = u;
Unique lines from "/home/brianp/tmp/radix.sort.48.c":
01) void radix_sort_r64_48(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][256] = { 0 }; /* index matrix */
08) if(tsa) time_event(E_RADIX_SORT_UINT64_48R, tsa, E_TIME_EVENT, 1, 0);
14) mIndex[5][(u >> 16) & 0xff]++; // B2
15) mIndex[4][(u >> 24) & 0xff]++; // B3
16) mIndex[3][(u >> 32) & 0xff]++; // B4
17) mIndex[2][(u >> 40) & 0xff]++; // B5
18) mIndex[1][(u >> 48) & 0xff]++; // B6
19) mIndex[0][(u >> 56) & 0xff]++; // B7
25) for (i = 0; i < 256; i++) {
34) pTemp[mIndex[5][(u >> 16) & 0xff]++] = u;
38) pData[mIndex[4][(u >> 24) & 0xff]++] = u;
42) pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
46) pData[mIndex[2][(u >> 40) & 0xff]++] = u;
50) pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
54) pData[mIndex[0][(u >> 56) & 0xff]++] = u;
Time= 6.334 sec = 25.435%, QSORT_UINT64_ARRAY , hits=1
Time= 3.519 sec = 14.131%, RADIX_SORT_UINT64_REG, hits=4, 0.880 sec each
Time= 3.273 sec = 13.145%, RADIX_SORT_UINT64_40R, hits=4, 0.818 sec each < anomaly
Time= 2.680 sec = 10.764%, RADIX_SORT_UINT64_48R, hits=4, 0.670 sec each
Time= 2.302 sec = 9.246%, RADIX_SORT_UINT64_42R, hits=4, 0.576 sec each < NEW
Time= 2.025 sec = 8.132%, RADIX_SORT_UINT64_36R, hits=4, 0.506 sec each
Time= 1.767 sec = 7.094%, RADIX_SORT_UINT64_32R, hits=4, 0.442 sec each
Time= 0.955 sec = 3.835%, RADIX_SORT_64 , hits=1
/*
 * LSD radix sort of uint64_t keys on their 42 most significant bits, taken
 * as six 7-bit digits (bits 22..28 up to 57..63). The low 22 bits are not
 * looked at.
 *
 * pData  - keys to sort; holds the sorted result on return (six passes).
 * pTemp  - scratch buffer with room for at least `count` elements.
 * count  - number of elements.
 * tsa    - when non-NULL it is handed to time_event() before sorting.
 */
void radix_sort_r64_42(uint64_t *pData, uint64_t *pTemp, size_t count,
                       EV_TIME_STR *tsa)
{
    size_t starts[6][128] = { 0 };          /* row 0 = most significant */
    uint64_t *buffers[2];
    size_t idx;
    int step;

    buffers[0] = pData;
    buffers[1] = pTemp;

    if(tsa) time_event(E_RADIX_SORT_UINT64_42R, tsa, E_TIME_EVENT, 1, 0);

    /* Digit boundaries: 64 -- 57 50 43 36 29 22, seven bits each. */
    for (idx = 0; idx < count; idx++) {
        const uint64_t key = pData[idx];
        starts[0][(key >> 57) & 0x7F]++;
        starts[1][(key >> 50) & 0x7F]++;
        starts[2][(key >> 43) & 0x7F]++;
        starts[3][(key >> 36) & 0x7F]++;
        starts[4][(key >> 29) & 0x7F]++;
        starts[5][(key >> 22) & 0x7F]++;
    }

    /* Counts become first-free positions (exclusive prefix sum). */
    for (step = 0; step < 6; step++) {
        size_t *row = starts[step];
        size_t filled = 0;
        size_t d;
        for (d = 0; d < 128; d++) {
            const size_t here = row[d];
            row[d] = filled;
            filled += here;
        }
    }

    /* Step s handles the digit at bit 22 + 7*s (table row 5 - s); even
     * steps read pData and write pTemp, odd steps go the other way. */
    for (step = 0; step < 6; step++) {
        const uint64_t *in = buffers[step & 1];
        uint64_t *out = buffers[(step & 1) ^ 1];
        size_t *row = starts[5 - step];
        const unsigned shift = 22u + 7u * (unsigned)step;

        for (idx = 0; idx < count; idx++) {
            const uint64_t key = in[idx];
            out[row[(key >> shift) & 0x7F]++] = key;
        }
    }
} // End Radix_Sort_R64_42().
Unique lines from "~/tmp/radix.sort.36.c":
01) void radix_sort_r64_36(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][64] = { 0 }; /* index matrix */
11) mIndex[5][(u >> 28) & 0x3F]++; // N2
12) mIndex[4][(u >> 34) & 0x3F]++; // N3
13) mIndex[3][(u >> 40) & 0x3F]++; // N4
14) mIndex[2][(u >> 46) & 0x3F]++; // N5
15) mIndex[1][(u >> 52) & 0x3F]++; // N6
16) mIndex[0][(u >> 58) & 0x3F]++; // N7
22) for (i = 0; i < 64; i++) {
31) pTemp[mIndex[5][(u >> 28) & 0x3F]++] = u;
35) pData[mIndex[4][(u >> 34) & 0x3F]++] = u;
39) pTemp[mIndex[3][(u >> 40) & 0x3F]++] = u;
43) pData[mIndex[2][(u >> 46) & 0x3F]++] = u;
47) pTemp[mIndex[1][(u >> 52) & 0x3F]++] = u;
51) pData[mIndex[0][(u >> 58) & 0x3F]++] = u;
19 Unique lines from "~/tmp/radix.sort.42.c":
01) void radix_sort_r64_42(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][128] = { 0 }; /* index matrix */
10) // 64 -- 56 48 40 32 24 16 -- 8 bits each
11) // 64 -- 57 50 43 36 29 22 -- 7 bits each
12) // 64 -- 58 52 46 40 34 28 -- 6 bits each
15) mIndex[5][(u >> 22) & 0x7F]++; // N2
16) mIndex[4][(u >> 29) & 0x7F]++; // N3
17) mIndex[3][(u >> 36) & 0x7F]++; // N4
18) mIndex[2][(u >> 43) & 0x7F]++; // N5
19) mIndex[1][(u >> 50) & 0x7F]++; // N6
20) mIndex[0][(u >> 57) & 0x7F]++; // N7
26) for (i = 0; i < 128; i++) {
35) pTemp[mIndex[5][(u >> 22) & 0x7F]++] = u;
39) pData[mIndex[4][(u >> 29) & 0x7F]++] = u;
43) pTemp[mIndex[3][(u >> 36) & 0x7F]++] = u;
47) pData[mIndex[2][(u >> 43) & 0x7F]++] = u;
51) pTemp[mIndex[1][(u >> 50) & 0x7F]++] = u;
55) pData[mIndex[0][(u >> 57) & 0x7F]++] = u;
Time= 6.208 sec = 21.838%, QSORT_UINT64_ARRAY , hits=1
Time= 3.358 sec = 11.813%, RADIX_SORT_UINT64_REG, hits=4, 0.840 sec each
Time= 2.525 sec = 8.884%, RADIX_SORT_UI64_AA99 , hits=4, 0.631 sec each <NEW
Time= 2.509 sec = 8.825%, RADIX_SORT_UINT64_48R, hits=4, 0.627 sec each
Time= 2.461 sec = 8.658%, RADIX_SORT_UI64_1024 , hits=4, 0.615 sec each <NEW
Time= 2.223 sec = 7.822%, RADIX_SORT_UINT64_42R, hits=4, 0.556 sec each
Time= 2.215 sec = 7.791%, RADIX_SORT_UI64_40_85, hits=4, 0.554 sec each
Time= 1.930 sec = 6.788%, RADIX_SORT_UINT64_36R, hits=4, 0.482 sec each
Time= 1.710 sec = 6.014%, RADIX_SORT_UINT64_32R, hits=4, 0.427 sec each
Time= 0.915 sec = 3.220%, COMP_UINT64_ARRAYS , hits=32, 0.029 sec each
Time= 6.156 sec = 23.199%, QSORT_UINT64_ARRAY , hits=1
Time= 2.993 sec = 11.277%, RADIX_SORT_UINT64_REG, hits=4, 0.748 sec each
Time= 2.409 sec = 9.077%, RADIX_SORT_UI64_AA99 , hits=4, 0.602 sec each < NEW
Time= 2.330 sec = 8.778%, RADIX_SORT_UI64_1024 , hits=4, 0.582 sec each < NEW
Time= 2.241 sec = 8.443%, RADIX_SORT_UINT64_48R, hits=4, 0.560 sec each
Time= 2.124 sec = 8.002%, RADIX_SORT_UI64_40_85, hits=4, 0.531 sec each
Time= 1.982 sec = 7.468%, RADIX_SORT_UINT64_42R, hits=4, 0.495 sec each
Time= 1.725 sec = 6.499%, RADIX_SORT_UINT64_36R, hits=4, 0.431 sec each
Time= 1.507 sec = 5.677%, RADIX_SORT_UINT64_32R, hits=4, 0.377 sec each
Time= 0.889 sec = 3.348%, COMP_UINT64_ARRAYS , hits=32, 0.028 sec each
gcc -Ofast -ffast-math -m64 -march=native -funroll-loops -fopenmp -flto -finline-functions -Wuninitialized ~/bin/pb.c -lm -o ~/bin/pb_a
// =============================================================================
// Sort with bin bits 10, 10, 9,9
// From code by rcgldr StackOverflow Feb 8, 2020
/*
 * Stable LSD radix sort of uint64_t keys on their 38 most significant bits
 * (bits 26..63), split into digits of 9, 9, 10 and 10 bits. The low 26 bits
 * are ignored.
 *
 * a     - keys to sort; holds the sorted result on return.
 * b     - scratch buffer with room for at least `count` elements.
 * count - number of elements.
 * tsa   - when non-NULL it is handed to time_event() before sorting.
 *
 * All indices and offsets are size_t so that arrays with more than
 * UINT32_MAX elements are handled; a 32-bit loop counter compared against
 * a size_t count would never terminate for such inputs.
 */
void RadixSort_aa99(uint64_t *a, uint64_t *b, size_t count, EV_TIME_STR *tsa)
{
    size_t idx54[1024] = { 0 };     /* digit at bits 54..63 */
    size_t idx44[1024] = { 0 };     /* digit at bits 44..53 */
    size_t idx35[512] = { 0 };      /* digit at bits 35..43 */
    size_t idx26[512] = { 0 };      /* digit at bits 26..34 */
    size_t i, m, n54, n44, n35, n26;
    uint64_t u;

    if(tsa) time_event(E_RADIX_SORT_UI64_AA99, tsa, E_TIME_EVENT, 1, 0);

    for (i = 0; i < count; i++) {           /* generate histograms */
        u = a[i];
        idx26[(u >> 26) & 0x1ff]++;
        idx35[(u >> 35) & 0x1ff]++;
        idx44[(u >> 44) & 0x3ff]++;
        idx54[(u >> 54) & 0x3ff]++;
    }

    /* convert counts to starting offsets (exclusive prefix sums) */
    n54 = n44 = 0;
    for (i = 0; i < 1024; i++) {
        m = idx54[i]; idx54[i] = n54; n54 += m;
        m = idx44[i]; idx44[i] = n44; n44 += m;
    }
    n35 = n26 = 0;
    for (i = 0; i < 512; i++) {
        m = idx35[i]; idx35[i] = n35; n35 += m;
        m = idx26[i]; idx26[i] = n26; n26 += m;
    }

    /* four stable scatter passes: a -> b -> a -> b -> a */
    for (i = 0; i < count; i++) {
        u = a[i];
        b[idx26[(u >> 26) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = b[i];
        a[idx35[(u >> 35) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = a[i];
        b[idx44[(u >> 44) & 0x3ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = b[i];
        a[idx54[(u >> 54) & 0x3ff]++] = u;
    }
}
// =============================================================================
/* split array into 1024 bins according to most significant 10 bits */
/* Per-bin sorter used below; declared here so the call has a prototype. */
void RadixSort3(uint64_t *, uint64_t *, size_t);

/*
 * Sort uint64_t keys on their 38 most significant bits (bits 26..63).
 *
 * The array is first split into 1024 bins by the top 10 bits (a -> b); each
 * non-empty bin is then sorted on bits 26..53 by RadixSort3(), which writes
 * its result back into the matching range of `a`.
 *
 * a     - keys to sort; holds the sorted result on return.
 * b     - scratch buffer with room for at least `count` elements.
 * count - number of elements.
 * tsa   - when non-NULL it is handed to time_event() before sorting.
 *
 * Indices are size_t so inputs larger than UINT32_MAX elements work, and
 * empty bins are skipped instead of paying for a RadixSort3() call each.
 */
void RadixSort_1024(uint64_t *a, uint64_t *b, size_t count, EV_TIME_STR *tsa)
{
    size_t binEnd[1024] = { 0 };    /* counts, then cursors, then bin ends */
    size_t i, start, total;

    if(tsa) time_event(E_RADIX_SORT_UI64_1024, tsa, E_TIME_EVENT, 1, 0);

    for (i = 0; i < count; i++)             /* generate histogram */
        binEnd[a[i] >> 54]++;

    total = 0;                              /* convert to start offsets */
    for (i = 0; i < 1024; i++) {
        const size_t occurrences = binEnd[i];
        binEnd[i] = total;
        total += occurrences;
    }

    for (i = 0; i < count; i++)             /* sort by ms 10 bits */
        b[binEnd[a[i] >> 54]++] = a[i];

    /* After the scatter each cursor sits at the end of its own bin, which
     * is the start of the next one. */
    start = 0;
    for (i = 0; i < 1024; i++) {            /* radix sort the 1024 bins */
        const size_t end = binEnd[i];
        if (end != start)
            RadixSort3(&b[start], &a[start], end - start);
        start = end;
    }
}
/*
 * Stable LSD radix sort of uint64_t keys on bits 26..53 only, using digits
 * of 9, 9 and 10 bits. Bits 0..25 and 54..63 do not affect the ordering.
 *
 * a     - input keys; overwritten with an intermediate pass result.
 * b     - receives the sorted output (three passes: a -> b -> a -> b).
 * count - number of elements in both buffers.
 *
 * Indices and offsets are size_t so that `count` values above UINT32_MAX
 * are handled; a 32-bit loop counter would never reach such a count.
 */
void RadixSort3(uint64_t *a, uint64_t *b, size_t count)
{
    size_t idx44[1024] = { 0 };     /* digit at bits 44..53 */
    size_t idx35[512] = { 0 };      /* digit at bits 35..43 */
    size_t idx26[512] = { 0 };      /* digit at bits 26..34 */
    size_t i, m, n;
    uint64_t u;

    for (i = 0; i < count; i++) {           /* generate histograms */
        u = a[i];
        idx26[(u >> 26) & 0x1ff]++;
        idx35[(u >> 35) & 0x1ff]++;
        idx44[(u >> 44) & 0x3ff]++;
    }

    /* convert counts to starting offsets (exclusive prefix sums) */
    n = 0;
    for (i = 0; i < 1024; i++) {
        m = idx44[i];
        idx44[i] = n;
        n += m;
    }
    n = 0;
    for (i = 0; i < 512; i++) {
        m = idx35[i];
        idx35[i] = n;
        n += m;
    }
    n = 0;
    for (i = 0; i < 512; i++) {
        m = idx26[i];
        idx26[i] = n;
        n += m;
    }

    for (i = 0; i < count; i++) {           /* radix sort */
        u = a[i];
        b[idx26[(u >> 26) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = b[i];
        a[idx35[(u >> 35) & 0x1ff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = a[i];
        b[idx44[(u >> 44) & 0x3ff]++] = u;
    }
}