对uint64进行排序的基数排序C代码是否仅查看32 MSB位?

对uint64进行排序的基数排序C代码是否仅查看32 MSB位?,c,performance,sorting,C,Performance,Sorting,我使用路易斯·里奇(Louis Ricci)提供的uint64_t基数排序(于2015年8月24日18:00回答)。惊人的快 我有一个包含2个uint32_t项的数据结构,我想对一个大数组(2000多万)进行排序,只查看第一个或最后一个32位,但我希望排序例程将整个64位包作为一个单元移动 是否有一种C语言uint64基数排序,它基于整个64位量子的子集进行排序,就像数据被0x1111100000000屏蔽一样?示例C代码。它使用的局部变量比原始文章中链接的示例少,允许C编译器为这些变量使用寄存

我使用路易斯·里奇(Louis Ricci)提供的uint64_t基数排序(于2015年8月24日18:00回答)。惊人的快

我有一个包含2个uint32_t项的数据结构,我想对一个大数组(2000多万)进行排序,只查看第一个或最后一个32位,但我希望排序例程将整个64位包作为一个单元移动


是否有一种C语言的uint64基数排序,只根据整个64位数据中的一部分位进行排序,就像数据先被0xFFFFFFFF00000000掩码屏蔽过一样?

示例C代码。它使用的局部变量比原始文章中链接的示例少,允许C编译器为这些变量使用寄存器。在我的系统(英特尔3770K 3.5 GHz CPU,Windows 7 Pro 64位)上,此程序用不到0.5秒的时间将2000万(20*1024*1024=20971520)个64位无符号整数按高32位排序。

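/* 通过64位无符号整数的高32位进行基数排序 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
typedef unsigned long long uint64_t;
void RadixSort(uint64_t * pData, uint64_t * pTemp, size_t count)
{
    size_t mIndex[4][256] = { 0 };      /* 索引矩阵 */
    size_t * pmIndex;                   /* 指向矩阵某一行的指针 */
    size_t i, j, m, n;
    uint64_t u;
    for (i = 0; i < count; i++) {       /* 生成直方图 */
        u = pData[i];
        mIndex[3][(u >> 32) & 0xff]++;
        mIndex[2][(u >> 40) & 0xff]++;
        mIndex[1][(u >> 48) & 0xff]++;
        mIndex[0][(u >> 56) & 0xff]++;
    }
    for (j = 0; j < 4; j++) {           /* 转换为索引 */
        pmIndex = mIndex[j];
        n = 0;
        for (i = 0; i < 256; i++) {
            m = pmIndex[i];
            pmIndex[i] = n;
            n += m;
        }
    }
    for (i = 0; i < count; i++) {       /* 基数排序 */
        u = pData[i];
        pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[2][(u >> 40) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pData[i];
        pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[0][(u >> 56) & 0xff]++] = u;
    }
}
#define COUNT (20*1024*1024)            /* 元素数 */
static clock_t dwTimeStart;             /* 时钟值 */
static clock_t dwTimeStop;
int main( )
{
uint64_t * pData;
uint64_t * pTemp;
uint64_t r;
size_t i;
    /* 分配内存 */
    pData  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pData == NULL){
        return 0;
    }
    pTemp  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pTemp == NULL){
        free(pData);
        return 0;
    }
    for(i = 0; i < COUNT; i++){         /* 生成测试数据 */
        r  = (((uint64_t)((rand()>>4) & 0xff))<< 0);
        r |= (((uint64_t)((rand()>>4) & 0xff))<< 8);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<16);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<24);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<32);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<40);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<48);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<56);
        pData[i] = r;
    }
    dwTimeStart = clock();
    RadixSort(pData, pTemp, COUNT);     /* 对数组排序 */
    dwTimeStop = clock();
    printf("Number of ticks %d\n", dwTimeStop-dwTimeStart);
    for(i = 1; i < COUNT; i++){         /* 检查排序结果 */
        if((pData[i-1]>>32) > (pData[i]>>32)){
            break;
        }
    }
    if(i != COUNT)
        printf("sort error\n");
    free(pData);
    return(0);
}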

示例C代码。它使用的局部变量比原始文章中链接的示例少,允许C编译器为这些变量使用寄存器。在我的系统(英特尔3770K 3.5 GHz CPU,Windows 7 Pro 64位)上,此程序用不到0.5秒的时间将2000万(20*1024*1024=20971520)个64位无符号整数按高32位排序。

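/* 通过64位无符号整数的高32位进行基数排序 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
typedef unsigned long long uint64_t;
void RadixSort(uint64_t * pData, uint64_t * pTemp, size_t count)
{
    size_t mIndex[4][256] = { 0 };      /* 索引矩阵 */
    size_t * pmIndex;                   /* 指向矩阵某一行的指针 */
    size_t i, j, m, n;
    uint64_t u;
    for (i = 0; i < count; i++) {       /* 生成直方图 */
        u = pData[i];
        mIndex[3][(u >> 32) & 0xff]++;
        mIndex[2][(u >> 40) & 0xff]++;
        mIndex[1][(u >> 48) & 0xff]++;
        mIndex[0][(u >> 56) & 0xff]++;
    }
    for (j = 0; j < 4; j++) {           /* 转换为索引 */
        pmIndex = mIndex[j];
        n = 0;
        for (i = 0; i < 256; i++) {
            m = pmIndex[i];
            pmIndex[i] = n;
            n += m;
        }
    }
    for (i = 0; i < count; i++) {       /* 基数排序 */
        u = pData[i];
        pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[2][(u >> 40) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pData[i];
        pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
    }
    for (i = 0; i < count; i++) {
        u = pTemp[i];
        pData[mIndex[0][(u >> 56) & 0xff]++] = u;
    }
}
#define COUNT (20*1024*1024)            /* 元素数 */
static clock_t dwTimeStart;             /* 时钟值 */
static clock_t dwTimeStop;
int main( )
{
uint64_t * pData;
uint64_t * pTemp;
uint64_t r;
size_t i;
    /* 分配内存 */
    pData  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pData == NULL){
        return 0;
    }
    pTemp  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pTemp == NULL){
        free(pData);
        return 0;
    }
    for(i = 0; i < COUNT; i++){         /* 生成测试数据 */
        r  = (((uint64_t)((rand()>>4) & 0xff))<< 0);
        r |= (((uint64_t)((rand()>>4) & 0xff))<< 8);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<16);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<24);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<32);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<40);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<48);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<56);
        pData[i] = r;
    }
    dwTimeStart = clock();
    RadixSort(pData, pTemp, COUNT);     /* 对数组排序 */
    dwTimeStop = clock();
    printf("Number of ticks %d\n", dwTimeStop-dwTimeStart);
    for(i = 1; i < COUNT; i++){         /* 检查排序结果 */
        if((pData[i-1]>>32) > (pData[i]>>32)){
            break;
        }
    }
    if(i != COUNT)
        printf("sort error\n");
    free(pData);
    return(0);
}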
// =============================================================================
// 创建2个相同的uint64[]数组,把前32位镜像到后32位,
// 其中一个用qsort排序,另一个用RadixSort11110000()排序,然后比较结果
int test_uint64_32_sort(EV_TIME_STR *tsa)  {
uint64_t *la1, *la2;
int ii, lcount=3615232, zmem=0, debug=0, ucount=1, ri, er12=0, ep=63;
time_t tt;  // This Time!
float rssec=2.0f;  // RadixSo
/*  radix sort via upper 32 bits of 64 bit unsigned integers */

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef unsigned long long uint64_t;

/*
 * Stable LSD radix sort of 64-bit keys, ordered by their upper 32 bits only.
 *
 * pData  - array of `count` keys; holds the sorted result on return.
 * pTemp  - scratch array of at least `count` elements (contents overwritten).
 * count  - number of elements in pData.
 *
 * The lower 32 bits never take part in a comparison; they simply travel with
 * the key, and keys with equal upper halves keep their input order.
 */
void RadixSort(uint64_t * pData, uint64_t * pTemp, size_t count)
{
    size_t bucket[4][256] = { 0 };      /* row 0 = bits 56..63 ... row 3 = bits 32..39 */
    size_t pos, row;
    uint64_t *src = pData;
    uint64_t *dst = pTemp;
    unsigned shift = 32;

    /* A single sweep over the input fills all four byte histograms. */
    for (pos = 0; pos < count; pos++) {
        const uint64_t key = pData[pos];
        bucket[0][(key >> 56) & 0xff]++;
        bucket[1][(key >> 48) & 0xff]++;
        bucket[2][(key >> 40) & 0xff]++;
        bucket[3][(key >> 32) & 0xff]++;
    }

    /* Exclusive prefix sum: each count becomes the first output slot of its bucket. */
    for (row = 0; row < 4; row++) {
        size_t *slot = bucket[row];
        size_t *const end = slot + 256;
        size_t running = 0;
        for (; slot != end; slot++) {
            const size_t occurrences = *slot;
            *slot = running;
            running += occurrences;
        }
    }

    /*
     * Four scatter passes, lowest of the four bytes first (row 3 / shift 32),
     * bouncing between the two buffers.  An even number of passes leaves the
     * final result back in pData.
     */
    for (row = 4; row-- > 0; shift += 8) {
        size_t *next = bucket[row];
        uint64_t *swap;
        for (pos = 0; pos < count; pos++) {
            const uint64_t key = src[pos];
            dst[next[(key >> shift) & 0xff]++] = key;
        }
        swap = src;
        src = dst;
        dst = swap;
    }
}

#define COUNT (20*1024*1024)            /* number of 64-bit elements main() allocates, fills and sorts (20971520) */

static clock_t dwTimeStart;             /* clock() reading taken just before the RadixSort() call */
static clock_t dwTimeStop;              /* clock() reading taken just after the RadixSort() call */

/*
 * Benchmark driver: fills COUNT random 64-bit keys, times one RadixSort()
 * call with clock(), prints the elapsed tick count, then verifies that the
 * upper 32 bits of the result are in non-decreasing order.
 *
 * Always returns 0 (also when an allocation fails or the check fails; a
 * failed check is reported as "sort error" on stdout).
 */
int main( )
{
    uint64_t * pData;
    uint64_t * pTemp;
    uint64_t r;
    size_t i;

    /* allocate memory */
    pData  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pData == NULL){
        return 0;
    }
    pTemp  = (uint64_t *)malloc(COUNT*sizeof(uint64_t));
    if(pTemp == NULL){
        free(pData);
        return 0;
    }

    for(i = 0; i < COUNT; i++){         /* generate test data */
        /* Build each key one byte at a time from bits 4..11 of rand(). */
        r  = (((uint64_t)((rand()>>4) & 0xff))<< 0);
        r |= (((uint64_t)((rand()>>4) & 0xff))<< 8);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<16);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<24);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<32);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<40);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<48);
        r |= (((uint64_t)((rand()>>4) & 0xff))<<56);
        pData[i] = r;
    }

    dwTimeStart = clock();
    RadixSort(pData, pTemp, COUNT);     /* sort array */
    dwTimeStop = clock();
    /* clock_t has no printf conversion of its own; convert explicitly so the
       argument matches the format specifier. */
    printf("Number of ticks %ld\n", (long)(dwTimeStop-dwTimeStart));
    for(i = 1; i < COUNT; i++){         /* check sort */
        if((pData[i-1]>>32) > (pData[i]>>32)){
            break;
        }
    }
    if(i != COUNT)
        printf("sort error\n");
    free(pTemp);                        /* scratch buffer was previously leaked */
    free(pData);
    return(0);
}
// =============================================================================
// Create 2 identical uint64[]s, mirror the first 32 bits in the last, sort one
// with qsort and the other with RadixSort11110000() and compare the results
/*
 * Cross-check of radixSort11110000() against qsort() with
 * comp_uint6411110000() on two identical arrays of lcount random keys.
 *
 * Each key carries rand() in its upper 32 bits and (0xFFFFFFFF - rand value)
 * in its lower 32 bits, so keys whose upper halves are equal are equal as a
 * whole and the two results can be compared element by element.
 *
 * tsa - timing context, passed through unchanged to time_event().
 *
 * Returns the number of positions at which the two sorted arrays differ
 * (0 means the sorts agree).
 */
int test_uint64_32_sort(EV_TIME_STR *tsa)  {
    uint64_t *la1, *la2;
    int ii, lcount=3615232, zmem=0, debug=0, ucount=1, ri, er12=0, ep=63;
    time_t tt;  // Seed source for srand()
    float rssec=2.0f;  // RadixSort_SEConds elapsed time from Radixsort()

    time_event(E_GENERATE_RANDOM_ARRAY, tsa, E_TIME_EVENT, 1, 0);
    srand((unsigned) time(&tt));
    // NOTE(review): the results are used unchecked; presumably alloc_check()
    // handles allocation failure itself -- confirm.
    la1=(uint64_t *)alloc_check((lcount)*8, zmem=0, "LLong_ara_1", debug);
    la2=(uint64_t *)alloc_check((lcount)*8, zmem=0, "LLong_ara_2", debug);

    for(ii=0; ii < lcount; ii++)  {  // Look only SHORT range
        ri = rand();  // Random int
        // Widen BEFORE shifting and parenthesise the shift: the old
        // `ri << 32 + (...)` parsed as `ri << (32 + ...)` and shifted a
        // 32-bit int by 32 or more, which is undefined behaviour.
        la1[ii] = ((uint64_t)ri << 32) | (0XFFFFFFFFu - (uint32_t)ri);  // Reflect val in lower 32
    }

    // Make identical copies in the other []
    time_event(E_MEMCPY_DOUBLE_ARRAY, tsa, E_TIME_EVENT, 1, 0);
    memcpy((void *) la2, (void *) la1, lcount * sizeof(uint64_t));

    time_event(E_QSORT_UINT64_ARRAY, tsa, E_TIME_EVENT, 1, 0);
    qsort(la1, lcount, sizeof(uint64_t), comp_uint6411110000);

    time_event(E_RADIX_SORT64_11110000, tsa, E_TIME_EVENT, 1, 0);
    // NOTE(review): this passes four arguments, but the radixSort11110000()
    // definition visible in this file takes only (array, size). Confirm which
    // prototype is in effect; with the two-argument version rssec and ucount
    // are never updated and the summary line below reports their initial values.
    radixSort11110000(la2, lcount, &rssec, &ucount);

    time_event(E_SPLIT_RGBJ_MEM,   tsa, E_TIME_EVENT, 1, 0);
    for(ii=er12=0; ii < lcount; ii++)  {
        if(la1[ii] != la2[ii])  {
            er12++;
            if((--ep) > 0)  {  // Print only the first few mismatches
                printf("II %d) Er%d, l1=0X%016llX, l2=0X%016llX\n",
                    ii, ep, (unsigned long long)la1[ii],
                    (unsigned long long)la2[ii]);  FF_SO;  }
        }  // Count Error Mismatches
        if(!(ii%100000))  {  // Periodic sample of both arrays
            printf("II %d) l1=0X%016llX, l2=0X%016llX\n",
                ii, (unsigned long long)la1[ii],
                (unsigned long long)la2[ii]);  FF_SO;  }
    }
    printf("T63S: Er1/2 = %d \n", er12);  FF_SO;
    if(ucount)  {
        printf("T63S: RadixSort time = %.3f ms, unique=%d = %.3f%%\n",
            rssec*1.0E3f, ucount, (100.0f * ucount / lcount));  FF_SO;
    }
    free_bb(la1);  free_bb(la2);
    return er12;  // Previously fell off the end of a non-void function
}



// =============================================================================
// Based on original code from https://ideone.com/JHI0d9
// and suggestions from  Ian Abbott
// https://stackoverflow.com/questions/47080353/radix-sort-c-code-to-sort-uint64-t-looking-only-at-32-msb-bits
// Hacked code may be unsuitable for any use and should not be used by anyone
// Sort uint64[] by looking ONLY at the upper 32 bits (mask 0XFFFFFFFF00000000)
/*
 * Stable LSD radix sort of `size` 64-bit values keyed on bits 32..63 only.
 * The lower 32 bits are carried along with each value but never compared.
 *
 * arrayA - values to sort; sorted in place.
 * size   - number of elements in arrayA.
 *
 * Returns arrayA on success.  Returns NULL, leaving arrayA untouched, when
 * the temporary buffer cannot be allocated.  Arrays with fewer than two
 * elements (or a NULL arrayA) are returned immediately, as-is.
 */
uint64_t *radixSort11110000(uint64_t *arrayA, uint32_t size)  {
    /* counts[p] is the 256-bucket table for pass p, which keys on the byte
       at bit offset 32 + 8*p (so pass 0 handles the lowest of the 4 bytes). */
    uint32_t counts[4][256] = {{0}};
    uint64_t *array = arrayA;
    uint64_t *cpy, *src, *dst;
    uint32_t x;
    int pass;

    if (array == NULL || size < 2) {
        return array;  // Nothing to reorder
    }

    cpy = (uint64_t *)malloc((size_t)size * sizeof *cpy);
    if (cpy == NULL) {
        return NULL;  // Out of memory: input has not been modified
    }

    // calculate counts
    for (x = 0; x < size; x++) {
        const uint64_t v = array[x];
        counts[0][(v >> 32) & 0xff]++;
        counts[1][(v >> 40) & 0xff]++;
        counts[2][(v >> 48) & 0xff]++;
        counts[3][(v >> 56) & 0xff]++;
    }

    // convert counts to offsets (exclusive prefix sums)
    for (pass = 0; pass < 4; pass++) {
        uint32_t offset = 0;
        for (x = 0; x < 256; x++) {
            const uint32_t n = counts[pass][x];
            counts[pass][x] = offset;
            offset += n;
        }
    }

    // radix: four scatter passes ping-ponging between array and cpy; the
    // even pass count leaves the final result in array.
    src = array;
    dst = cpy;
    for (pass = 0; pass < 4; pass++) {
        const unsigned shift = 32u + 8u * (unsigned)pass;
        uint32_t *next = counts[pass];
        uint64_t *swap;
        for (x = 0; x < size; x++) {
            const uint64_t v = src[x];
            dst[next[(v >> shift) & 0xff]++] = v;
        }
        swap = src;
        src = dst;
        dst = swap;
    }

    free(cpy);
    return array;
}  // End radixSort_11110000().


// #############################################################################
// From: http://ideone.com/JHI0d9
// RadixSort---
/*
 * Four 256-entry uint32_t tables overlaid on one flat 1024-entry array.
 * The anonymous struct lays the tables out in the order c4, c3, c2, c1;
 * `counts` aliases the same storage as a single array.
 * NOTE(review): 256 entries per table suggests one slot per byte value --
 * confirm against the code that fills them.
 */
typedef union {
    struct {
        uint32_t c4[256];   /* first table  */
        uint32_t c3[256];   /* second table */
        uint32_t c2[256];   /* third table  */
        uint32_t c1[256];   /* fourth table */
    };
    uint32_t counts[256 * 4];   /* flat view over all four tables */
}  rscounts4_t;


// =============================================================================
// Compare only the MSB 4 bytes of a uint64 by masking each with 
// 0XFFFFFFFF00000000 before comparison
int comp_uint6411110000(const void *a, const void *b)  {
    /* qsort()-style comparator: orders two uint64_t values by their upper
       32 bits only.  Returns -1, 0 or 1; values that differ only in the
       lower 32 bits compare equal. */
    const uint64_t hi_mask = 0XFFFFFFFF00000000ULL;
    const uint64_t lhs = *(const uint64_t *)a & hi_mask;
    const uint64_t rhs = *(const uint64_t *)b & hi_mask;

    if (lhs < rhs) {
        return -1;
    }
    if (lhs > rhs) {
        return 1;
    }
    return 0;
}  // End Comp_Uint64_11110000().



// Both sorted arrays were identical. 
// T63S: Er1/2 = 0
// TE: Top 90% events in desc order (7/312):
//  Time=  0.157 sec = 71.282%, QSORT_UINT64_ARRAY   , hits=1
//  Time=  0.029 sec = 13.119%, RADIX_SORT64_11110000, hits=1
//  Time=  0.026 sec = 11.872%, GENERATE_RANDOM_ARRAY, hits=1

// mult 71.282  /13.119  -> 5.433493
// The RadixSort is over 5x faster than the qsort

Perhaps the QSort comparator could be handled more efficiently
  Time= 1.851 sec = 15.648%, RADIX_SORT_FFFF0000  , hits=4, 0.463 sec each  << OLD
  Time= 1.552 sec = 13.120%, RADIX_SORT_UINT64_32R, hits=4, 0.388 sec each  << NEW
Time=  6.125 sec = 62.359%, QSORT_UINT64_ARRAY   , hits=1
Time=  0.832 sec =  8.468%, RADIX_SORT_64        , hits=1  << OLD
Time=  0.770 sec =  7.842%, RADIX_SORT_UINT64_REG, hits=1  << NEW
Time=  3.130 sec = 20.342%, RADIX_SORT_UINT64_REG, hits=4, 0.782 sec each
Time=  2.336 sec = 15.180%, RADIX_SORT_UINT64_48R, hits=4, 0.584 sec each
Time=  1.540 sec = 10.007%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each