C 查找已知整数键集

C 查找已知整数键集,c,algorithm,C,Algorithm,在我的环境中,Gperf的性能一直低于Judy数组,我想知道是否还有另外一个专门为整数键构建的完美哈希库。我事先就知道这组关键点,我想利用这一点实现性能/尺寸优势 共有约1000个密钥,检索不是按顺序进行的。密钥对都是整数。键是32位的,检索到的值是8位的。规模是最重要的因素 如果有一种方法可以调整整数键的Gperf,或者仅仅是另一种方法,我也洗耳恭听。:) (旁注:……当我输入这个问题时,我意识到二进制搜索可能会占上风,而我刚刚对这个问题进行了仔细思考。不过,为了学习,我还是想听听你的想法!)

在我的环境中,Gperf的性能一直低于Judy数组,我想知道是否还有另外一个专门为整数键构建的完美哈希库。我事先就知道这组键,我想利用这一点来获得性能/体积上的优势

共有约1000个键,检索不是按顺序进行的。键值对都是整数:键是32位的,检索到的值是8位的。体积(空间占用)是最重要的因素

如果有一种方法可以调整整数键的Gperf,或者仅仅是另一种方法,我也洗耳恭听。:)

(旁注:……当我输入这个问题时,我意识到二分查找可能才是最合适的做法,是我把问题想得太复杂了。不过,为了学习,我还是想听听你的想法!)

编辑:键的分布并不均匀。大多数键在整个取值范围内随机地聚集成簇


编辑2:在最坏情况下,二分查找对我来说太慢了,所以我最终反复试验这些键,直到从每个键中找出8个比特,用来生成256个分布均匀的桶。我保存了每个桶的最小值和最大值(每个桶条目24位),并为键值对建立了一个大的struct数组。在我的具体场景中,这比我测试过的其他方案更快、更小,所以我想我现在就采用这种做法了。:)

对密钥进行排序,并使用M-树检索任何密钥

M-树的每个节点有M个条目,而不是二叉树的2个条目。这将极大地提高性能。以缓存行大小作为节点大小的基础,即64字节。这个大小可以存放16个32位值

因为您有1000个值,所以3层就足以检索到正确的键(而二叉树需要10层)

另一个想法是将键散列到一个小的散列表中,例如12位(4K条目)的表,并用简单的链表解决可能的冲突。这样大部分键很可能只需一次探测就能找到。

您尝试过吗?也许正是你需要的

/*
** Proof of concept for constructing a {fixed-size,lookup-only} hashtable
** needing only (2*N* sizeof(int)) additional storage for storing N num=value pairs.
** The key is supposed to be an int,
** the 'value' is a char.
** Note: some people would like to include <stdint.h> and replace all the ints by {uint32_t,uint16_t,uint8_t}.
**
** (c)opyright Wildplasser, 2011-11-12
** href = http://stackoverflow.com/questions/8059591/lookups-on-known-set-of-integer-keys
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

/* One table entry: a key/value pair plus the bookkeeping needed to chain
** colliding entries together inside the same array.
*/
struct tinyhash {
    unsigned key;           /* lookup key */
    unsigned short link;    /* index of the next entry in the chain, or LINK_DEAD;
                            ** tiny_init() also uses it as scratch space for the
                            ** intended slot number while it is sorting */
    unsigned char value;    /* payload returned by tiny_find() */
    unsigned is_linked :1;  /* set by tiny_init() once the entry has been placed
                            ** in its slot or appended to a chain */
    };
/* Value of .link that marks the end of a chain (largest unsigned short). */
#define LINK_DEAD ((unsigned short)-1)

    /* A hashtable with N slots for N entries (100% full).
    ** THE_SIZE is both the number of entries and the number of slots.
    ** Slot numbers are stored in the unsigned short .link field, where
    ** LINK_DEAD is reserved as the end-of-chain marker.
    */
#define THE_SIZE 1000
struct tinyhash table[THE_SIZE] ;

/* Table set-up and lookup. */
void tiny_fill(void);
void tiny_init(void);
int tiny_find(unsigned key);

/* Diagnostics; both write to stderr. */
void tiny_print(void);
void tiny_count(void);

/* File-local helpers: qsort() comparator, chain walker and hash function. */
static int tiny_cmp( const void *l, const void *r);
static unsigned short * tiny_hnd(unsigned key);
static unsigned tiny_hash(unsigned key);

/* Test driver: fill the table with random data, build the chains,
** then dump the table and the chain statistics to stderr.
*/
int main (void)
{
    /* Total size of the table in bytes, reported on stderr below. */
    unsigned int table_bytes = (unsigned int) sizeof table;

    /* The whole point of the exercise: two ints worth of storage per entry. */
    assert(sizeof table == 2 * THE_SIZE * sizeof(int));

    fprintf (stderr, "Size=%u\n", table_bytes);

    tiny_fill();    /* random keys and values, nothing linked yet */
    tiny_init();    /* move entries to their slots and build the chains */
    tiny_print();   /* one line per slot */
    tiny_count();   /* chains, histogram and average cost */

    return 0;
}
    /* Perform a table lookup.
    ** Return the (non-negative) "value" that belongs to "key".
    ** If not found -1 is returned.
    */
int tiny_find(unsigned key)
{
    unsigned slot;
    unsigned short *ptr;

    /* The entry in the home slot is the head of the chain. No .link field
    ** points at it, so tiny_hnd() never compares its key: do that here.
    */
    slot = tiny_hash(key) % THE_SIZE;
    if (table[slot].is_linked && table[slot].key == key) {
        return table[slot].value;
    }

    /* Otherwise the key can only live further down the chain. */
    ptr = tiny_hnd(key);
    if (*ptr == LINK_DEAD || !table[*ptr].is_linked) {
        return -1;
    }
    return table[*ptr].value;
}

    /* Find the place where key is located, or
    ** (if not found) where it should be appended.
    ** The returned value is a pointer to the parent's .link field.
    ** On return *result is either
    **   - the index of a linked entry whose key equals "key",
    **   - the index of an entry that is not linked (the walk stops there), or
    **   - LINK_DEAD when the end of the chain was reached.
    ** Only entries reached through a .link field are examined; the entry in
    ** the home slot itself is NOT compared against key.
    */
static unsigned short * tiny_hnd(unsigned key)
{
    unsigned short slot, *ptr;

    slot = tiny_hash(key) % THE_SIZE;
    for (ptr = &table[slot].link; *ptr != LINK_DEAD; ptr = &table[*ptr].link) {
        if (!table[*ptr].is_linked) break;  /* not part of any chain */
        if (table[*ptr].key == key) break;  /* match: *ptr indexes it */
    }
    return ptr;
}

    /* For testing: fill hashtable with garbage.
    ** Every entry gets a pseudo-random key and value, an empty link and
    ** is marked as not linked; tiny_init() builds the chains afterwards.
    */
void tiny_fill(void)
{
    unsigned idx;

    for (idx = 0; idx < THE_SIZE; idx++) {
        /* Combine two rand() results in unsigned arithmetic: done in int,
        ** the multiplication/addition could overflow, which is undefined
        ** behaviour. Separate statements also fix the order of the calls.
        */
        unsigned lo = (unsigned) rand();
        unsigned hi = (unsigned) rand();

        table[idx].key = lo + 543u * hi;
        table[idx].value = (unsigned char) rand();
        table[idx].link = LINK_DEAD;
        table[idx].is_linked = 0;
    }
}
    /* Build hashtable, that is:
    ** shuffle the entries and build linked list.
    */
void tiny_init(void)
{
unsigned idx;

    /* Phase 0: set all to unused.
    ** The link field is temporaly abused to store the intended
    ** slotnumber.
    */
for (idx=0; idx < THE_SIZE; idx++ ) {
        table[idx].link = tiny_hash(table[idx].key) % THE_SIZE;
        table[idx].is_linked = 0;
    }

    /* Phase 0a: sort on (intended) slotnumber. */
qsort (table, THE_SIZE, sizeof table[0] , tiny_cmp);

    /* Phase 1: put enties at their intended slotnumber
    ** but only if the entry that lives there does not belong there
    ** (is uninitialized).
    */
for (idx=0; idx < THE_SIZE; idx++) {
    unsigned slot;
        /* [idx] has allready been placed */
    if (table[idx].is_linked) continue;
    slot = table[idx].link;
         /* [idx] belongs here. freeze it */
    if (slot==idx) {
        table[idx].link = LINK_DEAD;
        table[idx].is_linked = 1;
        }
        /* try to swap [idx] with the item at its intended slot */
    else {
        struct tinyhash tmp;
            /* item[slot] belongs there: keep off */
        if (table[slot].is_linked) continue;
        tmp = table[idx];
        table[idx] = table[slot];
        table[slot] = tmp;
        table[slot].is_linked = 1;
        table[slot].link = LINK_DEAD;
            /* Don't bump idx: [idx] and [slot] have been swapped,
            ** we need to inspect the new item[idx] at the next cycle.
            */
        idx--; /* idx will be re-incremented in the loop; */
        }
    }

    /* Phase 2: link any remaining unplaced item to its
    ** parent in the LL-chain.
    */
for (idx=0; idx < THE_SIZE; idx++ ) {
    unsigned short *parent;
    if (table[idx].is_linked) continue;
    parent = tiny_hnd(table[idx].key);
    if (*parent != LINK_DEAD) continue; /* duplicate */
    *parent = idx;
    table[idx].is_linked = 1;
    table[idx].link = LINK_DEAD;
    }
}
    /* Compare function for qsort()
    */
static int tiny_cmp( const void *vl, const void *vr)
{
struct tinyhash *l = (struct tinyhash *)vl;
struct tinyhash *r = (struct tinyhash *)vr;

#if 0
unsigned slot_l, slot_r;
slot_l = tiny_hash(l->key) %THE_SIZE;
slot_r = tiny_hash(r->key) %THE_SIZE;
if (slot_l < slot_r ) return -3;
if (slot_l > slot_r ) return 3;
#else
if (l->link < r->link ) return -3;
if (l->link > r->link ) return 3;
#endif

if (l->key < r->key) return -2;
if (l->key > r->key) return 2;

if (l < r) return -1;
if (l > r) return 1;

return 0;
}

    /* Placeholder hash function: multiplies the key by a fixed constant.
    ** The arithmetic is unsigned, so the product simply wraps around.
    ** Intended to be replaced by something better; according to the
    ** author it works well for random ints. Use at your own risk.
    */
static unsigned tiny_hash(unsigned key)
{
    const unsigned multiplier = 98765u;

    return multiplier * key;
}

/* Dump the table to stderr, one line per slot, all but the last two in hex:
** [slot] relation home-slot: link linked-flag key->value
** "relation" is '=' when the entry sits in the slot its key hashes to,
** '<' when that home slot has a higher index and '>' when it has a lower
** one; the linked flag is '*' for linked entries and '.' otherwise.
*/
void tiny_print(void)
{
    unsigned pos;

    for (pos = 0; pos < THE_SIZE; pos++) {
        const unsigned home = tiny_hash(table[pos].key) % THE_SIZE;
        int relation;
        int state;

        if (home == pos) {
            relation = '=';
        } else if (home > pos) {
            relation = '<';
        } else {
            relation = '>';
        }

        state = table[pos].is_linked ? '*' : '.';

        fprintf(stderr, "[%4x] %c %4x: %4x %c %10u->%3u\n",
                pos, relation, 0xffff & home,
                0xffff & table[pos].link,
                state,
                table[pos].key, table[pos].value);
    }
}
    /* For testing: print the hashchains, construct a histogram of chainlengths,
    ** and calculate the "total cost of retrieval".
    ** A chain starts at every slot whose entry hashes to that very slot.
    ** An entry at position p in its chain (1-based) is charged p hops, so a
    ** chain of length len contributes len*(len+1)/2 hops.
    ** All output goes to stderr.
    */
void tiny_count(void)
{
    unsigned idx, cnt, len, tothops, slot;
    /* A chain holds between 0 and THE_SIZE entries (inclusive), so the
    ** histogram needs THE_SIZE+1 buckets.
    */
    unsigned histogram[THE_SIZE + 1];

    memset(histogram, 0, sizeof histogram);

    cnt = tothops = 0;
    for (slot = 0; slot < THE_SIZE; slot++) {
        idx = tiny_hash(table[slot].key) % THE_SIZE;
        if (slot != idx) continue; /* this is not the start of a chain */
        for (len = 0; idx != LINK_DEAD; idx = table[idx].link) {
            if (!table[idx].is_linked) continue;
            if (len == 0) fprintf(stderr, "[%u]:", slot);
            fprintf(stderr, " %u", table[idx].key);
            len++;
        }
        fprintf(stderr, "(=%u)\n", len);
        cnt += len;
        histogram[len] += 1;
        tothops += (((len+1) * len) / 2);
    }

    fprintf(stderr, "Histogram of chainlengths:\n");
    for (len = 0; len <= THE_SIZE; len++) {
        if (!histogram[len]) continue;
        fprintf(stderr, "[%u]: %u\n", len, histogram[len]);
    }
    /* Avoid 0/0 when no linked entry was counted at all. */
    fprintf(stderr, "tothops=%u/%u (%f hops per node)\n"
        , tothops, cnt, cnt ? (double)tothops / cnt : 0.0);
}

注意:由于在初始化哈希表时进行了排序,条目都非常接近其预期位置。这提高了访问的局部性。

您可以将Lua用作哈希表库。但对于1000个条目,您可能最好搜索一个排序的数组,如您所述。您是否也需要插入/删除,或者它是一个静态查找表?@wildplasser static table.:)如果您能够提供一个2*1000整数的表,那么哈希表可能是可行的。可能需要一些位调整,结合有效负载和溢出指针。(也可能是一个in_use-bit)哈希表工作正常,每个操作使用2*N个整数的空间和~1.5个探测。如果没有一个完美的散列(对于1000个条目来说,这几乎是不可能的)接受它,这大概是你能得到的最好结果了。谢谢你的解释。我要修改一下那个主意。:)不幸的是,Judy数组正是我试图用静态表击败的!我不需要插入/删除,所以我想摆脱由此带来的任何开销(
....
[978]: 1794172570(=1)
[980]: 642121828(=1)
[983]: 2674104091(=1)
[985]: 547527125(=1)
[986]: 2383911594(=1)
[988]: 4254837348(=1)
[989]: 1860851825 1990214465 1766290905(=3)
[990]: 3793608270 469685686(=2)
[992]: 1189958296 872917240(=2)
[994]: 1999781290 1501026482(=2)
[995]: 520334159 211600319(=2)
[997]: 177583921(=1)
[998]: 1173163646 1013572158(=2)
[999]: 1705614211 3460318251(=2)
Histogram of chainlengths:
[1]: 369
[2]: 190
[3]: 57
[4]: 15
[5]: 4
tothops=1491/1000 (1.491000 hops per node)