C 在这种情况下,有可能生成最小完美哈希函数吗?

  • 为什么它必须是线性存储
  • 表中的删除是否常见(您只指定在初始创建后不会添加键值对)
  • 与散列的范围相比,表可能增长多大
  • 与重复数据相比,插入的频率有多高
  • 内存占用是一个重要因素吗
  • 虽然无法做到完美哈希,但只要使用一个简单的链表式(拉链法)哈希表,并让桶的规模至少比潜在唯一哈希数的平均值高出两个标准差,这个问题就纯属学术探讨了。这种方案占用内存很小(当然是相对而言,取决于潜在的总规模),便于删除;只要上面第 3 个问题的答案是“小得多”,查找时间就接近 O(1)


    #include <stdlib.h>
    #include <string.h>
    #include <stdint.h>
    // Dummy value type to test compile. Replace throughout
    #define YOUR_VALUE_T int
    // See below where the charmap is
    //#define HTABLE_USE_CHARMAP
    // Maintain a true linked list that's manageable and iterable
    // NOTE(review): no switch for this option is visible in this file; htable_add()
    // links every entry into the first/last list unconditionally.
    // Count lookup misses and such (enables the `misses` / `used_buckets` counters)
    #define HTABLE_KEEP_STATS
    // Fast deletion = faster deletion but more memory consumption
    // (uses the per-entry bucket_prev link). Disabled by default; uncomment to enable.
    //#define HTABLE_FAST_DELETES
    // Lookup table used to collapse the input from full 8-bit bytes to the minimal
    // character set that is actually expected in the data.
    // Only use it if you are confident enough to develop a custom hashing
    // algorithm for this particular known range.
    const char hashing_charmap[256] = {
       // Each byte value that is unused (such as control characters or high 8-bit characters)
       // should be 0, while each used character should get a unique sequential value (1, 2, 3, etc).
       // The table is intentionally not filled in: it is very specific to your data.
       // A chunk might look like this (all remaining elements are implicitly 0):
       0, 0, 0, 0, 17, 18, 19, 0, 0, 20, 21,
    };
    // Hashes the first `len` bytes of `s` with Jenkins' one-at-a-time mixing,
    // starting from the seed 5381, and returns the 32-bit result.
    // When HTABLE_USE_CHARMAP is defined, every byte is first translated through
    // hashing_charmap.
    static inline uint32_t hash_str(register const char* s, const size_t len) {
       register uint32_t hash = 5381; // hash seed here. This could be different depending on the actual algorithm chosen
       register char symbol;
       // This could be unrolled because we know the string length as well.
       for (register size_t i = 0; i < len; i++) {
          #ifdef HTABLE_USE_CHARMAP
          // Index with unsigned char: plain char may be signed, and a negative
          // index would read outside the 256-element table.
          if (!(symbol = hashing_charmap[(unsigned char)s[i]]))
             continue; // NOTE(review): assumes unmapped (0) bytes are skipped - confirm
          #else
          // Actually s[i] could simply be used (which would be faster) if no mapping is needed.
          symbol = s[i];
          #endif
          /*
          True hash algorithm per-symbol operation here.
          Keep in mind that certain algorithms are optimized for certain things.
          An example:
          Stock DJBX33A is very fast but effectively only represents the end of a long
          input. It's really meant for short inputs (like variable names).
          A MurmurHash or tuned FNV variant is likely to be a good pick since we've
          reduced the symbol range and we are dealing with potentially long inputs.
          It's also important to understand that the entire hash will likely not be
          used. Only the lower-end bits will be used (the bucket index is
          hash & hash_mask). If your hashing algorithm is good though, this shouldn't
          matter because the values should be evenly distributed.
          Jenkins one-at-a-time is used here because it's easy.
          */
          hash += symbol;
          hash += (hash << 10);
          hash ^= (hash >> 6);
       }
       // Finalize Jenkins one-at-a-time
       hash += (hash << 3);
       hash ^= (hash >> 11);
       hash += (hash << 15);
       return hash;
    }
    // One key/value record stored in the hash table.
    typedef struct _hash_entry {
       // Key bytes; htable_add() allocates this buffer and copies the caller's key into it
       char* key;
       // Length of the key in bytes
       size_t key_len;
       // Full hash of the key; the bucket index is (hash & hash_mask)
       uint32_t hash;
       // Whatever your value type is (likely a pointer to your own record or something)
       YOUR_VALUE_T value;
       // Internal linking maintains insertion order (table->first ... table->last).
       // If you don't need proper order maintenance, you don't need these
       struct _hash_entry* prev;
       struct _hash_entry* next;
       // Previous entry in the same bucket chain; only written when HTABLE_FAST_DELETES is defined
       struct _hash_entry* bucket_prev;
       // Next entry in the same bucket chain.
       // This is required for the occasional hash miss (several entries sharing a bucket)
       struct _hash_entry* bucket_next;
    } hash_entry_t;
    typedef struct _hash_table {
       // Counts
       size_t entry_count;
       uint32_t bucket_count;
       unsigned int growth_num;
       unsigned int growth_den;
       #ifdef HTABLE_KEEP_STATS
       // How many times we missed during lookup
       size_t misses;
       // (entry_count - used_buckets) tells you how many collisions there are (the lower the better)
       uint32_t used_buckets;
       // Internal linking. Same conditions as in hash_entry_t so feel free to remove as necessary.
       hash_entry_t* first;
       hash_entry_t* last;
       // Buckets, the soul of the hash table
       uint32_t hash_mask;
       hash_entry_t** buckets;
    } hash_table_t;
    // Creates a hash table
    // size_hint - Tells the table how many buckets it should initially allocate.
    //    If you know (for example) that you'll have about 500 entries, set it
    //    to 500
    // growth_num and growth_den - This is the ratio of how many entries to how
    //    many buckets that you want to guarantee.
    //    It's in two integers to avoid floating point math for speed.
    //    The logic after an insertion is...
    //       if (entry_count >= growth_num * (bucket_count / growth_den)) then
    //          grow the bucket array
    //    For example, when growth_num is 4 and growth_den is 5...
    //       (entry_count >= 4 * (bucket_count / 5))
    //   ...would be true once the entry count reaches 80% of the bucket count
    //    This can result in a greater than 1.0 ratio (such as 5/4 or something
    //    like that) if you prefer. This would mean that there are fewer buckets
    //    than there are entries, so collisions are guaranteed at that point, but
    //    you would save on both memory and on how often a bucket expansion occurs
    //    (which is costly during an insert operation).
    static hash_table_t* htable_create(const size_t size_hint, const unsigned int growth_num, const unsigned int growth_den);
    // Frees a hash table
    static void htable_free(hash_table_t* table);
    // Mostly used internally. You probably want htable_get(), htable_value(), or htable_exists()
    static hash_entry_t* htable_find_entry(hash_table_t* table, const char* key, size_t key_len, uint32_t* hash, size_t* true_len);
    // Get the pointer to a value stored in the table (or NULL if the key does not exist)
    static YOUR_VALUE_T* htable_value(const hash_table_t* table, const char* key, size_t key_len);
    // Get the value of an entry, or the default value if the entry doesn't exist
    static YOUR_VALUE_T htable_get(const hash_table_t* table, const char* key, size_t key_len, const YOUR_VALUE_T default_value);
    // Test for the existence of a value
    static int htable_exists(const hash_table_t* table, const char* key, size_t key_len);
    // Add a new entry (but don't update if it already exists). Returns NULL if it already exists
    static hash_entry_t* htable_add(hash_table_t* table, const char* key, size_t key_len, YOUR_VALUE_T value);
    // Update an entry OR add a new entry if it doesn't already exist
    static hash_entry_t* htable_set(hash_table_t* table, const char* key, size_t key_len, YOUR_VALUE_T value);
    // Update an entry but don't add a new entry if it doesn't already exist. Returns NULL if it doesn't exist
    static hash_entry_t* htable_update(hash_table_t* table, const char* key, size_t key_len, YOUR_VALUE_T value);
    // Delete an entry. Returns 1 on success or 0 if the entry didn't exist
    static int htable_delete(hash_table_t* table, const char* key, size_t key_len);
    // Pack the table.
    // This is here because...
    // - If HTABLE_FAST_DELETES is set, and if you delete a bunch of entries, it's
    //   possible that you can free up some memory by shrinking the bucket array.
    //   You would have to call this manually to make that happen.
    // - If HTABLE_FAST_DELETES is NOT set however, this gets called automatically
    //   on each delete, so the buckets are guaranteed to be packed.
    static void htable_pack(hash_table_t* table);
    static hash_table_t* htable_create(const unsigned long size_hint, const unsigned int growth_num, const unsigned int growth_den) {
       hash_table_t* res = malloc(sizeof(hash_table_t));
       if (!res)
          return NULL;
       res->entry_count = 0;
       res->first = NULL;
       res->last = NULL;
       #ifdef HTABLE_KEEP_STATS
       res->misses = 0; 
       res->used_buckets = 0;
       if ((!growth_num) || (!growth_den)) {
          // Grow only when the entry count matches the bucket count
          res->growth_num = 1;
          res->growth_den = 1;
       } else {
          res->growth_num = growth_num;
          res->growth_den = growth_den;
       For computational speed and simplicity we'll grow the bucket array exponentially.
       Not growing the buckets exponentially is possible but requires a different
       entry lookup mechanism (because hash & hash_mask would no longer work) and would 
       likely involve the modulas operator which is very slow. If memory is uber important
       however, this might be a good solution.
       // We'll go ahead and assume it's a reasonably small table and only allocate 256 buckets.
       int bits = 8;
       if (size_hint) {
          unsigned long target = (size_hint * res->growth_den) / res->growth_num;
          // First check is to prevent overflow as it would be 0 when bits is 31 on a 32 bit system
          while ((1 << (bits + 1)) && ((1 << bits) < target))
       res->bucket_count = 1 << bits;
       res->hash_mask = (1 << bits) - 1;
       if ((res->buckets = (hash_entry_t**)calloc(res->bucket_count, sizeof(hash_entry_t*))) == NULL) {
          return NULL;
       memset(res->buckets, 0, sizeof(hash_entry_t*) * res->bucket_count);
       return res;
    // Destroy a table: frees every entry (including its key copy), the bucket
    // array and the table itself. Passing NULL is a no-op.
    static void htable_free(hash_table_t* table) {
       if (table == NULL)
          return;
       // Every entry is on the first/last list, so walking that list visits each
       // entry exactly once.
       hash_entry_t* entry = table->first;
       while (entry) {
          hash_entry_t* next = entry->next; // read the link before the entry is freed
          free(entry->key);
          free(entry);
          entry = next;
       }
       free(table->buckets);
       free(table);
    }
    // Find an entry: (mostly used internally)
    // key_len  - length of key in bytes; 0 means "use strlen(key)".
    // hash     - if not NULL, receives the full hash of the key.
    // true_len - if not NULL, receives the key length that was actually used.
    // returns NULL when the entry isn't found
    static hash_entry_t* htable_find_entry(hash_table_t* table, const char* key, size_t key_len, uint32_t* hash, size_t* true_len) {
       if (!key_len)
          key_len = strlen(key);
       if (true_len != NULL)
          *true_len = key_len;
       uint32_t h = hash_str(key, key_len);
       if (hash != NULL)
          *hash = h;
       uint32_t bucket = h & table->hash_mask;
       // Best case here is O(1) because table->buckets[bucket] would be the entry
       hash_entry_t* entry = table->buckets[bucket];
       // ... but if we miss, then the time increases to as much as O(n) where n is the number of entries in
       // the particular bucket (good hash + good ratio management means that n would usually be only 1)
       while ((entry) && ((entry->hash != h) || (entry->key_len != key_len) || (memcmp(entry->key, key, key_len)))) {
          #ifdef HTABLE_KEEP_STATS
          table->misses++;
          #endif
          entry = entry->bucket_next;
       }
       return entry;
    }
    // Insertion of entry into bucket. Used internally
    // Appends `entry` to the chain of the bucket selected by (entry->hash & hash_mask).
    // Returns 1 if the bucket was empty before the insert (a newly used bucket),
    // 0 if the entry was appended to an existing chain.
    static inline int _htable_bucket_insert(hash_entry_t** buckets, hash_entry_t* entry, const uint32_t hash_mask) {
       hash_entry_t* bentry;
       #ifdef HTABLE_FAST_DELETES
       entry->bucket_prev = NULL;
       #endif
       entry->bucket_next = NULL;
       uint32_t bidx = entry->hash & hash_mask;
       int res = 0;
       if ((bentry = buckets[bidx]) == NULL) {
          res = 1;
          buckets[bidx] = entry;
       } else {
          // Walk to the tail of the chain and hook the entry on there
          while (bentry->bucket_next)
             bentry = bentry->bucket_next;
          bentry->bucket_next = entry;
          #ifdef HTABLE_FAST_DELETES
          entry->bucket_prev = bentry;
          #endif
       }
       return res;
    }
    // Bucket array growing/shrinking. Used internally
    // Doubles the bucket array once entry_count reaches the configured ratio, or
    // halves it (never below 256 buckets) once entry_count drops under the ratio
    // for the halved size. Does nothing otherwise, or if the new array cannot be
    // allocated (the table then simply keeps its current buckets).
    static void _htable_adjust_as_needed(hash_table_t* table) {
       // (bucket_count << 1) != 0 guards against overflowing the 32-bit bucket count
       int change = (((table->bucket_count << 1) != 0) && (table->entry_count >= table->growth_num * (table->bucket_count / table->growth_den)));
       if (!change) {
          if ((table->bucket_count > (1u << 8)) && (table->entry_count < table->growth_num * ((table->bucket_count >> 1) / table->growth_den))) {
             change = -1;
          } else {
             return; // neither growing nor shrinking is needed
          }
       }
       uint32_t new_bucket_count = (change < 0) ? table->bucket_count >> 1 : table->bucket_count << 1;
       uint32_t new_hash_mask = new_bucket_count - 1;
       // calloc zero-fills, so every new bucket starts out empty
       hash_entry_t** new_buckets = calloc(new_bucket_count, sizeof *new_buckets);
       if (!new_buckets)
          return;
       #ifdef HTABLE_KEEP_STATS
       table->used_buckets = 0;
       #endif
       // Re-insert every entry into the new array. Walking the first/last list is
       // safe here because _htable_bucket_insert only rewrites the bucket links.
       for (hash_entry_t* entry = table->first; entry; entry = entry->next) {
          int r = _htable_bucket_insert(new_buckets, entry, new_hash_mask);
          #ifdef HTABLE_KEEP_STATS
          table->used_buckets += r;
          #else
          (void)r;
          #endif
       }
       free(table->buckets); // the old array is no longer referenced
       table->buckets = new_buckets;
       table->bucket_count = new_bucket_count;
       table->hash_mask = new_hash_mask;
    }
    // Get the pointer to the value of the entry or NULL if not in table.
    // key_len may be 0, in which case the key is measured with strlen().
    static YOUR_VALUE_T* htable_value(const hash_table_t* table, const char* key, size_t key_len) {
       // un-const table so that find_entry can keep statistics
       hash_entry_t* entry = htable_find_entry((hash_table_t*)table, key, key_len, NULL, NULL);
       return (entry != NULL) ? &entry->value : NULL;
    }
    // Get the value stored under key, or default_value if the key is not in the table.
    // key_len may be 0, in which case the key is measured with strlen().
    static YOUR_VALUE_T htable_get(const hash_table_t* table, const char* key, size_t key_len, const YOUR_VALUE_T default_value) {
       // un-const table so that find_entry can keep statistics
       hash_entry_t* entry = htable_find_entry((hash_table_t*)table, key, key_len, NULL, NULL);
       return (entry != NULL) ? entry->value : default_value;
    }
    // Returns 1 if key is present in the table, 0 otherwise.
    // key_len may be 0, in which case the key is measured with strlen().
    static int htable_exists(const hash_table_t* table, const char* key, size_t key_len) {
       // un-const table so that find_entry can keep statistics
       return (htable_find_entry((hash_table_t*)table, key, key_len, NULL, NULL) != NULL);
    }
    // Add a new entry (but don't update if it already exists)
    // The key is copied (and NUL-terminated) into memory owned by the table.
    // key_len may be 0, in which case the key is measured with strlen().
    // Returns the new entry, or NULL if the entry already exists (use set() if
    // you want add or update logic) or if memory could not be allocated.
    static hash_entry_t* htable_add(hash_table_t* table, const char* key, size_t key_len, YOUR_VALUE_T value) {
       uint32_t hash;
       hash_entry_t* res = htable_find_entry(table, key, key_len, &hash, &key_len);
       if (res != NULL)
          return NULL;
       if ((res = malloc(sizeof *res)) == NULL)
          return NULL;
       if ((res->key = malloc(key_len + 1)) == NULL) {
          free(res); // don't leak the half-built entry
          return NULL;
       }
       // Copy exactly key_len bytes and terminate ourselves: with an explicit
       // key_len the caller's buffer is not guaranteed to have a byte at key[key_len].
       memcpy(res->key, key, key_len);
       res->key[key_len] = '\0';
       res->key_len = key_len;
       res->hash = hash;
       res->value = value;
       // Append to the insertion-ordered list
       res->prev = NULL;
       res->next = NULL;
       if (table->first == NULL) {
          table->first = res;
          table->last = res;
       } else {
          res->prev = table->last;
          table->last->next = res;
          table->last = res;
       }
       table->entry_count++;
       int r = _htable_bucket_insert(table->buckets, res, table->hash_mask);
       #ifdef HTABLE_KEEP_STATS
       table->used_buckets += r;
       #else
       (void)r;
       #endif
       // Grow the bucket array if the entries-to-buckets ratio has been reached
       _htable_adjust_as_needed(table);
       return res;
    }
    // Update an entry OR add a new entry if it doesn't already exist.
    // key_len may be 0, in which case the key is measured with strlen().
    // Returns the updated or newly added entry, or NULL if a new entry was needed
    // and could not be allocated.
    static hash_entry_t* htable_set(hash_table_t* table, const char* key, size_t key_len, YOUR_VALUE_T value) {
       hash_entry_t* res = htable_find_entry(table, key, key_len, NULL, &key_len);
       if (res != NULL) {
          res->value = value;
          return res;
       }
       // Not present: reuse htable_add() instead of duplicating its allocation and
       // linking logic. This costs one extra lookup on the (rarer) insert path.
       return htable_add(table, key, key_len, value);
    }
    // Update an entry but don't add a new entry if it doesn't already exist.
    // key_len may be 0, in which case the key is measured with strlen().
    // Returns the updated entry, or NULL if the key doesn't exist.
    static hash_entry_t* htable_update(hash_table_t* table, const char* key, size_t key_len, YOUR_VALUE_T value) {
       hash_entry_t* res = htable_find_entry(table, key, key_len, NULL, NULL);
       if (res == NULL)
          return NULL;
       res->value = value;
       return res;
    }
    // Delete an entry. Returns 1 on success or 0 if the entry didn't exist
    // The entry and its key copy are freed. Unless HTABLE_FAST_DELETES is defined,
    // the bucket array is packed afterwards.
    static int htable_delete(hash_table_t* table, const char* key, size_t key_len) {
       uint32_t hash;
       hash_entry_t* entry = htable_find_entry(table, key, key_len, &hash, &key_len);
       if (entry == NULL)
          return 0;
       // Unlink from the insertion-ordered list
       if (entry == table->first)
          table->first = entry->next;
       if (entry == table->last)
          table->last = entry->prev;
       if (entry->prev != NULL)
          entry->prev->next = entry->next;
       if (entry->next != NULL)
          entry->next->prev = entry->prev;
       // Unlink from the bucket chain
       uint32_t bucket = hash & table->hash_mask;
       hash_entry_t* bhead = table->buckets[bucket];
       hash_entry_t* bprev = NULL;
       #ifdef HTABLE_FAST_DELETES
       bprev = entry->bucket_prev;
       #else
       // No back link available: walk the chain to find the predecessor
       if (bhead != entry) {
          bprev = bhead;
          while (bprev->bucket_next != entry)
             bprev = bprev->bucket_next;
       }
       #endif
       if (bprev != NULL)
          bprev->bucket_next = entry->bucket_next;
       #ifdef HTABLE_FAST_DELETES
       // The successor must point back at the deleted entry's predecessor
       // (pointing it at itself would corrupt the chain for later deletes).
       if (entry->bucket_next != NULL)
          entry->bucket_next->bucket_prev = bprev;
       #endif
       if (bhead == entry) {
          table->buckets[bucket] = entry->bucket_next;
          #ifdef HTABLE_KEEP_STATS
          if (entry->bucket_next == NULL)
             table->used_buckets--; // the bucket just became empty
          #endif
       }
       free(entry->key);
       free(entry);
       table->entry_count--;
       #ifndef HTABLE_FAST_DELETES
       htable_pack(table);
       #endif
       return 1;
    }
    // Pack the table: lets the bucket array shrink (or grow) to match the current
    // entry count.
    // NOTE(review): the original body is missing from this file; delegating to
    // _htable_adjust_as_needed() is reconstructed from the contract documented on
    // the htable_pack() prototype - confirm.
    static void htable_pack(hash_table_t* table) {
       _htable_adjust_as_needed(table);
    }

    • 使用默认参数进行随机输入后:
      • 条目数:100000
      • 桶数:131072
      • 已用桶数:69790
      • 冲突数:30210
      • 未命中次数:71394
      • 哈希/桶效率:69.79%
    • 使用1/2的增长率进行随机输入后:
      • 条目数:100000
      • 桶数:262144
      • 已用桶数:83181
      • 冲突数:16819
      • 未命中次数:35436
      • 哈希/桶效率:83.18%
    • 使用2/1的增长率进行随机输入后:
      • 条目数:100000
      • 桶数:65536
      • 已用桶数:51368
      • 冲突数:48632
      • 未命中次数:141607
      • 哈希/桶效率:51.37%




  • 这些断言/测试在我这里都能通过,但不能保证没有 bug。代码看起来确实很稳定,不过其中可能还潜伏着一两个 bug
  • 如果需要列表管理,您可以通过管理
  • 我无法包含测试代码,因为我似乎已经达到了答案大小的限制
  • 时间分析中不包括哈希函数或最终字符串比较。如果不了解有关输入的所有统计信息,就不可能对其进行分析。不过,这两个函数都应该非常快,如果知道有关输入数据的更多信息,则可以完全排除字符串比较
  • 试试GPL'd,或者


    • 接收查询字符串并通过枚举键列表来识别完美哈希函数的域

    • 将这些键和列表大小(范围为1..size)提供给从上述参考实现派生的完美哈希生成函数

    • 使用生成的完美哈希函数创建哈希映射

    • 使用相同的完美哈希函数来处理HashMap中的

    hash_table_t* ht = htable_create(0, 0, 0);
    assert(ht != NULL);  // Table was created successfully
    // Testing basic adding/updating/getting...
    assert(htable_add(ht, "hello-world", 0, 234) != NULL); // hello-world set to 234
    assert(htable_add(ht, "goodbye-world", 0, 567) != NULL); // goobye-world set to 567
    assert(ht->entry_count == 2); // Two entries exist (hello-world and goodbye-world)
    assert(htable_exists(ht, "hello-world", 0) == 1); // hello-world exists
    assert(htable_exists(ht, "goodbye-world", 0) == 1); // goodbye-world exists
    assert(htable_exists(ht, "unknown-world", 0) == 0); // unknown-world doesn't exist
    assert(*htable_value(ht, "hello-world", 0) == 234); // hello-world has a value of 234
    assert(*htable_value(ht, "goodbye-world", 0) == 567); // goodbye-world has a value of 567
    assert(htable_get(ht, "hello-world", 0, -1) == 234); // hello-world exists and has a value of 234
    assert(htable_get(ht, "goodbye-world", 0, -1) == 567); // goobye-world exists and has a value of 567
    assert(htable_get(ht, "unknown-world", 0, -1) == -1); // unknown-world does not exist so the default value of -1 is returned
    *htable_value(ht, "hello-world", 0) = -1; // hello-world's value is directly set via reference to -1
    *htable_value(ht, "goodbye-world", 0) = -2; // goodbye-world's value is directly set via reference to -2
    assert(*htable_value(ht, "hello-world", 0) == -1); // hello-world has a value of -1
    assert(*htable_value(ht, "goodbye-world", 0) == -2); // goodbye-world has a value of -2
    assert(htable_update(ht, "hello-world", 0, 1000) != NULL); // hello-world set to 1000
    assert(htable_update(ht, "goodbye-world", 0, 2000) != NULL); // goodbye-world set to 2000
    assert(htable_update(ht, "unknown-world", 0, 3000) == NULL); // unknown-world not set (it doesn't exist);
    assert(ht->entry_count == 2); // Two entries exist (hello-world and goodbye-world)
    assert(htable_set(ht, "hello-world", 0, 1111) != NULL); // hello-world set to 1111
    assert(htable_set(ht, "goodbye-world", 0, 2222) != NULL); // goodbye-world set to 2222
    assert(htable_set(ht, "unknown-world", 0, 3333) != NULL); // unknown-world added with a value of 3333
    assert(ht->entry_count == 3); // Three entries exist (hello-world, goodbye-world, and unknown-world)
    printf("%s\n", "After all additions and changes:");
    // A foreach iteration
    hash_entry_t* entry = ht->first;
    while (entry != NULL) {
       printf("\"%s\" = %i\n", entry->key, entry->value);
       entry = entry->next;
    assert(ht->entry_count - ht->used_buckets == 0); // Means that no hash collisions occured
    assert(ht->misses == 0); // Means that each lookup was in O(1) time
    // Testing basic deletion...
    assert(htable_delete(ht, "not-a-world", 0) == 0); // not-a-world not deleted (doesn't exist)
    assert(htable_delete(ht, "hello-world", 0) == 1); // hello-world deleted
    assert(htable_delete(ht, "hello-world", 0) == 0); // hello-world not deleted (doesn't exist)
    assert(htable_exists(ht, "hello-world", 0) == 0); // hello-world doesn't exit
    assert(htable_exists(ht, "goodbye-world", 0) == 1); // goobye-world still exists
    assert(htable_exists(ht, "unknown-world", 0) == 1); // unknown-world still exists
    assert(ht->entry_count == 2); // Two entries exists (goodbye-world and unknown-world)
    assert(htable_delete(ht, "unknown-world", 0) == 1); // unknown-world deleted
    assert(htable_exists(ht, "unknown-world", 0) == 0); // unknown-world doesn't exit
    assert(htable_exists(ht, "goodbye-world", 0) == 1); // goodbye-world still exists
    assert(ht->entry_count == 1); // One entry exists (goodbye-world)
    // A foreach iteration
    printf("%s\n", "After deletion:");
    entry = ht->first;
    while (entry != NULL) {
       printf("\"%s\" = %i\n", entry->key, entry->value);
       entry = entry->next;
    assert(ht->entry_count - ht->used_buckets == 0); // Means that no hash collisions occured
    assert(ht->misses == 0); // Means that each lookup was in O(1) time
    get(field, table) 
       return table[perfect(field)];
    Start with a number that is a prime (all primes are relative primes)
    while (more bytes to be considered) {
      take the next byte of input and multiply it by a second prime
      determine the number of bits that might be lost in a left shift, capture them in a buffer
      shift the bits in the hash "buffer" to the left.
      restore the high order bit(s) in the low position
      take the next byte of input and multiply it by a second prime
      mask the multiplied result into the buffer 