C++ 自定义分配器性能

C++ 自定义分配器性能,c++,performance,memory-management,allocator,C++,Performance,Memory Management,Allocator,我正在构建一个AVL树类,它将有一个固定的最大项数。所以我想,与其单独分配每个项,不如一次分配整个块,并在需要时使用位图分配新内存 我的分配/解除分配代码: avltree::avltree(UINT64 numitems) { root = NULL; if (!numitems) buffer = NULL; else { UINT64 memsize = sizeof(avlnode) * numitems + bitlist::storagesize(num

我正在构建一个AVL树类,它将有一个固定的最大项数。所以我想,与其单独分配每个项,不如一次分配整个块,并在需要时使用位图分配新内存

我的分配/解除分配代码:

// Builds an empty tree.
//   numitems == 0 : no pool; newnode()/deletenode() use new/delete.
//   numitems  > 0 : one malloc() block is reserved that holds numitems node
//                   slots followed by bitlist::storagesize(numitems) bytes,
//                   which are handed to memmap as its storage.
avltree::avltree(UINT64 numitems)
{
  root = NULL;
  buffer = NULL;
  freeaddr = 0;   // always initialised, even when no pool is created

  if (!numitems)
    return;

  UINT64 memsize = sizeof(avlnode) * numitems + bitlist::storagesize(numitems);
  buffer = (avlnode *) malloc(memsize);

  // malloc() can fail for large pools. Leaving buffer NULL makes newnode()
  // and deletenode() take their new/delete branch, instead of passing memmap
  // a pointer computed from NULL and writing through it in clear_all().
  if (!buffer)
    return;

  // The bitmap storage starts directly behind the last node slot.
  memmap.init(numitems, buffer + numitems);
  memmap.clear_all();
}

// Returns a newly constructed node holding `key`.
// Without a pool the node comes from operator new; with a pool it is
// placement-constructed in a slot of `buffer` and that slot's bit is set.
avlnode *avltree::newnode(keytype key)
{
  if (!buffer)
    return new avlnode(key);

  // Slots are handed out in index order first; the bitmap is searched only
  // after freeaddr has reached memmap.size_bits.
  const bool fresh_slot_left = freeaddr < memmap.size_bits;
  const UINT64 slot = fresh_slot_left ? freeaddr++ : memmap.get_first_unset();

  memmap.set_bit(slot);
  return new (buffer + slot) avlnode(key);
}

// Gives a node back to wherever newnode() took it from.
void avltree::deletenode(avlnode *node)
{
  if (buffer) {
    // Pool mode: the slot index is the node's distance from the start of the
    // block; clearing its bit is all that is done here.
    memmap.clear_bit(node - buffer);
    return;
  }

  // No pool: the node was created with operator new.
  delete node;
}
avltree::avltree(UINT64 numitems)
{
root=NULL;
如果(!numitems)
缓冲区=空;
否则{
UINT64 memsize=sizeof(avlnode)*numitems+bitlist::storagesize(numitems);
缓冲区=(avlnode*)malloc(memsize);
init(numitems,buffer+numitems);
memmap.clear_all();
freeaddr=0;
}
}
avlnode*avltree::newnode(键类型键)
{
如果(!缓冲区)
返回新的avlnode(键);
其他的
{
UINT64位置;
if(freeaddr
为了使用标准的new/delete,我必须用numitems==0构造树。为了使用我自己的分配器,我只传递项目数。所有函数都是内联的,以获得最佳性能

这一切都运行正常,但我自己的分配器比 new/delete 慢了大约 20%。我知道通用内存分配器有多复杂,按理说它的代码不可能比“一次数组查找加一次位设置”更快,可实际情况恰恰如此。更糟糕的是:即使我把释放函数(deallocator)里的代码全部删掉,它仍然比 delete 慢。

当我检查程序集输出时,分配器的代码路径上布满了处理位图、avltree或avlnode的QWORD PTR指令。对于new/delete路径,似乎没有太大不同

例如,avltree::newnode的程序集输出:

;avltree::newnode, COMDAT
; Register use visible in this listing:
;   rcx = `this` on entry (base of the buffer/freeaddr/memmap accesses), copied to rbx
;   edx = `key` on entry, copied to edi and stored into the node at the end
;   r9  = `pos` on the pool path
mov     QWORD PTR [rsp+8], rbx
push    rdi
sub     rsp, 32

;if (!buffer)
; [rcx+8] is the `buffer` member
cmp     QWORD PTR [rcx+8], 0
mov     edi, edx
mov     rbx, rcx
jne     SHORT $LN4@newnode

;  return new avlnode(key);
; 24 is the byte count requested from operator new for one avlnode
mov     ecx, 24
call    ??2@YAPEAX_K@Z         ; operator new
jmp     SHORT $LN27@newnode

;$LN4@newnode:
;else {
;  UINT64 pos;
;  if (freeaddr < memmap.size_bits)
; [rcx+40] is `freeaddr` (it is written back below), [rcx+32] is `memmap.size_bits`
mov     r9, QWORD PTR [rcx+40]
cmp     r9, QWORD PTR [rcx+32]
jae     SHORT $LN2@newnode

;    pos = freeaddr++;
lea     rax, QWORD PTR [r9+1]
mov     QWORD PTR [rcx+40], rax

;  else
jmp     SHORT $LN1@newnode
$LN2@newnode:

;    pos = memmap.get_first_unset();
; rcx+16 is the address of `memmap`, passed as the callee's `this`;
; this is the only path on which the bitmap search is a real call
add     rcx, 16
call    ?get_first_unset@bitlist@@QEAA_KXZ ; bitlist::get_first_unset
mov     r9, rax

$LN1@newnode:
; memmap.set_bit(pos);
; set_bit was inlined: word index = pos >> 6, bit mask = 1 << (pos & 63)
mov     rcx, QWORD PTR [rbx+16]                    ;data[bindex(pos)] |= bmask(pos);
mov     rdx, r9                                    ;return pos / (sizeof(BITINT) * 8);
shr     rdx, 6
lea     r8, QWORD PTR [rcx+rdx*8]                  ;data[bindex(pos)] |= bmask(pos);
movzx   ecx, r9b                                   ;return 1ull << (pos % (sizeof(BITINT) * 8));
mov     edx, 1
and     cl, 63
shl     rdx, cl

;   return new (&buffer[pos]) avlnode(key);
; rcx = pos * 3; combined with the *8 scale below this is pos * 24 bytes
lea     rcx, QWORD PTR [r9+r9*2]
; File c:\projects\vvd\vvd\util\bitlist.h
or      QWORD PTR [r8], rdx                        ;data[bindex(pos)] |= bmask(pos)

; 195  :     return new (&buffer[pos]) avlnode(key);
mov     rax, QWORD PTR [rbx+8]
lea     rax, QWORD PTR [rax+rcx*8]
; $LN27@newnode:
; both paths meet here; the constructor stores are skipped when rax is 0
test    rax, rax
je      SHORT $LN9@newnode

; avlnode constructor; 
; four stores: byte 1 at +4, zero qwords at +8 and +16, and `key` (edi) at +0
mov     BYTE PTR [rax+4], 1
mov     QWORD PTR [rax+8], 0
mov     QWORD PTR [rax+16], 0
mov     DWORD PTR [rax], edi

; 196  :   }
; 197  : }
; $LN9@newnode:
mov     rbx, QWORD PTR [rsp+48]
add     rsp, 32                        ; 00000020H
pop     rdi
ret     0
?newnode@avltree@@QEAAPEAUavlnode@@H@Z ENDP             ; avltree::newnode
_TEXT   ENDS
;avltree::newnode,COMDAT
mov QWORD PTR[rsp+8],rbx
推动rdi
副区长,32
;如果(!缓冲区)
cmp QWORD PTR[rcx+8],0
电子数据交换
mov-rbx,rcx
jne短$LN4@newnode
;  返回新的avlnode(键);
mov ecx,24
呼叫2@YAPEAX_K@Z;新接线员
jmp短$LN27@newnode
;$LN4@newnode:
;否则{
;UINT64位置;
;if(freeaddrmovzx ecx,r9b;return 1all

仅凭这么少的代码很难下定论,但我猜问题出在引用局部性(locality of reference)上:存放元数据的位图与被分配的内存本身不在同一条缓存行上。另外,get_first_unset 很可能是线性搜索。

我知道通用内存分配器有多复杂,按理说它的代码不可能比“一次数组查找加一次位设置”更快,可实际情况恰恰如此


这种说法其实并不完全正确。一个像样的、基于分桶(bucketing)的低碎片堆,分配操作是 O(1) 的,常数开销非常低(而且几乎没有额外的空间开销)。我以前见过一个版本,可以精简到约 18 条汇编指令(含一个分支),这比你的代码少得多。请记住,堆的整体实现可能非常复杂,但它的快速路径可以非常非常快。

你的方法只是一次性分配了一整块原始内存,之后仍然必须对每个元素执行一次 placement new。再加上位图带来的全部开销,在堆为空的前提下,默认的 new 胜过你的分配器也就不足为奇了。

为了在分配时获得最大的速度提高,您可以在一个大数组中分配整个对象,然后从那里分配给它。如果您查看一个非常简单和人为的基准:

// List node used by the two benchmark functions below.
struct test_t {
    float f;        // payload; not touched by TestNew/TestBucket
    int i;          // payload; not touched by TestNew/TestBucket
    test_t* pNext;  // link to the next node in the chain
};

// Number of nodes each benchmark function appends to its chain.
const size_t NUM_ALLOCS = 50000000;

// Benchmark: builds a chain of NUM_ALLOCS + 1 nodes, taking every node from
// the global heap with its own `new`.
// The nodes are never freed, so that only allocation is being timed; each
// call therefore leaks the whole chain and should only be made from a
// short-lived benchmark process.
void TestNew (void)
{
    test_t* pPtr = new test_t;

    // The counter is size_t because NUM_ALLOCS is size_t; an int counter
    // made this a signed/unsigned comparison.
    for (size_t i = 0; i < NUM_ALLOCS; ++i)
    {
        pPtr->pNext = new test_t;
        pPtr = pPtr->pNext;
    }

}

void TestBucket (void)
{
    test_t* pBuckets = new test_t[NUM_ALLOCS + 2];
    test_t* pPtr = pBuckets++;

    for (int i = 0; i < NUM_ALLOCS; ++i)
    {
        pPtr->pNext = pBuckets++;
        pPtr = pPtr->pNext;
    }

}

目前,TestNew() 和 TestClass(0) 的耗时都在 800 ms 左右,而 TestClass(NUM_ALLOCS+10) 的耗时在 200 ms 以下。自定义分配器之所以快得多,是因为它以完全线性的方式访问内存,让缓存得以充分发挥作用。为简单起见,我使用 GetTickCount() 计时;只要被测时间在约 100 ms 以上,它的精度就足够了。

仅作为参考,下面的代码是解决当前问题最有效的代码

这只是一个简单的avltree实现,但在我的2600K@4.6 GHz上,1000万次插入时达到1.7秒,相同次数的删除时达到1.4秒

#include "stdafx.h"
#include <iostream>
#include <crtdbg.h>
#include <Windows.h>
#include <malloc.h>
#include <new>

#ifndef NULL
#define NULL 0
#endif

typedef int keytype;
typedef unsigned long long UINT64;

struct avlnode;

struct avltree
{
  avlnode *root;
  avlnode *buffer;
  avlnode *firstfree;

  avltree() : avltree(0) {};
  avltree(UINT64 numitems);

  inline avlnode *newnode(keytype key);
  inline void deletenode(avlnode *node);

  void insert(keytype key) { root = insert(root, key); }
  void remove(keytype key) { root = remove(root, key); }
  int height();
  bool hasitems() { return root != NULL; }
private:
  avlnode *insert(avlnode *node, keytype k);
  avlnode *remove(avlnode *node, keytype k);
};

#pragma pack(1)
// One node of the AVL tree.
struct avlnode
{
  avlnode *left;     // left child, NULL when absent
  avlnode *right;    // right child, NULL when absent
  keytype key;       // node key
  unsigned char hgt; // height of the subtree rooted here (a leaf has 1)

  // Creates a leaf holding `k`.
  avlnode(int k)
  {
    key = k;
    left = right = NULL;
    hgt = 1;
  }

  // Recomputes this node's height and, if the subtree has become unbalanced
  // by two levels, rotates it back into shape.
  // Returns the node that is the root of the subtree afterwards (this node
  // when no rotation was needed).
  //
  // The helpers take pointers rather than references: absent children are
  // NULL, and forming a reference from a NULL pointer just to test its
  // address afterwards is undefined behaviour that optimising compilers are
  // entitled to remove.
  avlnode &balance()
  {
    struct F
    {
      // Height of a possibly absent subtree.
      static unsigned char height(const avlnode *node)
      {
        return node ? node->hgt : 0;
      }
      // Right height minus left height; 0 for an absent subtree.
      static int balance(const avlnode *node)
      {
        return node ? height(node->right) - height(node->left) : 0;
      }
      // Refreshes node->hgt from its children and returns the balance factor.
      // `node` must not be NULL.
      static int fixheight(avlnode *node)
      {
        const unsigned char hl = height(node->left);
        const unsigned char hr = height(node->right);
        node->hgt = (hl > hr ? hl : hr) + 1;
        return hr - hl;
      }
      // Left rotation; node->right must exist and becomes the subtree root.
      static avlnode *rotateleft(avlnode *node)
      {
        avlnode *p = node->right;
        node->right = p->left;
        p->left = node;
        fixheight(node); // node is now below p, so its height is fixed first
        fixheight(p);
        return p;
      }
      // Right rotation; node->left must exist and becomes the subtree root.
      static avlnode *rotateright(avlnode *node)
      {
        avlnode *q = node->left;
        node->left = q->right;
        q->right = node;
        fixheight(node); // node is now below q, so its height is fixed first
        fixheight(q);
        return q;
      }
      static avlnode *rebalance(avlnode *node)
      {
        const int bal = fixheight(node);
        if (bal == 2) {
          // Right-heavy; a left-leaning right child needs a double rotation.
          if (balance(node->right) < 0)
            node->right = rotateright(node->right);
          return rotateleft(node);
        }
        if (bal == -2) {
          // Left-heavy; a right-leaning left child needs a double rotation.
          if (balance(node->left) > 0)
            node->left = rotateleft(node->left);
          return rotateright(node);
        }
        return node; // balancing is not required
      }
    };
    return *F::rebalance(this);
  }
};

// Creates an empty tree. When numitems is non-zero, a block of node slots is
// allocated and every usable slot is put on the free list.
// Throws std::bad_alloc when the block cannot be allocated.
avltree::avltree(UINT64 numitems)
{
  root = buffer = firstfree = NULL;
  if (!numitems)
    return;

  // numitems + 1 slots are requested because slot 0 is never put on the free
  // list (the loop below stops before it). Reject counts whose byte size
  // would not fit in size_t rather than letting the multiplication wrap.
  const UINT64 maxslots = static_cast<size_t>(-1) / sizeof(avlnode);
  if (numitems >= maxslots)
    throw std::bad_alloc();

  const size_t bytes = sizeof(avlnode) * static_cast<size_t>(numitems + 1);
  buffer = static_cast<avlnode *>(malloc(bytes));
  // Previously a failed malloc() was written through without a check.
  if (!buffer)
    throw std::bad_alloc();

  // Push the slots from the last one down to slot 1, so that the list ends up
  // in ascending address order starting at buffer[1].
  for (UINT64 i = numitems; i > 0; --i) {
    buffer[i].right = firstfree;
    firstfree = &buffer[i];
  }
}

// Takes the first slot off the free list and constructs a node for `key` in it.
// Throws std::bad_alloc when no slot is left, which is the case for a tree
// built with numitems == 0 and for a tree that already holds numitems nodes.
avlnode *avltree::newnode(keytype key)
{
  avlnode *node = firstfree;
  // Without this check an empty free list was dereferenced below.
  // Falling back to `new avlnode(key)` here instead is possible, but then
  // deletenode() must be able to tell heap nodes from pool nodes, otherwise
  // such nodes are never freed.
  if (!node)
    throw std::bad_alloc();
  firstfree = node->right;
  return new (node) avlnode(key);
}

// Returns a node's slot to the pool by pushing it on the front of the free
// list; the node's `right` pointer is reused as the list link.
// Nodes are assumed to come from the pool. Supporting heap-allocated nodes
// would need a `if (!buffer) delete node;` branch here.
void avltree::deletenode(avlnode *node)
{
  avlnode *const previous_head = firstfree;
  firstfree = node;
  node->right = previous_head;
}

// Height of the whole tree as stored in the root node; 0 when empty.
int avltree::height()
{
  if (!root)
    return 0;
  return root->hgt;
}

// Inserts k into the subtree rooted at `node` and returns the subtree's new
// root. A key that is already present leaves the subtree unchanged.
avlnode *avltree::insert(avlnode *node, keytype k)
{
  // Reached an empty spot: the new leaf becomes this subtree.
  if (!node)
    return newnode(k);

  // Duplicate key: nothing to do and nothing to rebalance.
  if (k == node->key)
    return node;

  const bool go_left = k < node->key;
  if (go_left) {
    avlnode *const grown = insert(node->left, k);
    node->left = grown;
  } else {
    avlnode *const grown = insert(node->right, k);
    node->right = grown;
  }

  // The child may have become taller; restore the AVL property on the way up.
  avlnode &subtree_root = node->balance();
  return &subtree_root;
}

// Removes key k from the subtree rooted at `node` and returns the subtree's
// new root (NULL when the subtree becomes empty). A missing key leaves the
// structure unchanged.
avlnode *avltree::remove(avlnode *node, keytype k)
{
  if (!node)
    return NULL;
  if (k < node->key)
    node->left = remove(node->left, k);
  else if (k > node->key)
    node->right = remove(node->right, k);
  else // k == node->key
  {
    // Save both children first: deletenode() overwrites node->right.
    avlnode *l = node->left;
    avlnode *r = node->right;
    deletenode(node);
    if (!r) return l;

    // The helpers work on pointers. The earlier reference-based version
    // returned `*node.right` even when right was NULL, i.e. it created a
    // reference from a NULL pointer, which is undefined behaviour.
    struct F
    {
      // Leftmost (smallest) node of a non-empty subtree.
      static avlnode *findmin(avlnode *n)
      {
        while (n->left)
          n = n->left;
        return n;
      }
      // Unlinks the leftmost node of a non-empty subtree and returns the
      // rebalanced remainder, which may be NULL.
      static avlnode *removemin(avlnode *n)
      {
        if (!n->left)
          return n->right;
        n->left = removemin(n->left);
        return &n->balance();
      }
    };

    // The smallest key of the right subtree takes the removed node's place.
    avlnode *successor = F::findmin(r);
    successor->right = F::removemin(r);
    successor->left = l;
    return &successor->balance();
  }
  return &node->balance();
}
using namespace std;

// Benchmark driver: inserts nodescount ascending keys, then removes the root
// key until the tree is empty, and prints the wall-clock time of both phases
// measured with QueryPerformanceCounter.
int _tmain(int argc, _TCHAR* argv[])
{
  // 64 bit release performance (for 10.000.000 nodes)
  // malloc:       insertion: 2,595  deletion 1,865
  // my allocator: insertion: 2,980  deletion 2,270
  const int nodescount = 10000000;

  // A named object: binding a non-const reference to a temporary
  // (`avltree &tree = avltree(...)`) is not standard C++.
  avltree tree(nodescount);
  cout << "sizeof avlnode " << sizeof(avlnode) << endl;
  cout << "inserting " << nodescount << " nodes" << endl;
  LARGE_INTEGER t1, t2, freq;
  QueryPerformanceFrequency(&freq);

  // Phase 1: insertion.
  QueryPerformanceCounter(&t1);
  for (int i = 1; i <= nodescount; i++)
    tree.insert(i);
  QueryPerformanceCounter(&t2);
  cout << "Tree height " << static_cast<int>(tree.height()) << endl;
  cout << "Insertion time: " << (static_cast<double>(t2.QuadPart) - t1.QuadPart) / freq.QuadPart << " s" << endl;

  // Phase 2: deletion, always of the current root key.
  QueryPerformanceCounter(&t1);
  while (tree.hasitems())
    tree.remove(tree.root->key);
  QueryPerformanceCounter(&t2);
  cout << "Deletion time: " << (static_cast<double>(t2.QuadPart) - t1.QuadPart) / freq.QuadPart << " s" << endl;

#ifdef _DEBUG
  // Debug CRT heap statistics.
  _CrtMemState mem;
  _CrtMemCheckpoint(&mem);
  cout << "Memory used: " << mem.lTotalCount << " high: " << mem.lHighWaterCount << endl;
#endif
    return 0;
}
#包括“stdafx.h”
#包括
#包括
#包括
#包括
#包括
#ifndefnull
#定义空0
#恩迪夫
typedef int-keype;
typedef无符号长UINT64;
结构avlnode;
结构avltree
{
avlnode*根;
avlnode*缓冲区;
avlnode*firstfree;
avltree():avltree(0){};
avltree(UINT64 numitems);
内联avlnode*newnode(键类型键);
内联void deletenode(avlnode*节点);
void insert(keytype键){root=insert(root,键);}
void remove(keytype key){root=remove(root,key);}
int高度();
bool hasitems(){返回根!=NULL;}
私人:
avlnode*插入(avlnode*节点,键类型k);
avlnode*移除(avlnode*节点,键类型k);
};
#布拉格语包(1)
结构avlnode
{
avlnode*left;//左指针
avlnode*right;//右指针
keytype键;//节点键
unsigned char hgt;//节点的高度
avlnode(int k)
{
key=k;
左=右=空;
hgt=1;
}
avlnode&balance()
{
结构F
{
无符号字符高度(avlnode和node)
{
return&node?node.hgt:0;
#include "stdafx.h"
#include <iostream>
#include <crtdbg.h>
#include <Windows.h>
#include <malloc.h>
#include <new>

#ifndef NULL
#define NULL 0
#endif

typedef int keytype;
typedef unsigned long long UINT64;

struct avlnode;

struct avltree
{
  avlnode *root;
  avlnode *buffer;
  avlnode *firstfree;

  avltree() : avltree(0) {};
  avltree(UINT64 numitems);

  inline avlnode *newnode(keytype key);
  inline void deletenode(avlnode *node);

  void insert(keytype key) { root = insert(root, key); }
  void remove(keytype key) { root = remove(root, key); }
  int height();
  bool hasitems() { return root != NULL; }
private:
  avlnode *insert(avlnode *node, keytype k);
  avlnode *remove(avlnode *node, keytype k);
};

#pragma pack(1)
// One node of the AVL tree.
struct avlnode
{
  avlnode *left;     // left child, NULL when absent
  avlnode *right;    // right child, NULL when absent
  keytype key;       // node key
  unsigned char hgt; // height of the subtree rooted here (a leaf has 1)

  // Creates a leaf holding `k`.
  avlnode(int k)
  {
    key = k;
    left = right = NULL;
    hgt = 1;
  }

  // Recomputes this node's height and, if the subtree has become unbalanced
  // by two levels, rotates it back into shape.
  // Returns the node that is the root of the subtree afterwards (this node
  // when no rotation was needed).
  //
  // The helpers take pointers rather than references: absent children are
  // NULL, and forming a reference from a NULL pointer just to test its
  // address afterwards is undefined behaviour that optimising compilers are
  // entitled to remove.
  avlnode &balance()
  {
    struct F
    {
      // Height of a possibly absent subtree.
      static unsigned char height(const avlnode *node)
      {
        return node ? node->hgt : 0;
      }
      // Right height minus left height; 0 for an absent subtree.
      static int balance(const avlnode *node)
      {
        return node ? height(node->right) - height(node->left) : 0;
      }
      // Refreshes node->hgt from its children and returns the balance factor.
      // `node` must not be NULL.
      static int fixheight(avlnode *node)
      {
        const unsigned char hl = height(node->left);
        const unsigned char hr = height(node->right);
        node->hgt = (hl > hr ? hl : hr) + 1;
        return hr - hl;
      }
      // Left rotation; node->right must exist and becomes the subtree root.
      static avlnode *rotateleft(avlnode *node)
      {
        avlnode *p = node->right;
        node->right = p->left;
        p->left = node;
        fixheight(node); // node is now below p, so its height is fixed first
        fixheight(p);
        return p;
      }
      // Right rotation; node->left must exist and becomes the subtree root.
      static avlnode *rotateright(avlnode *node)
      {
        avlnode *q = node->left;
        node->left = q->right;
        q->right = node;
        fixheight(node); // node is now below q, so its height is fixed first
        fixheight(q);
        return q;
      }
      static avlnode *rebalance(avlnode *node)
      {
        const int bal = fixheight(node);
        if (bal == 2) {
          // Right-heavy; a left-leaning right child needs a double rotation.
          if (balance(node->right) < 0)
            node->right = rotateright(node->right);
          return rotateleft(node);
        }
        if (bal == -2) {
          // Left-heavy; a right-leaning left child needs a double rotation.
          if (balance(node->left) > 0)
            node->left = rotateleft(node->left);
          return rotateright(node);
        }
        return node; // balancing is not required
      }
    };
    return *F::rebalance(this);
  }
};

// Creates an empty tree. When numitems is non-zero, a block of node slots is
// allocated and every usable slot is put on the free list.
// Throws std::bad_alloc when the block cannot be allocated.
avltree::avltree(UINT64 numitems)
{
  root = buffer = firstfree = NULL;
  if (!numitems)
    return;

  // numitems + 1 slots are requested because slot 0 is never put on the free
  // list (the loop below stops before it). Reject counts whose byte size
  // would not fit in size_t rather than letting the multiplication wrap.
  const UINT64 maxslots = static_cast<size_t>(-1) / sizeof(avlnode);
  if (numitems >= maxslots)
    throw std::bad_alloc();

  const size_t bytes = sizeof(avlnode) * static_cast<size_t>(numitems + 1);
  buffer = static_cast<avlnode *>(malloc(bytes));
  // Previously a failed malloc() was written through without a check.
  if (!buffer)
    throw std::bad_alloc();

  // Push the slots from the last one down to slot 1, so that the list ends up
  // in ascending address order starting at buffer[1].
  for (UINT64 i = numitems; i > 0; --i) {
    buffer[i].right = firstfree;
    firstfree = &buffer[i];
  }
}

// Takes the first slot off the free list and constructs a node for `key` in it.
// Throws std::bad_alloc when no slot is left, which is the case for a tree
// built with numitems == 0 and for a tree that already holds numitems nodes.
avlnode *avltree::newnode(keytype key)
{
  avlnode *node = firstfree;
  // Without this check an empty free list was dereferenced below.
  // Falling back to `new avlnode(key)` here instead is possible, but then
  // deletenode() must be able to tell heap nodes from pool nodes, otherwise
  // such nodes are never freed.
  if (!node)
    throw std::bad_alloc();
  firstfree = node->right;
  return new (node) avlnode(key);
}

// Returns a node's slot to the pool by pushing it on the front of the free
// list; the node's `right` pointer is reused as the list link.
// Nodes are assumed to come from the pool. Supporting heap-allocated nodes
// would need a `if (!buffer) delete node;` branch here.
void avltree::deletenode(avlnode *node)
{
  avlnode *const previous_head = firstfree;
  firstfree = node;
  node->right = previous_head;
}

// Height of the whole tree as stored in the root node; 0 when empty.
int avltree::height()
{
  if (!root)
    return 0;
  return root->hgt;
}

// Inserts k into the subtree rooted at `node` and returns the subtree's new
// root. A key that is already present leaves the subtree unchanged.
avlnode *avltree::insert(avlnode *node, keytype k)
{
  // Reached an empty spot: the new leaf becomes this subtree.
  if (!node)
    return newnode(k);

  // Duplicate key: nothing to do and nothing to rebalance.
  if (k == node->key)
    return node;

  const bool go_left = k < node->key;
  if (go_left) {
    avlnode *const grown = insert(node->left, k);
    node->left = grown;
  } else {
    avlnode *const grown = insert(node->right, k);
    node->right = grown;
  }

  // The child may have become taller; restore the AVL property on the way up.
  avlnode &subtree_root = node->balance();
  return &subtree_root;
}

// Removes key k from the subtree rooted at `node` and returns the subtree's
// new root (NULL when the subtree becomes empty). A missing key leaves the
// structure unchanged.
avlnode *avltree::remove(avlnode *node, keytype k)
{
  if (!node)
    return NULL;
  if (k < node->key)
    node->left = remove(node->left, k);
  else if (k > node->key)
    node->right = remove(node->right, k);
  else // k == node->key
  {
    // Save both children first: deletenode() overwrites node->right.
    avlnode *l = node->left;
    avlnode *r = node->right;
    deletenode(node);
    if (!r) return l;

    // The helpers work on pointers. The earlier reference-based version
    // returned `*node.right` even when right was NULL, i.e. it created a
    // reference from a NULL pointer, which is undefined behaviour.
    struct F
    {
      // Leftmost (smallest) node of a non-empty subtree.
      static avlnode *findmin(avlnode *n)
      {
        while (n->left)
          n = n->left;
        return n;
      }
      // Unlinks the leftmost node of a non-empty subtree and returns the
      // rebalanced remainder, which may be NULL.
      static avlnode *removemin(avlnode *n)
      {
        if (!n->left)
          return n->right;
        n->left = removemin(n->left);
        return &n->balance();
      }
    };

    // The smallest key of the right subtree takes the removed node's place.
    avlnode *successor = F::findmin(r);
    successor->right = F::removemin(r);
    successor->left = l;
    return &successor->balance();
  }
  return &node->balance();
}
using namespace std;

// Benchmark driver: inserts nodescount ascending keys, then removes the root
// key until the tree is empty, and prints the wall-clock time of both phases
// measured with QueryPerformanceCounter.
int _tmain(int argc, _TCHAR* argv[])
{
  // 64 bit release performance (for 10.000.000 nodes)
  // malloc:       insertion: 2,595  deletion 1,865
  // my allocator: insertion: 2,980  deletion 2,270
  const int nodescount = 10000000;

  // A named object: binding a non-const reference to a temporary
  // (`avltree &tree = avltree(...)`) is not standard C++.
  avltree tree(nodescount);
  cout << "sizeof avlnode " << sizeof(avlnode) << endl;
  cout << "inserting " << nodescount << " nodes" << endl;
  LARGE_INTEGER t1, t2, freq;
  QueryPerformanceFrequency(&freq);

  // Phase 1: insertion.
  QueryPerformanceCounter(&t1);
  for (int i = 1; i <= nodescount; i++)
    tree.insert(i);
  QueryPerformanceCounter(&t2);
  cout << "Tree height " << static_cast<int>(tree.height()) << endl;
  cout << "Insertion time: " << (static_cast<double>(t2.QuadPart) - t1.QuadPart) / freq.QuadPart << " s" << endl;

  // Phase 2: deletion, always of the current root key.
  QueryPerformanceCounter(&t1);
  while (tree.hasitems())
    tree.remove(tree.root->key);
  QueryPerformanceCounter(&t2);
  cout << "Deletion time: " << (static_cast<double>(t2.QuadPart) - t1.QuadPart) / freq.QuadPart << " s" << endl;

#ifdef _DEBUG
  // Debug CRT heap statistics.
  _CrtMemState mem;
  _CrtMemCheckpoint(&mem);
  cout << "Memory used: " << mem.lTotalCount << " high: " << mem.lHighWaterCount << endl;
#endif
    return 0;
}