C++ 为什么std::vector这么快(或者我的实现太慢)

C++ 为什么std::vector这么快(或者我的实现太慢),c++,performance,c++11,stl,C++,Performance,C++11,Stl,前几天我在玩游戏,想看看我能把东西优化到什么程度。我决定从一个简单的地图开始,只做一个线性搜索,以找到元素是否存在,然后尝试优化它的大部分。另外,为了进行比较,我使用std::find对std::map和std::vector执行了相同的操作 地图的结果是预期的,创建和销毁速度比我的地图慢,但速度要快得多(实际上,我无法测量它,它始终返回0)。 问题在于std::vector。我期望它会比我的实现慢,但事实并非如此,我真的不明白它怎么会一样或更快,因为我的实现跳过了最坏的情况(该值不在向量中),

前几天我在随便折腾代码,想看看自己能把东西优化到什么程度。我决定从一个简单的映射(map)入手:它只用线性搜索来判断元素是否存在,然后我再尽量去优化它。另外,为了进行比较,我也对 std::map 以及配合 std::find 使用的 std::vector 做了相同的测试。

std::map 的结果符合预期:创建和销毁比我的映射慢,但查找要快得多(实际上我无法测出它的耗时,结果始终为 0)。问题在于 std::vector。我原以为它会比我的实现慢,但事实并非如此;我真的不明白它怎么会一样快甚至更快,因为我的实现会跳过最坏情况(要找的值不在容器中),并且还使用了结果缓存。

有人能在这里说明一下吗?我知道stl背后的人是半神,但这仍然没有意义

基准测试结果(i3、Windows 8.1 Pro 64、Visual Studio 2013):

以下是该映射(LinearMap0)的代码:

/// A small "linear map": a doubly-linked list of (int ID, T) pairs with a
/// sentinel head node, an optional pre-allocated node pool, conservative
/// [MinID, MaxID] bounds for early rejection, and a 4-entry round-robin
/// lookup cache.
///
/// Pointers returned by Find() stay valid until the element is removed with
/// DeleteLast() or the map is cleared with Release().
/// The class owns raw memory and is therefore non-copyable.
template<typename T>
class LinearMap0
{
public:
    /// Creates an empty map (only the sentinel node is allocated).
    LinearMap0()
    {
        _root = new Node;
        _end = _root;
        _prebuffer = nullptr;
        prebufferCapacity = 0;
        prebufferMarker = 0;
        ResetLookupState();
    }

    // Copying would make two maps delete the same nodes.
    LinearMap0(const LinearMap0&) = delete;
    LinearMap0& operator=(const LinearMap0&) = delete;

    /// Reserves a contiguous pool of `Count` nodes that AddElement() consumes
    /// before falling back to individual heap allocations.
    /// The call is ignored when `Count` is not positive or when nodes of an
    /// existing pool are already linked into the list (replacing that pool
    /// would leave the list pointing at freed memory).
    void PreAllocate(int Count)
    {
        if (Count <= 0 || prebufferMarker > 0) return;

        Node* fresh = new Node[Count];
        delete[] _prebuffer; // an unused earlier pool is replaced, not leaked
        _prebuffer = fresh;
        prebufferCapacity = Count;
    }

    ~LinearMap0()
    {
        Release();
        delete _root;
    }

    /// Removes every element and frees the node pool. The map is left empty
    /// and usable, so calling Release() repeatedly (or before destruction) is
    /// safe.
    void Release()
    {
        Node* marker = _end;
        while (marker != _root)
        {
            Node* prev = marker->Prev;
            // Pool nodes are freed in one go by delete[] below.
            if (!marker->IsPreAllocated) delete marker;
            marker = prev;
        }

        delete[] _prebuffer;
        _prebuffer = nullptr;
        prebufferCapacity = 0;
        prebufferMarker = 0;

        _root->Next = nullptr;
        _end = _root;
        ResetLookupState();
    }

    /// Appends `element` under `ID`. IDs are not required to be unique; when
    /// an ID is added more than once, Find() returns the most recent element.
    void AddElement(int ID, T element)
    {
        Node* tmp = nullptr;
        if (prebufferMarker < prebufferCapacity)
        {
            // Use a pre-allocated object
            tmp = &_prebuffer[prebufferMarker];
            prebufferMarker++;
            tmp->IsPreAllocated = true;
        }
        else
        {
            tmp = new Node;
        }

        tmp->ID = ID;
        tmp->Data = element;

        // Link at the tail
        tmp->Prev = _end;
        tmp->Next = nullptr;
        _end->Next = tmp;
        _end = tmp;

        if (ID < MinID) MinID = ID;
        if (ID > MaxID) MaxID = ID;

        // A cached "not found" (or an older duplicate) for this ID is stale now.
        ForgetCached(ID);
    }

    /// Removes the most recently added element; does nothing on an empty map.
    /// MinID/MaxID are intentionally left untouched: they remain valid (if
    /// looser) bounds. A removed pool node is not recycled.
    void DeleteLast()
    {
        if (_end == _root) return;

        Node* tmp = _end;
        _end = tmp->Prev;
        _end->Next = nullptr;

        // The cache may hold a pointer into the node that is going away.
        ForgetCached(tmp->ID);

        // Pool nodes belong to the new[] block and must not be deleted singly.
        if (!tmp->IsPreAllocated) delete tmp;
    }

    /// Calls `f(T&)` for every element, in insertion order when `Forward` is
    /// true and in reverse insertion order otherwise. The sentinel node is
    /// never visited.
    template<class Function>
    void Loop(Function&& f, bool Forward = true)
    {
        if (Forward)
        {
            for (Node* marker = _root->Next; marker != nullptr; marker = marker->Next)
                f(marker->Data);
        }
        else
        {
            for (Node* marker = _end; marker != _root; marker = marker->Prev)
                f(marker->Data);
        }
    }

    /// Returns a pointer to the most recently added element stored under
    /// `ID`, or nullptr when there is none. Both hits and misses are cached.
    T* Find(int ID)
    {
        // Bounds check
        if (ID < MinID || ID > MaxID) return nullptr;

        // Check the cache. Four scalar compares; this replaces an SSE sequence
        // that only inspected two of the four slots and relied on MSVC-only
        // __m128i member access.
        for (int i = 0; i < CacheSize; i++)
        {
            if (_cacheValid[i] && _cacheIDs[i] == ID)
                return _cacheBuffer[i];
        }

        // Traverse from the tail so the newest match is found first and the
        // scan can stop there.
        T* obj = nullptr;
        for (Node* marker = _end; marker != _root; marker = marker->Prev)
        {
            if (marker->ID == ID)
            {
                obj = &marker->Data;
                break;
            }
        }

        // Cache value (round-robin replacement) and return
        _cacheIDs[cacheMarker] = ID;
        _cacheBuffer[cacheMarker] = obj;
        _cacheValid[cacheMarker] = true;
        cacheMarker = (cacheMarker + 1) & (CacheSize - 1); // CacheSize is a power of two

        return obj;
    }

private:
    enum { CacheSize = 4 };

    struct Node
    {
        Node() : Data(), Prev(nullptr), Next(nullptr), IsPreAllocated(false), ID(-1)
        {
        }
        T Data;
        Node* Prev;
        Node* Next;
        bool IsPreAllocated; // true when the node lives inside _prebuffer
        int ID;
    };

    /// Drops every cache entry recorded for `ID`.
    void ForgetCached(int ID)
    {
        for (int i = 0; i < CacheSize; i++)
        {
            if (_cacheValid[i] && _cacheIDs[i] == ID)
            {
                _cacheValid[i] = false;
                _cacheBuffer[i] = nullptr;
            }
        }
    }

    /// Empties the cache and resets the ID bounds to "no elements".
    void ResetLookupState()
    {
        for (int i = 0; i < CacheSize; i++)
        {
            _cacheIDs[i] = 0;
            _cacheBuffer[i] = nullptr;
            _cacheValid[i] = false;
        }
        cacheMarker = 0;
        MinID = INT32_MAX;
        MaxID = INT32_MIN;
    }

    Node* _root; // sentinel head; carries no element
    Node* _end;  // last node, equal to _root when the map is empty

    Node* _prebuffer;
    int prebufferCapacity;
    int prebufferMarker; // index of the next unused pool node

    int _cacheIDs[CacheSize];
    T* _cacheBuffer[CacheSize];
    bool _cacheValid[CacheSize];
    int cacheMarker; // next cache slot to overwrite
    int MinID, MaxID;
};
模板
类线性化
{
公众:
LinearMap0()
{
_end=_root=新节点;
_prebuffer=nullptr;
预缓冲容量=0;
_活着=真;
预缓冲标记=0;
_缓存=_mm_set1_epi32(-1);
对于(自动&ptr:_cacheBuffer)ptr=nullptr;
MinID=INT32_MAX-1;
MaxID=-1;
}
无效预分配(整数计数)
{
预缓冲容量=计数;
_prebuffer=新节点[计数];
}
~LinearMap0()
{
如果(你还活着)
{
释放();
}
}
无效释放()
{
节点*标记=_结束;
while(标记->上一个)
{
marker=marker->Prev;
如果(!marker->Next->IsPreAllocated)删除marker->Next;
}
如果(!\u root->IsPreAllocated)删除\u root;
删除[]\u预缓冲区;
_活着=假;
}
无效加法元素(整数ID,T元素)
{
Node*tmp=nullptr;
if(预缓冲标记<预缓冲容量)
{
//使用预先分配的对象
tmp=&u prebuffer[prebufferMarker];
prebufferMarker++;
tmp->IsPreAllocated=true;
}
其他的
{
tmp=新节点;
}
tmp->ID=ID;
tmp->Data=元素;
//更新列表
_结束->下一步=tmp;
节点*prevEnd=\u end;
_end=tmp;
_结束->上一个=上一个;
bool-isMin=IDMaxID;MaxID=ID*isMax+(1-isMax)*MaxID;
}
void DeleteLast()
{
节点*tmp=\u结束;
_结束=_结束->上一步;
_结束->下一步=nullptr;
删除tmp;
}
模板
无效循环(函数&f,布尔向前=真)
{
如果(转发)
{
节点*标记=_根;
while(标记->下一步)
{
标记=标记->下一步;
f(标记->数据);
}
}
其他的
{
节点*标记=_结束;
while(标记->上一个)
{
marker=marker->Prev;
f(标记->数据);
}
}
}
T*Find(int-ID)
{
//边界检查
if(IDMaxID)返回nullptr;
//检查一下它在缓存里
//将该值与缓存中的每个值进行比较
__m128i idxSSE=_mm_set1_epi32(ID);
__m128ic=_-mm_-cmpeq_-epi32(_-cache,idxSSE);
//将表格-1更改为1
C=_mm_mul_epi32(C,_mm_set1_epi32(-1));
//现在C的4个成员中的每一个都有1个为真,或者0个为假。它应该只有一个集合为1
__m128i tmp=_mm_set1_epi32(1);
__m128i S=_mm_sub_epi32(tmp,C);
//现在找到索引
int i=S.m128i_i32[0]*(C.m128i_i32[1]+S.m128i_i32[1]*(2*C.m128i_i32[2]+S.m128i_i32[2]*(3*C.m128i_i32[3]+S.m128i_i32[3]*-1));
如果(i!=-1)
返回_cacheBuffer[i];
//遍历列表
节点*marker0=\u根;
T*obj=nullptr;
while(true)
{
if(marker0->ID==ID)
{
obj=&marker0->数据;
}
如果(marker0->Next)marker0=marker0->Next;否则中断;
}
//缓存值和返回值
_cache.m128i_i32[cacheMarker]=ID;
_cacheBuffer[cacheMarker]=obj;
cacheMarker=(cacheMarker+1)&3;//x&3=x%4
返回obj;
}
私人:
结构体类型
{
节点()
{
Prev=nullptr;
Next=nullptr;
IsPreAllocated=false;
ID=-1;
}
T数据;
节点*Prev;
节点*下一步;
布尔分布;
int-ID;
};
节点*_根;
节点*_端;
节点*\u预缓冲区;
int预缓冲容量;
int-prebufferMarker;
活蹦乱跳;
__m128i_高速缓存;
T*_缓存缓冲区[4];
int缓存标记;
int MinID,MaxID;
};
以下是基准:

// Benchmark driver: runs the same Build / Loop / Find / Release phases against
// std::vector, std::map and LinearMap0 and prints each timing in milliseconds.
// Every "Find" phase performs 15 rounds of 3 lookups; the printed figure is
// the average time of a single lookup (total / 15 / 3).

// Initialize seeds: the values 0 .. ecount-1 in random order
// NOTE: std::random_shuffle was removed in C++17; use std::shuffle when the
// toolchain is upgraded.
const __int64 ecount = 5 * 1000*1000;
vector<__int64> seed(ecount);
for (__int64 i = 0; i < ecount; i++)
{
    seed[i] = i;
}
random_shuffle(seed.begin(), seed.end());

///////////// std::vector

vector<__int64> v;

cout << "--------------------" << endl;
cout << "std::vector :" << endl;
cout << "   Build : " << time_call([&]()
{
    // First half goes into pre-sized storage, second half grows the vector,
    // mirroring LinearMap0's PreAllocate(ecount / 2) below.
    v.resize(ecount/2);
    for (__int64 i = 0; i < ecount; i++)
    {
        if (i < (ecount / 2))
            v[i] = seed[i];
        else
            v.push_back(seed[i]);
    }
}) << " ms" << endl;

cout << "   Loop : " << time_call([&]()
{
    for (auto& n : v)
        n /= 2;
}) << " ms" << endl;

bool found1, found2, found3;
cout << "   Find : " << ((static_cast<float>(time_call([&]()
{
    for (int i = 0; i < 15; i++)
    {
        // Should exist (the Loop phase halved every stored value)
        found1 = find(v.begin(), v.end(), seed[5] / 2) != v.end();
        found2 = find(v.begin(), v.end(), seed[1000] / 2) != v.end();

        // Shouldn't exist
        found3 = find(v.begin(), v.end(), -1234) != v.end();
    }
}))) / 15.0) / 3.0;
cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

cout << "   Release : " << time_call([&]()
{
    v.clear();
}) << " ms" << endl;

///////////// std::map

map<__int64, __int64> m;

cout << "--------------------" << endl;
cout << "std::map :" << endl;
cout << "   Build : " << time_call([&]()
{
    for (__int64 i = 0; i < ecount; i++)
    {
        m[seed[i]] = seed[i];
    }
}) << " ms" << endl;

cout << "   Loop : " << time_call([&]()
{
    for (auto& n : m)
        n.second /= 2;
}) << " ms" << endl;

cout << "   Find : " << ((static_cast<float>(time_call([&]()
{
    for (int i = 0; i < 15; i++)
    {
        // Should exist (keys are not modified by the Loop phase)
        found1 = m.find(seed[5]) != m.end();
        found2 = m.find(seed[1000]) != m.end();

        // Shouldn't exist
        found3 = m.find(-1234) != m.end();
    }
}))) / 15.0) / 3.0;
cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

// Unit added so this line matches every other timing line.
cout << "   Release : " << time_call([&]()
{
    m.clear();
}) << " ms" << endl;

///////////// Linear Map V0

LinearMap0<__int64> c;

cout << "--------------------" << endl;
cout << "Linear Map V0:" << endl;
cout << "   Build : " << time_call([&]()
{
    c.PreAllocate(ecount / 2);
    for (__int64 i = 0; i < ecount; i++)
    {
        // IDs are int in LinearMap0; ecount fits comfortably, so narrow explicitly.
        c.AddElement(static_cast<int>(seed[i]), seed[i]);
    }
}) << " ms" << endl;

cout << "   Loop : " << time_call([&]()
{
    c.Loop([](__int64& Data)
    {
        Data /= 2;
    });
}) << " ms" << endl;

cout << "   Find : " << ((static_cast<float>(time_call([&]()
{
    for (int i = 0; i < 15; i++)
    {
        // Should exist
        found1 = c.Find(static_cast<int>(seed[5])) != nullptr;
        found2 = c.Find(static_cast<int>(seed[1000])) != nullptr;

        // Shouldn't exist
        found3 = c.Find(-1234) != nullptr;
    }
}))) / 15.0) / 3.0;
cout << " ms -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

// Unit added so this line matches every other timing line.
cout << "   Release : " << time_call([&]()
{
    c.Release();
}) << " ms" << endl;
//初始化种子
常数int64 ecount=5*1000*1000;
载体种子(ecount);
for (__int64 i = 0; i < ecount; i++) …(此处代码在抓取时被截断,完整内容见上文的基准代码)

std::vector 的全部优势在于元素是紧凑存放的(内存中元素 1 紧跟在元素 0 之后,依此类推)。这对 CPU 来说是很大的优势,因为内存读取更容易预测。而当节点是在堆上逐个分配时,CPU 必须在内存中来回跳转才能取到数据。


可以参阅相关的讨论帖。

您的容器是一个链表,而
std::vector
是一个动态大小的数组

链表方法有很多好处,例如能够插入元素而无需重新分配

但是,数组方法有一些显著的优点:

  • 线性搜索只是顺序扫描内存,而缓存和预取器正是为这种访问模式设计的。对链表的扫描效率较低,因为每次跳转到未缓存的内存都意味着一次代价高昂的缓存未命中
  • 线性数组扫描易于向量化。如果使用
    -O3
    进行编译,则编译器可能会使用
    std::find
    的矢量化版本。由于内存依赖性,无法对链表扫描进行矢量化
  • 使用的内存量。您的链表必须为每个元素维护一个
    next
    指针,该指针实际上使您的每个元素变得更大。基准代码如下:
    // Benchmark driver: runs the same Build / Loop / Find / Release phases against
    // std::vector, std::map and LinearMap0 and prints each timing in milliseconds.
    // Every "Find" phase performs 15 rounds of 3 lookups; the printed figure is
    // the average time of a single lookup (total / 15 / 3).

    // Initialize seeds: the values 0 .. ecount-1 in random order
    // NOTE: std::random_shuffle was removed in C++17; use std::shuffle when the
    // toolchain is upgraded.
    const __int64 ecount = 5 * 1000*1000;
    vector<__int64> seed(ecount);
    for (__int64 i = 0; i < ecount; i++)
    {
        seed[i] = i;
    }
    random_shuffle(seed.begin(), seed.end());

    ///////////// std::vector

    vector<__int64> v;

    cout << "--------------------" << endl;
    cout << "std::vector :" << endl;
    cout << "   Build : " << time_call([&]()
    {
        // First half goes into pre-sized storage, second half grows the vector,
        // mirroring LinearMap0's PreAllocate(ecount / 2) below.
        v.resize(ecount/2);
        for (__int64 i = 0; i < ecount; i++)
        {
            if (i < (ecount / 2))
                v[i] = seed[i];
            else
                v.push_back(seed[i]);
        }
    }) << " ms" << endl;

    cout << "   Loop : " << time_call([&]()
    {
        for (auto& n : v)
            n /= 2;
    }) << " ms" << endl;

    bool found1, found2, found3;
    cout << "   Find : " << ((static_cast<float>(time_call([&]()
    {
        for (int i = 0; i < 15; i++)
        {
            // Should exist (the Loop phase halved every stored value)
            found1 = find(v.begin(), v.end(), seed[5] / 2) != v.end();
            found2 = find(v.begin(), v.end(), seed[1000] / 2) != v.end();

            // Shouldn't exist
            found3 = find(v.begin(), v.end(), -1234) != v.end();
        }
    }))) / 15.0) / 3.0;
    cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

    cout << "   Release : " << time_call([&]()
    {
        v.clear();
    }) << " ms" << endl;

    ///////////// std::map

    map<__int64, __int64> m;

    cout << "--------------------" << endl;
    cout << "std::map :" << endl;
    cout << "   Build : " << time_call([&]()
    {
        for (__int64 i = 0; i < ecount; i++)
        {
            m[seed[i]] = seed[i];
        }
    }) << " ms" << endl;

    cout << "   Loop : " << time_call([&]()
    {
        for (auto& n : m)
            n.second /= 2;
    }) << " ms" << endl;

    cout << "   Find : " << ((static_cast<float>(time_call([&]()
    {
        for (int i = 0; i < 15; i++)
        {
            // Should exist (keys are not modified by the Loop phase)
            found1 = m.find(seed[5]) != m.end();
            found2 = m.find(seed[1000]) != m.end();

            // Shouldn't exist
            found3 = m.find(-1234) != m.end();
        }
    }))) / 15.0) / 3.0;
    cout << " ms " << " -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

    // Unit added so this line matches every other timing line.
    cout << "   Release : " << time_call([&]()
    {
        m.clear();
    }) << " ms" << endl;

    ///////////// Linear Map V0

    LinearMap0<__int64> c;

    cout << "--------------------" << endl;
    cout << "Linear Map V0:" << endl;
    cout << "   Build : " << time_call([&]()
    {
        c.PreAllocate(ecount / 2);
        for (__int64 i = 0; i < ecount; i++)
        {
            // IDs are int in LinearMap0; ecount fits comfortably, so narrow explicitly.
            c.AddElement(static_cast<int>(seed[i]), seed[i]);
        }
    }) << " ms" << endl;

    cout << "   Loop : " << time_call([&]()
    {
        c.Loop([](__int64& Data)
        {
            Data /= 2;
        });
    }) << " ms" << endl;

    cout << "   Find : " << ((static_cast<float>(time_call([&]()
    {
        for (int i = 0; i < 15; i++)
        {
            // Should exist
            found1 = c.Find(static_cast<int>(seed[5])) != nullptr;
            found2 = c.Find(static_cast<int>(seed[1000])) != nullptr;

            // Shouldn't exist
            found3 = c.Find(-1234) != nullptr;
        }
    }))) / 15.0) / 3.0;
    cout << " ms -> First : " << ((found1) ? "Found" : "Not Found") << ", Second : " << ((found2) ? "Found" : "Not Found") << ", Third : " << ((found3) ? "Found" : "Not Found") << endl;

    // Unit added so this line matches every other timing line.
    cout << "   Release : " << time_call([&]()
    {
        c.Release();
    }) << " ms" << endl;
    
    /// Invokes `f()` exactly once and returns the elapsed time in milliseconds
    /// (fractional). Uses std::chrono::steady_clock, which is monotonic and
    /// therefore suited to measuring intervals; high_resolution_clock may be
    /// an alias of the adjustable system clock on some implementations.
    template <class Function>
    double time_call(Function&& f)
    {
        const std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
        f();
        const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

        // duration<double, milli> converts the tick count straight to fractional ms.
        return std::chrono::duration<double, std::milli>(end - start).count();
    }