C++ MSVC 14中的慢CRTP模式_C++_C++11_Visual C++_Visual Studio 2015_Crtp

C++ MSVC 14中的慢CRTP模式

c++ c++11 visual-c++ visual-studio-2015

C++ MSVC 14中的慢CRTP模式,c++,c++11,visual-c++,visual-studio-2015,crtp,C++,C++11,Visual C++,Visual Studio 2015,Crtp,我正在使用CRTP模式，并注意到MSVC 14（Visual Studio 2015 Update 1）无法优化以下CRTP模式的开销（即使使用-O2） template <typename T, size_t N, typename Derived> struct Base { __forceinline Derived& operator+=(const Derived &b) { return (static_cast<De

我正在使用CRTP模式，并注意到MSVC 14（Visual Studio 2015 Update 1）无法优化以下CRTP模式的开销（即使使用-O2）

/// CRTP mixin that gives `Derived` a compound-addition operator.
///
/// `Derived` must support the expression `derived + b` (with `b` a
/// `const Derived&`) and assignment of that result back to a `Derived`.
/// `T` and `N` are not used by the mixin itself.
template <typename T, size_t N, typename Derived>
struct Base
{
    /// Replaces the derived object with `self + b` and returns the assigned object.
    __forceinline Derived& operator+=(const Derived &b)
    {
        Derived &self = static_cast<Derived&>(*this);
        return self = self + b;
    }
};

// Fixed-size array of N values of type T. Derives from the CRTP base
// Base<T, N, A<T, N>>, which supplies operator+=.
template <typename T, size_t N>
struct A: Base<T,N, A<T,N>>
{
    T data[N]; // element storage, accessed directly by the free operator+
};

/// Element-wise sum of two A<T, N> values.
///
/// @param a left operand
/// @param b right operand
/// @return a new A<T, N> whose data[i] is a.data[i] + b.data[i] for every
///         index the unroll helper visits
template <typename T, size_t N>
__forceinline A<T, N> operator+(const A<T, N> &a, const A<T, N> &b)
{
    A<T, N> result;

    // just an unroll helper, because MSVC doesn't unroll small loops
    // NOTE(review): unroll_for is not defined in this snippet; presumably it
    // invokes the functor once for each index in [0, N) -- confirm.
    // The lambda writes `result` and reads `a` and `b`, so it has to capture
    // by reference; an empty capture list cannot name those locals.
    unroll_for<0, N>([&](size_t i)
    {
        result.data[i] = a.data[i] + b.data[i];
    });

    return result;
}

// Minimal driver: performs a single CRTP-provided `+=` on two 4-float values.
int main()
{
    // Value-initialize both operands so that `+=` never reads
    // indeterminate floats.
    A <float,4> a{};
    A <float,4> b{};

    a += b;

    return 0;
}

我得到的结果与你相反：第二个循环在我这里慢了4倍。你构建的是发行版（Release）吗？我看到你的编辑了。那么问题是——如何测量？请使用QueryPerformanceCounter，它是单调的，且分辨率高（至少在现代Windows平台上是如此）。您是否使用了Visual Studio 2015 Update 1？@plasmacel 您的两个实现并不等价。如果您把各处不必要的拷贝都去掉，它们在我这里的性能就突然变得相同了。是的，但通常的做法正好相反——应当基于 `+=` 来实现 `+`，而不是反过来。

#include <iostream>
#include <chrono>
#include <string>

/// CRTP mixin: implements `+=` for `Derived` in terms of its binary `+`.
///
/// The sum `derived + b` is computed first and then assigned back to the
/// derived object. `T` and `N` are not used by the mixin itself.
template <typename T, size_t N, typename Derived>
struct Base
{
    /// Assigns `*derived + b` to the derived object and returns the assigned object.
    __forceinline Derived& operator+=(const Derived &b)
    {
        Derived *const self = static_cast<Derived*>(this);
        return *self = *self + b;
    }
};

// Same role as Base, but the element-wise addition is written out by hand
// inside operator+= instead of going through the binary operator+.
//
// The body is unrolled for exactly four elements, so operator+= may only be
// instantiated with N == 4; any other N is rejected at compile time instead
// of indexing out of bounds (N < 4) or silently skipping elements (N > 4).
template <typename T, size_t N, typename Derived>
struct Base2
{
    /// Adds the four elements of `b.data` to the derived object's `data`
    /// in place and returns the derived object.
    __forceinline Derived& operator+=(const Derived &b)
    {
        static_assert(N == 4, "Base2::operator+= is hand-unrolled for exactly 4 elements");

        Derived &self = static_cast<Derived&>(*this);

        self.data[0] += b.data[0];
        self.data[1] += b.data[1];
        self.data[2] += b.data[2];
        self.data[3] += b.data[3];

        return self;
    }
};

// Fixed-size array of N values of type T whose operator+= comes from Base,
// i.e. it is implemented through the free operator+ and an assignment.
template <typename T, size_t N>
struct A: Base<T,N, A<T,N>>
{
    T data[N]; // element storage, accessed directly by the free operator+
};

// Fixed-size array of N values of type T whose operator+= comes from Base2,
// i.e. the hand-written in-place element-wise addition.
template <typename T, size_t N>
struct B: Base2<T,N, B<T,N>>
{
    T data[N]; // element storage, updated in place by Base2::operator+=
};

/// Element-wise sum of two A<T, N> values, unrolled by hand.
///
/// The body addresses elements 0..3 explicitly, so it is only valid for
/// N == 4; any other N is rejected at compile time instead of indexing out of
/// bounds (N < 4) or leaving trailing elements unset (N > 4).
///
/// @param a left operand
/// @param b right operand
/// @return a new A<T, N> with data[i] == a.data[i] + b.data[i] for i in 0..3
template <typename T, size_t N>
__forceinline A<T, N> operator+(const A<T, N> &a, const A<T, N> &b)
{
    static_assert(N == 4, "operator+ for A is hand-unrolled for exactly 4 elements");

    A<T, N> result;

    result.data[0] = a.data[0] + b.data[0];
    result.data[1] = a.data[1] + b.data[1];
    result.data[2] = a.data[2] + b.data[2];
    result.data[3] = a.data[3] + b.data[3];

    return result;
}

/// Calls `func(i)` for i = 0 .. n-1 and prints the total elapsed time.
///
/// @param func  callable invoked with the current iteration index
/// @param n     number of invocations
/// @param label text printed in front of the measurement
///
/// Output format on std::cout: "<label> : <elapsed microseconds>\n".
template <class Func>
void runtime_print(Func &&func, size_t n, std::string label)
{
    // steady_clock is monotonic, so the measured interval cannot be skewed by
    // wall-clock adjustments; high_resolution_clock gives no such guarantee.
    using clock = std::chrono::steady_clock;

    const auto start = clock::now();
    for (size_t i = 0; i < n; ++i)
    {
        func(i);
    }
    const auto stop = clock::now();

    const auto elapsed_us =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();

    std::cout << label << " : " << elapsed_us << '\n';
}


// Benchmark driver: times n repetitions of `+=` for the operator+-based
// variant (A) and for the hand-inlined variant (B).
int main()
{
    size_t n = 10000000;

    // Value-initialize every operand so the timed loops never read
    // indeterminate floats.
    A <float,4> a0{};
    A <float,4> a1{};

    B <float,4> b0{};
    B <float,4> b1{};

    // reported as slow on MSVC 14: += goes through operator+ and an assignment
    runtime_print([&](size_t)
    {
        a0 += a1;
    }, n, "a0");

    // reported as fast: += updates the elements in place
    runtime_print([&](size_t)
    {
        b0 += b1;
    }, n, "b0");

    return 0;
}