C++ MSVC 14中的慢CRTP模式
我正在使用CRTP模式,并注意到MSVC 14(Visual Studio 2015 Update 1)即使使用-O2,也无法优化掉以下CRTP模式的开销:
标签:c++, c++11, visual-c++, visual-studio-2015, crtp
/// CRTP base that supplies compound addition (`+=`) to `Derived`.
///
/// `Derived` must offer a binary `operator+` callable as `derived + derived`
/// and must be assignable from the result of that expression.
template <typename T, size_t N, typename Derived>
struct Base
{
    /// Evaluates `self + b`, assigns the sum back to this object and returns
    /// whatever that assignment yields, converted to `Derived&`.
    __forceinline Derived& operator+=(const Derived &b)
    {
        // View this base sub-object as the complete derived object once.
        Derived &self = static_cast<Derived&>(*this);
        return (self = self + b);
    }
};
/// Fixed-size array of `N` values of type `T` that inherits `operator+=`
/// from the CRTP base `Base`.
template <typename T, size_t N>
struct A : public Base<T, N, A<T, N>>
{
    /// Element storage; left uninitialised by default construction.
    T data[N];
};
/// Element-wise sum of two `A<T, N>` values.
///
/// @param a left operand
/// @param b right operand
/// @return a new `A<T, N>` whose `data[i]` is `a.data[i] + b.data[i]` for
///         every index the unroll helper visits.
template <typename T, size_t N>
__forceinline A<T, N> operator+(const A<T, N> &a, const A<T, N> &b)
{
    A<T, N> result;
    // just an unroll helper, because MSVC doesn't unroll small loops
    // NOTE(review): unroll_for is not defined in this snippet; presumably it
    // invokes the callable once per index in [0, N) -- confirm.
    // The lambda has to capture by reference: it writes `result` and reads
    // `a` and `b`, so an empty capture list `[]` does not compile.
    unroll_for<0, N>([&](size_t i)
    {
        result.data[i] = a.data[i] + b.data[i];
    });
    return result;
}
/// Minimal reproduction: adds one 4-float vector into another via the
/// CRTP-provided `operator+=`.
int main()
{
    // Value-initialise both operands so the addition does not read
    // indeterminate floats (which would be undefined behaviour).
    A <float,4> a{};
    A <float,4> b{};
    a += b;
    return 0;
}
我得到的结果与你相反:第二个循环在我这里慢了4倍。你构建的是发行版(Release)吗?
我看到了你的编辑。那么问题是——你是如何测量的?
使用QueryPerformanceCounter。它单调且分辨率高(至少在现代Windows平台上是如此)。
你使用的是Visual Studio 2015 Update 1吗?
@plasmacel 你的两个实现并不等价。如果把各处不必要的拷贝都去掉,它们在我这里的表现就突然变得一样了:
是的,但通常的做法正好相反——应当基于 += 来实现 +,而不是反过来。
#include <iostream>
#include <chrono>
#include <string>
/// CRTP mixin: gives `Derived` an `operator+=` expressed through the
/// binary `operator+` that `Derived` is expected to provide.
template <typename T, size_t N, typename Derived>
struct Base
{
    /// Computes `*self + b`, stores the sum in this object and returns the
    /// result of that assignment as `Derived&`.
    __forceinline Derived& operator+=(const Derived &b)
    {
        // Downcast once; the CRTP contract is that `this` is a `Derived`.
        Derived *const self = static_cast<Derived*>(this);
        return (*self = *self + b);
    }
};
// Same as Base, but with operator+ manually inlined into operator+=.
//
// The element-wise addition is unrolled by hand for exactly four elements,
// so `Derived` must expose a `data` member indexable at 0..3 and the
// template must be instantiated with N == 4.
template <typename T, size_t N, typename Derived>
struct Base2
{
    /// Adds `b.data[0..3]` into this object's `data[0..3]` in place and
    /// returns this object viewed as `Derived`.
    __forceinline Derived& operator+=(const Derived &b)
    {
        // The indices below are hard-coded; reject any other size at compile
        // time instead of reading out of bounds (N < 4) or silently skipping
        // elements (N > 4).
        static_assert(N == 4, "Base2::operator+= is hand-unrolled for exactly 4 elements");
        Derived &self = static_cast<Derived&>(*this);
        self.data[0] += b.data[0];
        self.data[1] += b.data[1];
        self.data[2] += b.data[2];
        self.data[3] += b.data[3];
        return self;
    }
};
/// Fixed-size array of `N` values of type `T`; its `operator+=` comes from
/// the CRTP base `Base`.
template <typename T, size_t N>
struct A : public Base<T, N, A<T, N>>
{
    /// Element storage; left uninitialised by default construction.
    T data[N];
};
/// Fixed-size array of `N` values of type `T`; its `operator+=` comes from
/// the CRTP base `Base2`.
template <typename T, size_t N>
struct B : public Base2<T, N, B<T, N>>
{
    /// Element storage; left uninitialised by default construction.
    T data[N];
};
/// Element-wise sum of two `A<T, N>` values, unrolled by hand for four
/// elements.
///
/// @param a left operand
/// @param b right operand
/// @return a new `A<T, N>` with `data[i] == a.data[i] + b.data[i]` for
///         i in 0..3.
template <typename T, size_t N>
__forceinline A<T, N> operator+(const A<T, N> &a, const A<T, N> &b)
{
    // The indices below are hard-coded; reject any other size at compile
    // time instead of reading out of bounds (N < 4) or leaving trailing
    // elements of the result uninitialised (N > 4).
    static_assert(N == 4, "operator+ for A<T, N> is hand-unrolled for exactly 4 elements");
    A<T, N> result;
    result.data[0] = a.data[0] + b.data[0];
    result.data[1] = a.data[1] + b.data[1];
    result.data[2] = a.data[2] + b.data[2];
    result.data[3] = a.data[3] + b.data[3];
    return result;
}
/// Calls `func(i)` for i = 0 .. n-1, measures the wall time of the whole
/// loop and prints it to std::cout as "<label> : <microseconds>\n".
///
/// @param func  callable invoked with the current iteration index
/// @param n     number of iterations
/// @param label text printed in front of the measured duration
template <class Func>
void runtime_print(Func &&func, size_t n, std::string label)
{
    // steady_clock is guaranteed monotonic, which is what interval timing
    // needs; high_resolution_clock may alias a clock that can be adjusted.
    using clock = std::chrono::steady_clock;

    const auto start = clock::now();
    for (size_t i = 0; i < n; ++i)
    {
        func(i);
    }
    const auto stop = clock::now();

    const auto elapsed_us =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    std::cout << label << " : " << elapsed_us << '\n';
}
/// Benchmark driver: times `n` repetitions of `operator+=` for the
/// `Base`-derived type `A` and the `Base2`-derived type `B`.
int main()
{
    size_t n = 10000000;
    // Value-initialise every operand: adding indeterminate floats is
    // undefined behaviour and can also skew the timing.
    A <float,4> a0{};
    A <float,4> a1{};
    B <float,4> b0{};
    B <float,4> b1{};
    // NOTE(review): a0 and b0 are never read after the loops, so an optimiser
    // is free to discard the additions; consider consuming the results if the
    // timings look implausibly small.
    // this is slow
    runtime_print([&](size_t)
    {
        a0 += a1;
    }, n, "a0");
    // this is fast
    runtime_print([&](size_t)
    {
        b0 += b1;
    }, n, "b0");
    return 0;
}