C++ 当将一个函数及其实现移动到与主文件不同的文件(.hpp和.cpp)中时,性能会受到很大影响
在我的主文件(带有main函数的文件)中,我有另一个函数:C++ 当将一个函数及其实现移动到与主文件不同的文件(.hpp和.cpp)中时,性能会受到很大影响,c++,C++,在我的主文件(带有main函数的文件)中,我有另一个函数: unsigned long generate_random_number() { unsigned long y; static unsigned long mag01[2] = {0x0UL, MATRIX_A}; // mag01[x] = x * MATRIX_A for x=0,1 if (mti >= N) // generate N words at one time {
/**
 * Produces the next pseudo-random word from the generator state held in
 * the file-level array `mt` and its read cursor `mti`.
 *
 * When the cursor has run past the last state word, all N words are
 * regenerated in one pass first (seeding with 5489 if init_genrand() was
 * never called). The selected word is then tempered and returned.
 *
 * @return the tempered state word.
 */
unsigned long generate_random_number()
{
    // twist_xor[b] == b * MATRIX_A for b = 0, 1
    static unsigned long twist_xor[2] = {0x0UL, MATRIX_A};
    unsigned long word;

    if (mti >= N) {
        // State exhausted: regenerate all N words at once.
        if (mti == N+1) {
            // init_genrand() has not been called; use the default seed.
            init_genrand(5489UL);
        }

        int idx = 0;
        // First stretch: the mixed-in word lies M positions ahead.
        while (idx < N-M) {
            word = (mt[idx] & UPPER_MASK) | (mt[idx+1] & LOWER_MASK);
            mt[idx] = mt[idx+M] ^ (word >> 1) ^ twist_xor[word & 0x1UL];
            idx++;
        }
        // Second stretch: the mixed-in word has wrapped to the front of mt.
        while (idx < N-1) {
            word = (mt[idx] & UPPER_MASK) | (mt[idx+1] & LOWER_MASK);
            mt[idx] = mt[idx+(M-N)] ^ (word >> 1) ^ twist_xor[word & 0x1UL];
            idx++;
        }
        // Final word pairs the last element with the first one.
        word = (mt[N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
        mt[N-1] = mt[M-1] ^ (word >> 1) ^ twist_xor[word & 0x1UL];

        mti = 0;
    }

    word = mt[mti];
    mti++;

    // Tempering
    word ^= (word >> 11);
    word ^= (word << 7) & 0x9d2c5680UL;
    word ^= (word << 15) & 0xefc60000UL;
    word ^= (word >> 18);

    return word;
}
myprog.cpp的简化版本。case语句实际上不在main中,而是在另一个函数中。该函数被调用N次,其结果的平均值和标准差(stdev)通过套接字发送
myprog.cpp
// Simplified excerpt: maps hurry_ind (declared outside this snippet) to a
// random number reduced modulo a per-index constant and returns it.
// Any other index draws no random number and returns 0.
int main()
{
    switch (hurry_ind) {
    case 0:  return generate_random_number() % 19;
    case 1:  return generate_random_number() % 100;
    case 2:  return generate_random_number() % 9;
    case 3:  return generate_random_number() % 914;
    case 4:  return generate_random_number() % 355;
    case 5:  return generate_random_number() % 348;
    case 6:  return generate_random_number() % 65;
    default: return 0;
    }
}
#include <chrono>
#include <cstdlib>
#include <iostream>
#ifdef SAME_COMPILATION_UNIT
#include "random.cpp"
#else
#include "random.hpp"
#endif
/**
 * Draws one random number and reduces it modulo a constant chosen by
 * hurry_ind (0..6).
 *
 * @param hurry_ind selector for the modulus.
 * @return the reduced random number, or 0 for a selector outside 0..6
 *         (in which case no random number is drawn).
 */
unsigned long calc(int hurry_ind)
{
    // Out-of-range selectors must not consume a random number.
    if (hurry_ind < 0 || hurry_ind > 6)
        return 0;

    const unsigned long draw = generate_random_number();
    switch (hurry_ind)
    {
        case 0:  return draw % 19;
        case 1:  return draw % 100;
        case 2:  return draw % 9;
        case 3:  return draw % 914;
        case 4:  return draw % 355;
        case 5:  return draw % 348;
        default: return draw % 65;   // hurry_ind == 6
    }
}
/**
 * Benchmark driver: calls calc() n times, cycling the selector through
 * 0..6, and prints the accumulated checksum followed by the elapsed
 * wall time in seconds.
 *
 * @param argc argument count.
 * @param argv argv[1], when present, is the iteration count n; without it
 *             n is 0 and the loop body never runs.
 * @return 0 (implicit).
 */
int main(int argc, char** argv)
{
    const int n = argc > 1 ? static_cast<int>(std::atol(argv[1])) : 0;

    // Accumulate in the type calc() returns; the value is narrowed to int
    // once, when printed, which yields the same number as narrowing on
    // every iteration (both are reductions modulo 2^32).
    unsigned long sum = 0;

    // steady_clock is monotonic, so the measured interval cannot be
    // distorted by wall-clock adjustments during the run.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i)
        sum += calc(i % 7);
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> diff = end - start;
    // Printing the checksum also keeps the loop from being optimized away.
    std::cout << static_cast<int>(sum) << "(" << diff.count() << " s)\n";
}
当编译器在同一源文件(
翻译单元
)中同时看到这两个函数时,它可以生成一个能够优化寄存器使用的实现
对被调用函数和调用函数的了解对于这种形式的优化至关重要。如果在同一个编译单元中定义函数,编译器可以内联频繁调用的函数并进行一些积极的优化。最有可能的是,这就是所报告的性能下降的原因。 我使用clang和gcc测试了代码。Clang始终提供相同的性能(每2000000000次循环12.5275秒),因此我无法重现所描述的行为,但当我将函数标记为
inline时,gcc提供了显著的性能提升(每2000000000次循环8.31秒对10.42秒)。因此,您可以尝试在初始版本(相同的编译单元)中向函数添加__attribute__((noinline))
。如果它降低了性能,那么根本原因就是内联
我使用的测试程序:
random.hpp
#pragma once
// Returns the next pseudo-random number; the definition lives in random.cpp.
unsigned long generate_random_number();
random.cpp
// Generator parameters. These are typed constants rather than object-like
// macros so that a file which #includes this source does not get the
// single-letter names N and M rewritten by the preprocessor, and so that
// index arithmetic below is plain signed arithmetic instead of relying on
// unsigned wraparound.
static constexpr int N = 17;   // number of words in the state array
static constexpr int M = 13;   // distance to the word mixed in during regeneration
static constexpr unsigned long MATRIX_A   = 0x9908B0DFUL;
static constexpr unsigned long UPPER_MASK = 0x80000000UL;  // top bit of a 32-bit word
static constexpr unsigned long LOWER_MASK = 0x7FFFFFFFUL;  // low 31 bits of a 32-bit word

static unsigned long mt[N];   // generator state
static int mti = N + 1;       // read cursor into mt; N + 1 means "never seeded"

/**
 * Seeds the generator state from ulSeed and leaves the cursor at N, so the
 * next call to generate_random_number() regenerates the state first.
 *
 * @param ulSeed seed value; only its low 32 bits are used.
 */
void init_genrand(unsigned long ulSeed)
{
    mt[0] = ulSeed & 0xFFFFFFFFUL;
    for (mti = 1; mti < N; mti++)
    {
        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
        /* In the previous versions, MSBs of the seed affect   */
        /* only MSBs of the array mt[].                        */
        /* 2002/01/09 modified by Makoto Matsumoto             */
        mt[mti] = (1812433253UL * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
        /* for >32 bit machines */
        mt[mti] &= 0xFFFFFFFFUL;
    }
}

/**
 * Returns the next pseudo-random word.
 *
 * When the cursor has passed the end of the state array, all N words are
 * regenerated in one pass (seeding with 5489 first if init_genrand() was
 * never called). The selected word is then tempered and returned; with the
 * masks used here the result always fits in 32 bits.
 *
 * Defining INLINE_THE_FUNCTION marks the definition `inline`; that switch
 * is meant for builds that #include this file into the calling source.
 */
#ifdef INLINE_THE_FUNCTION
inline
#endif
unsigned long generate_random_number()
{
    // mag01[x] = x * MATRIX_A for x=0,1
    static constexpr unsigned long mag01[2] = {0x0UL, MATRIX_A};
    unsigned long y;

    if (mti >= N) // generate N words at one time
    {
        if (mti == N + 1)          // init_genrand() has not been called
            init_genrand(5489UL);  // a default initial seed is used

        int kk = 0;
        // Words whose partner at distance M is still inside the array.
        for (; kk < N - M; kk++) {
            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
            mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1UL];
        }
        // Words whose partner has wrapped around: M - N is negative, so
        // this reads N - M positions behind kk.
        for (; kk < N - 1; kk++) {
            y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
            mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
        }
        // The last word pairs with the first one.
        y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
        mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1UL];

        mti = 0;
    }

    y = mt[mti++];

    // Tempering
    y ^= (y >> 11);
    y ^= (y << 7) & 0x9d2c5680UL;
    y ^= (y << 15) & 0xefc60000UL;
    y ^= (y >> 18);

    return y;
}
你能告诉我们更多关于编译和链接标志,特别是优化标志的信息吗?以及你的编译器是否运行LTO(gcc称之为链接时优化)或等效功能?已添加Makefile。增加4%真的算是性能严重受损的实例吗?@TriskalJM 它导致程序比我只在一个文件中写入所有内容的替代方案多运行约2-3秒。我试图理解为什么以及如何让它恢复到原来的性能。此库将被多次使用,因此它将累积起来。调用它的方式可能很关键,因此您应该至少提供myprog.cpp的最小部分,足以重现该行为。我怀疑它将函数内联并进行了一些优化。使用__attribute__((noinline))
尝试初始版本。它会影响性能吗?有没有办法提供编译器标志以便我可以模拟这种行为?尝试将-flto
添加到clang命令行(这将启用“链接时间优化”)。
#ifdef SAME_COMPILATION_UNIT
#include "random.cpp"
#else
#include "random.hpp"
#endif
#include <iostream>
#include <chrono>
/**
 * Draws one random number and reduces it modulo a constant chosen by
 * hurry_ind (0..6).
 *
 * @param hurry_ind selector for the modulus.
 * @return the reduced random number, or 0 for a selector outside 0..6
 *         (in which case no random number is drawn).
 */
unsigned long calc(int hurry_ind)
{
    // Out-of-range selectors must not consume a random number.
    if (hurry_ind < 0 || hurry_ind > 6)
        return 0;

    const unsigned long draw = generate_random_number();
    switch (hurry_ind)
    {
        case 0:  return draw % 19;
        case 1:  return draw % 100;
        case 2:  return draw % 9;
        case 3:  return draw % 914;
        case 4:  return draw % 355;
        case 5:  return draw % 348;
        default: return draw % 65;   // hurry_ind == 6
    }
}
/**
 * Benchmark driver: calls calc() n times, cycling the selector through
 * 0..6, and prints the accumulated checksum followed by the elapsed
 * wall time in seconds.
 *
 * @param argc argument count.
 * @param argv argv[1], when present, is the iteration count n; without it
 *             n is 0 and the loop body never runs.
 * @return 0 (implicit).
 */
int main(int argc, char** argv)
{
    const int n = argc > 1 ? static_cast<int>(std::atol(argv[1])) : 0;

    // Accumulate in the type calc() returns; the value is narrowed to int
    // once, when printed, which yields the same number as narrowing on
    // every iteration (both are reductions modulo 2^32).
    unsigned long sum = 0;

    // steady_clock is monotonic, so the measured interval cannot be
    // distorted by wall-clock adjustments during the run.
    const auto start = std::chrono::steady_clock::now();
    for (int i = 0; i < n; ++i)
        sum += calc(i % 7);
    const auto end = std::chrono::steady_clock::now();

    const std::chrono::duration<double> diff = end - start;
    // Printing the checksum also keeps the loop from being optimized away.
    std::cout << static_cast<int>(sum) << "(" << diff.count() << " s)\n";
}
clang++ -v
clang version 8.0.0 (tags/RELEASE_800/final)
Target: x86_64-w64-windows-gnu
Thread model: posix
gcc -v
Using built-in specs.
COLLECT_GCC=C:\GNU\msys64\mingw64\bin\gcc.exe
COLLECT_LTO_WRAPPER=C:/GNU/msys64/mingw64/bin/../lib/gcc/x86_64-w64-mingw32/8.2.1/lto-wrapper.exe
Target: x86_64-w64-mingw32
Configured with: ../gcc-8-20181214/configure --prefix=/mingw64 --with-local-prefix=/mingw64/local --build=x86_64-w64-mingw32 --host=x86_64-w64-mingw32 --target=x86_64-w64-mingw32 --with-native-system-header-dir=/mingw64/x86_64-w64-mingw32/include --libexecdir=/mingw64/lib --enable-bootstrap --with-arch=x86-64 --with-tune=generic --enable-languages=ada,c,lto,c++,objc,obj-c++,fortran --enable-shared --enable-static --enable-libatomic --enable-threads=posix --enable-graphite --enable-fully-dynamic-string --enable-libstdcxx-filesystem-ts=yes --enable-libstdcxx-time=yes --disable-libstdcxx-pch --disable-libstdcxx-debug --disable-isl-version-check --enable-lto --enable-libgomp --disable-multilib --enable-checking=release --disable-rpath --disable-win32-registry --disable-nls --disable-werror --disable-symvers --with-libiconv --with-system-zlib --with-gmp=/mingw64 --with-mpfr=/mingw64 --with-mpc=/mingw64 --with-isl=/mingw64 --with-pkgversion='Rev1, Built by MSYS2 project' --with-bugurl=https://sourceforge.net/projects/msys2 --with-gnu-as --with-gnu-ld
Thread model: posix
gcc version 8.2.1 20181214 (Rev1, Built by MSYS2 project)