C++ 解释为什么分配第二次会改变性能_C++_Performance_Microbenchmark_Google Benchmark

C++ 解释为什么分配第二次会改变性能

c++ performance

C++ 解释为什么分配第二次会改变性能,c++,performance,microbenchmark,google-benchmark,C++,Performance,Microbenchmark,Google Benchmark,我在测试一些关于密集矩阵乘法的微型基准测试（出于好奇），我注意到一些非常奇怪的性能结果下面是一个简单的工作示例： #include <benchmark/benchmark.h> #include <random> constexpr long long n = 128; struct mat_bench_fixture : public benchmark::Fixture { double *matA, *matB, *matC; mat_bench

我在测试一些关于密集矩阵乘法的微型基准测试（出于好奇），我注意到一些非常奇怪的性能结果

下面是一个简单的工作示例：

#include <benchmark/benchmark.h>

#include <random>

constexpr long long n = 128;

struct mat_bench_fixture : public benchmark::Fixture
{
  double *matA, *matB, *matC;

  mat_bench_fixture()
  {
    matA = new double[n * n];
    matB = new double[n * n];
    matC = new double[n * n];
    benchmark::DoNotOptimize(matA);
    benchmark::DoNotOptimize(matB);
    benchmark::DoNotOptimize(matC);
#if 0
    delete[] matA;
    delete[] matB;
    delete[] matC;
    benchmark::DoNotOptimize(matA);
    benchmark::DoNotOptimize(matB);
    benchmark::DoNotOptimize(matC);
    matA = new double[n * n];
    matB = new double[n * n];
    matC = new double[n * n];
    benchmark::DoNotOptimize(matA);
    benchmark::DoNotOptimize(matB);
    benchmark::DoNotOptimize(matC);
#endif
  }

  ~mat_bench_fixture()
  {
    delete[] matA;
    delete[] matB;
    delete[] matC;
  }

  void SetUp(const benchmark::State& s) override
  {
    // generate random data
    std::mt19937 gen;
    std::uniform_real_distribution<double> dis(0, 1);
    for (double* i = matA; i != matA + n * n; ++i)
    {
      *i = dis(gen);
    }
    for (double* i = matB; i != matB + n * n; ++i)
    {
      *i = dis(gen);
    }
  }
};

BENCHMARK_DEFINE_F(mat_bench_fixture, impl1)(benchmark::State& st)
{
  for (auto _ : st)
  {
    for (long long row = 0; row < n; ++row)
    {
      for (long long col = 0; col < n; ++col)
      {
        matC[row * n + col] = 0;
        for (long long k = 0; k < n; ++k)
        {
          matC[row * n + col] += matA[row * n + k] * matB[k * n + col];
        }
      }
    }
    benchmark::DoNotOptimize(matA);
    benchmark::DoNotOptimize(matB);
    benchmark::DoNotOptimize(matC);
    benchmark::ClobberMemory();
  }
}

BENCHMARK_REGISTER_F(mat_bench_fixture, impl1);

BENCHMARK_MAIN();

以下是未执行重新分配的perf stat的代表性屏幕截图：

Running ./mult_test
Run on (4 X 4200 MHz CPU s)
CPU Caches:
  L1 Data 32K (x4)
  L1 Instruction 32K (x4)
  L2 Unified 256K (x4)
  L3 Unified 6144K (x1)
----------------------------------------------------------------------
Benchmark                               Time           CPU Iterations
----------------------------------------------------------------------
mat_bench_fixture/impl1           2181531 ns    2180896 ns        322
mat_bench_fixture/impl1           2188280 ns    2186860 ns        322
mat_bench_fixture/impl1           2182988 ns    2182150 ns        322
mat_bench_fixture/impl1           2182715 ns    2182025 ns        322
mat_bench_fixture/impl1           2175719 ns    2175653 ns        322
mat_bench_fixture/impl1_mean      2182246 ns    2181517 ns        322
mat_bench_fixture/impl1_median    2182715 ns    2182025 ns        322
mat_bench_fixture/impl1_stddev       4480 ns       4000 ns        322

 Performance counter stats for './mult_test --benchmark_repetitions=5':

       3771.370173      task-clock (msec)         #    0.994 CPUs utilized          
               223      context-switches          #    0.059 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
               242      page-faults               #    0.064 K/sec                  
    15,808,590,474      cycles                    #    4.192 GHz                      (61.31%)
    20,201,201,797      instructions              #    1.28  insn per cycle           (69.04%)
     1,844,097,332      branches                  #  488.973 M/sec                    (69.04%)
           358,319      branch-misses             #    0.02% of all branches          (69.14%)
     7,232,957,363      L1-dcache-loads           # 1917.859 M/sec                    (69.24%)
     3,774,591,187      L1-dcache-load-misses     #   52.19% of all L1-dcache hits    (69.35%)
       558,507,528      LLC-loads                 #  148.091 M/sec                    (69.46%)
            93,136      LLC-load-misses           #    0.02% of all LL-cache hits     (69.47%)
   <not supported>      L1-icache-loads                                             
           736,008      L1-icache-load-misses                                         (69.47%)
     7,242,324,412      dTLB-loads                # 1920.343 M/sec                    (69.34%)
               581      dTLB-load-misses          #    0.00% of all dTLB cache hits   (61.50%)
             1,582      iTLB-loads                #    0.419 K/sec                    (61.39%)
               307      iTLB-load-misses          #   19.41% of all iTLB cache hits   (61.29%)
   <not supported>      L1-dcache-prefetches                                        
   <not supported>      L1-dcache-prefetch-misses                                   

       3.795924436 seconds time elapsed

Running ./mult_test
Run on (4 X 4200 MHz CPU s)
CPU Caches:
  L1 Data 32K (x4)
  L1 Instruction 32K (x4)
  L2 Unified 256K (x4)
  L3 Unified 6144K (x1)
----------------------------------------------------------------------
Benchmark                               Time           CPU Iterations
----------------------------------------------------------------------
mat_bench_fixture/impl1           1862961 ns    1862919 ns        376
mat_bench_fixture/impl1           1861986 ns    1861947 ns        376
mat_bench_fixture/impl1           1860330 ns    1860305 ns        376
mat_bench_fixture/impl1           1859711 ns    1859652 ns        376
mat_bench_fixture/impl1           1863299 ns    1863273 ns        376
mat_bench_fixture/impl1_mean      1861658 ns    1861619 ns        376
mat_bench_fixture/impl1_median    1861986 ns    1861947 ns        376
mat_bench_fixture/impl1_stddev       1585 ns       1591 ns        376

 Performance counter stats for './mult_test --benchmark_repetitions=5':

       3724.287293      task-clock (msec)         #    0.995 CPUs utilized          
                11      context-switches          #    0.003 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
               246      page-faults               #    0.066 K/sec                  
    15,612,924,579      cycles                    #    4.192 GHz                      (61.34%)
    23,344,859,019      instructions              #    1.50  insn per cycle           (69.07%)
     2,130,528,330      branches                  #  572.063 M/sec                    (69.07%)
           331,651      branch-misses             #    0.02% of all branches          (69.08%)
     8,369,233,786      L1-dcache-loads           # 2247.204 M/sec                    (69.18%)
     4,206,241,296      L1-dcache-load-misses     #   50.26% of all L1-dcache hits    (69.29%)
       308,687,646      LLC-loads                 #   82.885 M/sec                    (69.40%)
            94,288      LLC-load-misses           #    0.03% of all LL-cache hits     (69.50%)
   <not supported>      L1-icache-loads                                             
           475,066      L1-icache-load-misses                                         (69.50%)
     8,360,570,315      dTLB-loads                # 2244.878 M/sec                    (69.37%)
               364      dTLB-load-misses          #    0.00% of all dTLB cache hits   (61.53%)
               213      iTLB-loads                #    0.057 K/sec                    (61.42%)
               144      iTLB-load-misses          #   67.61% of all iTLB cache hits   (61.32%)
   <not supported>      L1-dcache-prefetches                                        
   <not supported>      L1-dcache-prefetch-misses                                   

       3.743017809 seconds time elapsed

在我的机器中，我可以通过对指针使用不同的对齐方式来重现您的情况。请尝试以下代码：

mat_bench_fixture() {
    matA = new double[n * n + 256];
    matB = new double[n * n + 256];
    matC = new double[n * n + 256];

    // align pointers to 1024
    matA = reinterpret_cast<double*>((reinterpret_cast<unsigned long long>(matA) + 1023)&~1023);
    matB = reinterpret_cast<double*>((reinterpret_cast<unsigned long long>(matB) + 1023)&~1023);
    matC = reinterpret_cast<double*>((reinterpret_cast<unsigned long long>(matC) + 1023)&~1023);

    // toggle this to toggle alignment offset of matB
    // matB += 2;
}

mat_bench_fixture（）{
matA=新双精度[n*n+256]；
matB=新的双精度[n*n+256]；
matC=新的双精度[n*n+256]；
//将指针与1024对齐
matA=重新解释铸造（（重新解释铸造（matA）+1023）和~1023）；
matB=重新解释铸造（（重新解释铸造（matB）+1023）和~1023）；
matC=重新解释铸造（（重新解释铸造（matC）+1023）和~1023）；
//切换此选项可切换matB的对齐偏移
//matB+=2；
}

如果我在这段代码中切换注释行，我在我的机器上得到了34%的差异

不同的对齐偏移会导致不同的计时。你也可以用其他两个指针来抵消。有时差异更小，有时更大，有时没有变化

这一定是由缓存问题引起的：由于指针的最后一位不同，缓存中会出现不同的冲突模式。而且，由于您的例程占用大量内存（所有数据都不适合一级缓存），缓存性能非常重要。

关于上下文，下面是有关的一些信息。

#include <random>
#include <chrono>
#include <iostream>
#include <cstdlib>

constexpr long long n = 128;
constexpr size_t alignment = 64;

inline void escape(void* p)
{
  asm volatile("" : : "g"(p) : "memory");
}
inline void clobber()
{
  asm volatile("" : : : "memory");
}

struct mat_bench_fixture
{
  double *matA, *matB, *matC;

  mat_bench_fixture()
  {
    matA = (double*) aligned_alloc(alignment, sizeof(double) * n * n);
    matB = (double*) aligned_alloc(alignment, sizeof(double) * n * n);
    matC = (double*) aligned_alloc(alignment, sizeof(double) * n * n);
    escape(matA);
    escape(matB);
    escape(matC);
#if 0
    free(matA);
    free(matB);
    free(matC);
    escape(matA);
    escape(matB);
    escape(matC);
    matA = (double*) aligned_alloc(alignment, sizeof(double) *n * n);
    matB = (double*) aligned_alloc(alignment, sizeof(double) *n * n);
    matC = (double*) aligned_alloc(alignment, sizeof(double) *n * n);
    escape(matA);
    escape(matB);
    escape(matC);
#endif
  }

  ~mat_bench_fixture()
  {
    free(matA);
    free(matB);
    free(matC);
  }

  void SetUp()
  {
    // generate random data
    std::mt19937 gen;
    std::uniform_real_distribution<double> dis(0, 1);
    for (double* i = matA; i != matA + n * n; ++i)
    {
      *i = dis(gen);
    }
    for (double* i = matB; i != matB + n * n; ++i)
    {
      *i = dis(gen);
    }
  }
  void run()
  {
    constexpr int iters = 400;
    std::chrono::high_resolution_clock timer;
    auto start = timer.now();
    for (int i = 0; i < iters; ++i)
    {
      for (long long row = 0; row < n; ++row)
      {
        for (long long col = 0; col < n; ++col)
        {
          matC[row * n + col] = 0;
          for (long long k = 0; k < n; ++k)
          {
            matC[row * n + col] += matA[row * n + k] * matB[k * n + col];
          }
        }
      }
      escape(matA);
      escape(matB);
      escape(matC);
      clobber();
    }
    auto stop = timer.now();
    std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(
                   stop - start)
                     .count() /
                   iters
              << std::endl;
  }
};

int main()
{
  mat_bench_fixture bench;
  for (int i = 0; i < 5; ++i)
  {
    bench.SetUp();
    bench.run();
  }
}

g++ -o mult_test mult_test.cpp -std=c++14 -O3

mat_bench_fixture() {
    matA = new double[n * n + 256];
    matB = new double[n * n + 256];
    matC = new double[n * n + 256];

    // align pointers to 1024
    matA = reinterpret_cast<double*>((reinterpret_cast<unsigned long long>(matA) + 1023)&~1023);
    matB = reinterpret_cast<double*>((reinterpret_cast<unsigned long long>(matB) + 1023)&~1023);
    matC = reinterpret_cast<double*>((reinterpret_cast<unsigned long long>(matC) + 1023)&~1023);

    // toggle this to toggle alignment offset of matB
    // matB += 2;
}