Java：冒泡排序中的 > 与 >= 导致显著的性能差异_Java_C++_Performance_Optimization

Java：冒泡排序中的 > 与 >= 导致显著的性能差异

java c++ performance optimization

Java：冒泡排序中的 > 与 >= 导致显著的性能差异,java,c++,performance,optimization,Java,C++,Performance,Optimization,我只是偶然发现了一些东西。起初我认为这可能是一个分支预测失误的案例，但我无法解释为什么分支预测失误会导致这种行为。我在Java中实现了两个版本的冒泡排序，并进行了一些性能测试： import java.util.Random; public class BubbleSortAnnomaly { public static void main(String... args) { final int ARRAY_SIZE = Integer.parseInt(args[0

我只是偶然发现了一些东西。起初我认为这可能是一个分支预测失误的案例，但我无法解释为什么分支预测失误会导致这种行为

我在Java中实现了两个版本的冒泡排序，并进行了一些性能测试：

import java.util.Random;

/**
 * Micro-benchmark comparing two bubble sort variants that differ only in the
 * comparison that triggers a swap ({@code >} versus {@code >=}).
 *
 * <p>Usage: {@code java BubbleSortAnnomaly <arraySize> <limit> <runs>}
 */
public class BubbleSortAnnomaly {

    /**
     * Fills two identical arrays with random values in {@code [0, limit)}, sorts one
     * with each variant, and prints the elapsed nanoseconds and the swap count of
     * each sort. The whole procedure is repeated {@code runs} times.
     *
     * @param args array size, exclusive upper bound of the random values, number of runs
     */
    public static void main(String... args) {
        // All three arguments are mandatory; report that instead of failing with a
        // bare ArrayIndexOutOfBoundsException.
        if (args.length < 3) {
            System.err.println("Usage: java BubbleSortAnnomaly <arraySize> <limit> <runs>");
            return;
        }

        final int ARRAY_SIZE = Integer.parseInt(args[0]);
        final int LIMIT = Integer.parseInt(args[1]);
        final int RUNS = Integer.parseInt(args[2]);

        int[] a = new int[ARRAY_SIZE];
        int[] b = new int[ARRAY_SIZE];
        Random r = new Random();
        for (int run = 0; RUNS > run; ++run) {
            // Both sorts must see exactly the same input, so b is a copy of a.
            for (int i = 0; i < ARRAY_SIZE; i++) {
                a[i] = r.nextInt(LIMIT);
                b[i] = a[i];
            }

            System.out.print("Sorting with sortA: ");
            long start = System.nanoTime();
            int swaps = bubbleSortA(a);

            System.out.println(  (System.nanoTime() - start) + " ns. "
                               + "It used " + swaps + " swaps.");

            System.out.print("Sorting with sortB: ");
            start = System.nanoTime();
            swaps = bubbleSortB(b);

            System.out.println(  (System.nanoTime() - start) + " ns. "
                               + "It used " + swaps + " swaps.");
        }
    }

    /**
     * Sorts {@code a} in place in ascending order, swapping neighbours only when the
     * left one is strictly greater ({@code >}).
     *
     * @param a the array to sort
     * @return the number of swaps performed
     */
    public static int bubbleSortA(int[] a) {
        int counter = 0;
        // A pass with i == 0 would compare nothing, so the outer loop stops at 1.
        for (int i = a.length - 1; i > 0; --i) {
            for (int j = 0; j < i; ++j) {
                if (a[j] > a[j + 1]) {
                    swap(a, j, j + 1);
                    ++counter;
                }
            }
        }
        return counter;
    }

    /**
     * Sorts {@code a} in place in ascending order, swapping neighbours when the left
     * one is greater than or equal to the right one ({@code >=}), so equal
     * neighbours are swapped as well.
     *
     * @param a the array to sort
     * @return the number of swaps performed
     */
    public static int bubbleSortB(int[] a) {
        int counter = 0;
        // A pass with i == 0 would compare nothing, so the outer loop stops at 1.
        for (int i = a.length - 1; i > 0; --i) {
            for (int j = 0; j < i; ++j) {
                if (a[j] >= a[j + 1]) {
                    swap(a, j, j + 1);
                    ++counter;
                }
            }
        }
        return counter;
    }

    /** Exchanges the elements at indices {@code j} and {@code i} of {@code a}. */
    private static void swap(int[] a, int j, int i) {
        int h = a[i];
        a[i] = a[j];
        a[j] = h;
    }
}

当我将

LIMIT

的参数设置为，例如

（

java BubbleSortAnnomaly 50000 10 10

）时，我得到了预期的结果：

Sorting with sortA: 3.983 seconds. It used  625941897 swaps.
Sorting with sortB: 4.658 seconds. It used  789391382 swaps.

我将程序移植到C++中，以确定这个问题是否是Java特有的。这是C++代码：

#include <cstdlib>
#include <iostream>

#include <omp.h>

#ifndef ARRAY_SIZE
#define ARRAY_SIZE 50000
#endif

#ifndef LIMIT
#define LIMIT 10
#endif

#ifndef RUNS
#define RUNS 10
#endif

// Exchanges the elements of `a` at positions `i` and `j`.
void swap(int * a, int i, int j)
{
    const int saved = a[j];
    a[j] = a[i];
    a[i] = saved;
}

// Bubble-sorts the first ARRAY_SIZE elements of `a` in ascending order,
// swapping neighbours only when the left one is strictly greater (`>`).
// Returns the number of swaps performed.
int bubbleSortA(int * a)
{
    int swapCount = 0;
    int end = ARRAY_SIZE - 1;
    while (0 < end)
    {
        // One pass over a[0..end].
        for (int left = 0; left < end; ++left)
        {
            const int right = left + 1;
            if (a[left] > a[right])
            {
                swap(a, left, right);
                ++swapCount;
            }
        }
        --end;
    }
    return swapCount;
}

// Bubble-sorts the first ARRAY_SIZE elements of `a` in ascending order,
// swapping neighbours when the left one is greater than or equal to the
// right one (`>=`), so equal neighbours are swapped as well.
// Returns the number of swaps performed.
int bubbleSortB(int * a)
{
    int swapCount = 0;
    int end = ARRAY_SIZE - 1;
    while (0 < end)
    {
        // One pass over a[0..end].
        for (int left = 0; left < end; ++left)
        {
            const int right = left + 1;
            if (a[left] >= a[right])
            {
                swap(a, left, right);
                ++swapCount;
            }
        }
        --end;
    }
    return swapCount;
}

// Benchmark driver: for each of RUNS rounds, fills two identical arrays of
// ARRAY_SIZE values in [0, LIMIT), sorts one with bubbleSortA and the other
// with bubbleSortB, and prints the wall-clock time and swap count of each.
// Returns 0 on success and 1 if the arrays cannot be allocated.
int main()
{
    int * a = (int *) malloc(ARRAY_SIZE * sizeof(int));
    int * b = (int *) malloc(ARRAY_SIZE * sizeof(int));

    // malloc returns a null pointer on failure; stop instead of dereferencing it.
    if (!a || !b)
    {
        std::cerr << "Failed to allocate the benchmark arrays." << std::endl;
        free(a); // free() accepts a null pointer
        free(b);
        return (1);
    }

    for (int run = 0; RUNS > run; ++run)
    {
        // Both sorts must see exactly the same input, so b is a copy of a.
        for (int idx = 0; ARRAY_SIZE > idx; ++idx)
        {
            a[idx] = std::rand() % LIMIT;
            b[idx] = a[idx];
        }

        std::cout << "Sorting with sortA: ";
        double start = omp_get_wtime();
        int swaps = bubbleSortA(a);

        std::cout << (omp_get_wtime() - start) << " seconds. It used " << swaps
                  << " swaps." << std::endl;

        std::cout << "Sorting with sortB: ";
        start = omp_get_wtime();
        swaps = bubbleSortB(b);

        std::cout << (omp_get_wtime() - start) << " seconds. It used " << swaps
                  << " swaps." << std::endl;
    }

    free(a);
    free(b);

    return (0);
}

#包括
#包括
#包括
#ifndef数组大小
#定义数组大小为50000
#恩迪夫
#ifndef极限
#定义限制10
#恩迪夫
#ifndef运行
#定义运行10
#恩迪夫
无效交换（int*a，int i，int j）
{
int h=a[i]；
a[i]=a[j]；
a[j]=h；
}
int bubbleSortA（int*a）
{
const int LAST=数组大小-1；
int计数器=0；
for（int i=LAST；0a[next]）
{
互换（a、j、next）；
++计数器；
}
}
}
退货（柜台）；
}
int bubbleSortB（int*a）
{
const int LAST=数组大小-1；
int计数器=0；
for（int i=LAST；0=a[next]）
{
互换（a、j、next）；
++计数器；
}
}
}
退货（柜台）；
}
int main（）
{
int*a=（int*）malloc（数组大小*sizeof（int））；
int*b=（int*）malloc（数组大小*sizeof（int））；
对于（int run=0；RUNS>run；++run）
{
对于（int idx=0；数组大小>idx；++idx）
{
[idx]=std:：rand（）%限制；
b[idx]=a[idx]；
}
我认为这确实可以用分支预测失误来解释
例如，考虑LIMIT=11和sortB
。在外循环的第一次迭代中，它将很快遇到一个等于10的元素。因此它将具有a[j]=10
，因此肯定a[j]
将是=a[next]
，因为没有大于10的元素。因此，它将执行交换，然后在j
中执行一个步骤，结果再次发现a[j]=10
（相同的交换值）。因此它将再次成为a[j]>=a[next]
，如此类推。除了最初的几个比较外，所有比较都是正确的。类似地，它将在外循环的下一次迭代中运行
对于sortA
，情况就不一样了。它将以大致相同的方式开始，偶然发现a[j]=10
，以类似的方式进行一些交换，但仅当它发现a[next]=10
时才进行交换。然后条件将为false，不会进行交换。如此类推：每次它偶然发现a[next]=10，该条件为假，且未进行交换。因此，该条件在11次交换中为真10次（值a[next]
从0到9），11例中有1例为假。分支预测失败并不奇怪。
我认为这确实可能是由于分支预测。如果将交换的数量与内部排序迭代的数量进行比较，您会发现：
Limit = 10

A = 560M 次交换 / 1250M 次循环
B = 1250M 次交换 / 1250M 次循环（交换次数比循环次数少 0.02%）

Limit = 50000

A = 627M 次交换 / 1250M 次循环
B = 850M 次交换 / 1250M 次循环

因此，在Limit==10
情况下，交换在99.98%的时间内以B排序执行，这显然有利于分支预测器。在Limit==50000
情况下，交换仅随机命中68%，因此分支预测器的益处较小
编辑2：在大多数情况下，这个答案可能是错误的。下文中我说“以上是事实”的部分仍然成立，但其后的推测部分对大多数处理器架构并不正确，请参阅评论。不过，理论上仍然可能存在某些操作系统/架构上的JVM会这样做，但这样的JVM很可能实现得很差，或者运行在一种奇怪的架构上。而且，说它在理论上可能，只是因为大多数可以想象的事情在理论上都是可能的，所以我对最后这部分持保留态度。
首先，我对C++没有把握，但我可以谈谈java。
这里有一些代码
/**
 * Minimal pair of comparison helpers. The two methods differ only in the
 * relational operator used ({@code <} versus {@code <=}).
 */
public class Example {

    /**
     * Tests whether {@code a} is strictly smaller than {@code b}.
     *
     * @param a left operand
     * @param b right operand
     * @return {@code true} if {@code a < b}, otherwise {@code false}
     */
    public static boolean less(final int a, final int b) {
        return a < b;
    }

    /**
     * Tests whether {@code a} is smaller than or equal to {@code b}.
     *
     * @param a left operand
     * @param b right operand
     * @return {@code true} if {@code a <= b}, otherwise {@code false}
     */
    public static boolean lessOrEqual(final int a, final int b) {
        return a <= b;
    }
}

您会注意到唯一的区别是if_icmpge
（如果比较大于/等于）与if_icmpgt
（如果比较大于）
以上是事实，其余部分是我对 if_icmpge 和 if_icmpgt 如何被处理的最佳猜测，依据是我上过的一门汇编语言大学课程。为了得到更好的答案，你应该查一下你的JVM是如何处理这些指令的。我猜C++也会编译成类似的操作。
编辑：上的文档，如果
计算机比较数字的方式是用一个数减去另一个数，然后检查结果是否为0。所以在计算 a < b 时，会从 a 中减去 b，并通过检查结果的符号来判断结果是否小于0（即 a - b < 0）。
（时间计数已删除）使用perf stat
命令，我得到的结果证实了分支未命中理论
当Limit=10
时，BubbleSortB从分支预测中获益匪浅（未命中率为0.01%），但当Limit=50000
时，分支预测的失败率（未命中率为15.65%）甚至高于BubbleSortA（未命中率分别为12.69%和12.76%）
BubbleSortA，Limit = 10：
Performance counter stats for './bubbleA.out':

   46670.947364 task-clock                #    0.998 CPUs utilized          
             73 context-switches          #    0.000 M/sec                  
             28 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
117,298,787,242 cycles                    #    2.513 GHz                    
117,471,719,598 instructions              #    1.00  insns per cycle        
 25,104,504,912 branches                  #  537.904 M/sec                  
  3,185,376,029 branch-misses             #   12.69% of all branches        

   46.779031563 seconds time elapsed

Performance counter stats for './bubbleA.out':

   46023.785539 task-clock                #    0.998 CPUs utilized          
             59 context-switches          #    0.000 M/sec                  
              8 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
118,261,821,200 cycles                    #    2.570 GHz                    
119,230,362,230 instructions              #    1.01  insns per cycle        
 25,089,204,844 branches                  #  545.136 M/sec                  
  3,200,514,556 branch-misses             #   12.76% of all branches        

   46.126274884 seconds time elapsed

Performance counter stats for './bubbleB.out':

   26091.323705 task-clock                #    0.998 CPUs utilized          
             28 context-switches          #    0.000 M/sec                  
              2 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
 64,822,368,062 cycles                    #    2.484 GHz                    
137,780,774,165 instructions              #    2.13  insns per cycle        
 25,052,329,633 branches                  #  960.179 M/sec                  
      3,019,138 branch-misses             #    0.01% of all branches        

   26.149447493 seconds time elapsed

Performance counter stats for './bubbleB.out':

   51644.210268 task-clock                #    0.983 CPUs utilized          
          2,138 context-switches          #    0.000 M/sec                  
             69 CPU-migrations            #    0.000 M/sec                  
            378 page-faults               #    0.000 M/sec                  
144,600,738,759 cycles                    #    2.800 GHz                    
124,273,104,207 instructions              #    0.86  insns per cycle        
 25,104,320,436 branches                  #  486.101 M/sec                  
  3,929,572,460 branch-misses             #   15.65% of all branches        

   52.511233236 seconds time elapsed

BubbleSortA，Limit = 50000：
Performance counter stats for './bubbleA.out':

   46670.947364 task-clock                #    0.998 CPUs utilized          
             73 context-switches          #    0.000 M/sec                  
             28 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
117,298,787,242 cycles                    #    2.513 GHz                    
117,471,719,598 instructions              #    1.00  insns per cycle        
 25,104,504,912 branches                  #  537.904 M/sec                  
  3,185,376,029 branch-misses             #   12.69% of all branches        

   46.779031563 seconds time elapsed

Performance counter stats for './bubbleA.out':

   46023.785539 task-clock                #    0.998 CPUs utilized          
             59 context-switches          #    0.000 M/sec                  
              8 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
118,261,821,200 cycles                    #    2.570 GHz                    
119,230,362,230 instructions              #    1.01  insns per cycle        
 25,089,204,844 branches                  #  545.136 M/sec                  
  3,200,514,556 branch-misses             #   12.76% of all branches        

   46.126274884 seconds time elapsed

Performance counter stats for './bubbleB.out':

   26091.323705 task-clock                #    0.998 CPUs utilized          
             28 context-switches          #    0.000 M/sec                  
              2 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
 64,822,368,062 cycles                    #    2.484 GHz                    
137,780,774,165 instructions              #    2.13  insns per cycle        
 25,052,329,633 branches                  #  960.179 M/sec                  
      3,019,138 branch-misses             #    0.01% of all branches        

   26.149447493 seconds time elapsed

Performance counter stats for './bubbleB.out':

   51644.210268 task-clock                #    0.983 CPUs utilized          
          2,138 context-switches          #    0.000 M/sec                  
             69 CPU-migrations            #    0.000 M/sec                  
            378 page-faults               #    0.000 M/sec                  
144,600,738,759 cycles                    #    2.800 GHz                    
124,273,104,207 instructions              #    0.86  insns per cycle        
 25,104,320,436 branches                  #  486.101 M/sec                  
  3,929,572,460 branch-misses             #   15.65% of all branches        

   52.511233236 seconds time elapsed

BubbleSortB，Limit = 10：
Performance counter stats for './bubbleA.out':

   46670.947364 task-clock                #    0.998 CPUs utilized          
             73 context-switches          #    0.000 M/sec                  
             28 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
117,298,787,242 cycles                    #    2.513 GHz                    
117,471,719,598 instructions              #    1.00  insns per cycle        
 25,104,504,912 branches                  #  537.904 M/sec                  
  3,185,376,029 branch-misses             #   12.69% of all branches        

   46.779031563 seconds time elapsed

Performance counter stats for './bubbleA.out':

   46023.785539 task-clock                #    0.998 CPUs utilized          
             59 context-switches          #    0.000 M/sec                  
              8 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
118,261,821,200 cycles                    #    2.570 GHz                    
119,230,362,230 instructions              #    1.01  insns per cycle        
 25,089,204,844 branches                  #  545.136 M/sec                  
  3,200,514,556 branch-misses             #   12.76% of all branches        

   46.126274884 seconds time elapsed

Performance counter stats for './bubbleB.out':

   26091.323705 task-clock                #    0.998 CPUs utilized          
             28 context-switches          #    0.000 M/sec                  
              2 CPU-migrations            #    0.000 M/sec                  
            379 page-faults               #    0.000 M/sec                  
 64,822,368,062 cycles                    #    2.484 GHz                    
137,780,774,165 instructions              #    2.13  insns per cycle        
 25,052,329,633 branches                  #  960.179 M/sec                  
      3,019,138 branch-misses             #    0.01% of all branches        

   26.149447493 seconds time elapsed

Performance counter stats for './bubbleB.out':

   51644.210268 task-clock                #    0.983 CPUs utilized          
          2,138 context-switches          #    0.000 M/sec                  
             69 CPU-migrations            #    0.000 M/sec                  
            378 page-faults               #    0.000 M/sec                  
144,600,738,759 cycles                    #    2.800 GHz                    
124,273,104,207 instructions              #    0.86  insns per cycle        
 25,104,320,436 branches                  #  486.101 M/sec                  
  3,929,572,460 branch-misses             #   15.65% of all branches        

   52.511233236 seconds time elapsed

B