Java C性能和编译选项_Java_C_Performance_Gcc_Assembly

Java C性能和编译选项

java c performance gcc assembly

Java C性能和编译选项,java,c,performance,gcc,assembly,Java,C,Performance,Gcc,Assembly,对于像选择排序这样的简单算法，我有两个类似的实现（java和c++） public interface SortingAlgorithm { public void sort(int[] a); } public class SelectionSort implements SortingAlgorithm { @Override public void sort(int[] a) { for (int i = 0; i < a.length;

对于像选择排序这样的简单算法，我有两个类似的实现（java和c++）

/**
 * Contract for algorithms that sort an {@code int} array. Nothing is
 * returned, so the result is delivered through the array itself.
 */
public interface SortingAlgorithm {

    /**
     * Sorts the given array.
     *
     * @param a the array to sort
     */
    void sort(int[] a);
}

/**
 * Selection sort: for each position, finds the smallest element in the
 * remaining part of the array and exchanges it into that position.
 */
public class SelectionSort implements SortingAlgorithm {

    /**
     * Sorts {@code a} in ascending order, in place.
     *
     * @param a the array to sort
     */
    @Override
    public void sort(int[] a) {
        final int length = a.length;
        int start = 0;
        while (start < length) {
            // Index of the minimum of a[start..length-1]; the strict '<'
            // keeps the earliest index when several elements tie.
            int minIndex = start;
            for (int probe = start + 1; probe < length; probe++) {
                if (a[probe] < a[minIndex]) {
                    minIndex = probe;
                }
            }
            swap(a, minIndex, start);
            start++;
        }
    }

    /** Exchanges a[i] and a[j]; does nothing when both indices are equal. */
    private void swap(int[] a, int i, int j) {
        if (i != j) {
            int held = a[i];
            a[i] = a[j];
            a[j] = held;
        }
    }
}

公共接口排序算法{
公共无效排序（int[]a）；
}
公共类SelectionSort实现排序算法{
@凌驾
公共无效排序（int[]a）{
for（int i=0；i


c一：
/*
 * Exchange a[i] and a[j]; a no-op when i == j.
 *
 * Declared `static inline` rather than plain `inline`: under C99 and later a
 * plain `inline` definition provides no external definition, so any call the
 * compiler decides not to inline (e.g. when building without optimization)
 * would be left unresolved at link time. Internal linkage guarantees this
 * translation unit always has a usable definition.
 */
static inline void swap(int *a, int i, int j);

/*
 * Sort the first `size` elements of `a` into ascending order, in place,
 * using selection sort.
 *
 * a    - array to sort; must hold at least `size` elements
 * size - number of elements to sort; values <= 0 leave the array untouched
 */
void s_sort(int *a, int size) {
  int i;
  for (i = 0; i < size; i++) {
    /* Index of the smallest element seen so far in a[i..size-1]. */
    int lowerElementIndex = i, j;
    for (j = i + 1; j < size; j++) {
      if (a[j] < a[lowerElementIndex]) {
        lowerElementIndex = j;
      }
    }
    swap(a, lowerElementIndex, i);
  }
}

static inline void swap(int *a, int i, int j) {
  if (i == j) {
    return;
  }
  int temp = a[i];
  a[i] = a[j];
  a[j] = temp;
}

内联无效交换（int*a，int i，int j）；
空s_排序（整数*a，整数大小）{
int i；
对于（i=0；i

现在，我尝试在一个大数组（100000随机整数）上测试它们。
最初的结果是
java:~17秒（使用oracle jdk/jvm编译和执行）
c:~22秒（使用gcc v4.8编译，未启用任何优化）
当然，然后我尝试通过cflags优化我的c版本。
结果如下（我仅报告CFLAG）：
-O1:~18.4
-O2:~18.4
-O{3-9}:~20.9
现在，我的第一个问题是：我应该使用哪个cflag来编译？
所以我读了gnu手册中关于优化的内容。
添加-march=native没有帮助。在尝试其他选项一段时间后，我试了-fprofile-arcs选项。将它添加到我的标志中使我的代码在大约11秒内完成测试！
但是，我的文件夹中出现了一些文件：分析的结果。据我所知，我应该将它们与-fbranch-probabilities一起使用，然后重新编译代码。
重新编译后的结果约为18.5秒。这就是我真正想问的问题。
如果我的程序必须写入文件并收集分析信息，它怎么可能运行得这么快？而在不需要写入文件和收集分析信息时，它反而慢了1.5倍。
我忘了提到我在一台旧电脑上安装了Intel Celeron@2.8GHz处理器和linux（fedora 20和xfce）。如果您需要有关硬件的其他信息，请询问！；）
编辑：
我用于测试的代码是：
Java：
公共类测试{
公共静态void main（字符串[]args）{
int[]a=新int[100000]；
int[]a2=新int[100000]；
for（int i=0；i

c：
#include "insertion_sort.h"
#include "selection_sort.h"
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/*
 * Benchmark driver: fills an array with pseudo-random values, duplicates it,
 * then prints the processor time (as reported by clock()) that s_sort and
 * i_sort each need to sort identical input.
 *
 * Returns 0.
 */
int main(void) {
  /* One definition each for the element count and the range of the random
   * values, instead of repeating the literal 100000 throughout. */
  enum { ELEMENT_COUNT = 100000, VALUE_RANGE = 100000 };

  /* Static storage keeps the two large arrays off the stack. */
  static int array[ELEMENT_COUNT], array2[ELEMENT_COUNT];
  int max = ELEMENT_COUNT, i;

  srand((unsigned)time(NULL));

  for (i = 0; i < max; i += 1) {
    array[i] = rand() % VALUE_RANGE;
  }

  /* Both algorithms must see exactly the same input. */
  memcpy(array2, array, sizeof array);

  clock_t inizio = clock();
  s_sort(array, max);
  clock_t fine = clock();
  double tempoEsecuzione = (double)(fine - inizio) / CLOCKS_PER_SEC;
  printf("Selection: %2.3f\n", tempoEsecuzione);

  inizio = clock();
  i_sort(array2, max);
  fine = clock();
  tempoEsecuzione = (double)(fine - inizio) / CLOCKS_PER_SEC;
  printf("Insertion: %2.3f\n", tempoEsecuzione);
  return 0;
}

#包括“insertion_sort.h”
#包括“选择\排序.h”
#包括
#包括
#包括
#包括
int main（）{
int max=100000，i；
srand（时间（空））；
int数组[100000]，array2[100000]；
对于（i=0；i
这是我得到的结果。对我而言（gcc 4.6.3），-O3 -funroll-loops 胜出。
> gcc -o s s.c                                               
> time ./s                                                   
Elapsed time: 13
./s  13.08s user 0.00s system 99% cpu 13.088 total

> gcc -o s s.c -O1                                           
> time ./s                                                   
Elapsed time: 16
./s  16.02s user 0.00s system 99% cpu 16.042 total

> gcc -o s s.c -O2                                           
> time ./s                                                   
Elapsed time: 16
./s  16.06s user 0.00s system 99% cpu 16.076 total

> gcc -o s s.c -O3                                           
> time ./s                                                   
Elapsed time: 7
./s  7.38s user 0.00s system 99% cpu 7.381 total

> gcc -o s s.c -O3 -funroll-loops                            
> time ./s                                                   
Elapsed time: 6
./s  6.04s user 0.00s system 99% cpu 6.046 total

（注意：“已用时间”行不包括构建测试阵列所花费的时间，但可以忽略不计）。
这不是一个真正的答案，但对于注释来说太长了
您的java基准远远不是最佳的-尤其是，您不允许JVM进行足够的预热。在我的机器上，通过适当的预热，时间会减少50%（4s vs 8s）。我建议的代码（只有SelectionSort）：
publicstaticvoidmain（字符串[]args）{
SelectionSort s=新建SelectionSort（）；
int[]aWarmUp=新int[10]；
int[]a=新int[100000]；
for（int i=0；i

输出：
Before warmup 7.851840908

In loop 4.055204123

In loop 3.878436395

In loop 3.880136077

In loop 3.882814287
如果我从交换函数中删除条件判断，我的C程序可以获得100%的加速（使用gcc -O2构建）。例如：
/* Exchange the elements of `a` at indices i and j. There is no special case
 * for i == j: the element is simply written back to itself. */
static inline void swap(int* a, int i, int j) {
  int *left = &a[i];
  int *right = &a[j];
  const int held = *left;
  *left = *right;
  *right = held;
}

生成的代码可能对分支预测或缓存预取非常敏感，因此生成的代码中的细微差异（例如，受不同编译器标志的影响）可能会产生巨大影响
请注意，此程序中-fprofile-arcs的开销很小。您自己的时间测量也不包括写入分析文件，但与5秒或10秒以上的执行时间相比，写入数据所需的时间微不足道
及
/**
 * Benchmarks SelectionSort: one timed sort of a 100000-element random array
 * before any warm-up, then 10000 warm-up sorts of a 10-element array, then
 * four timed sorts, each on freshly generated random data.
 */
public static void main(String[] args) {
    final SelectionSort sorter = new SelectionSort();
    final int[] warmUpData = new int[10];
    final int[] data = new int[100000];

    for (int k = 0; k < warmUpData.length; k++) {
        warmUpData[k] = (int) (Math.random() * 100000);
    }
    for (int k = 0; k < data.length; k++) {
        data[k] = (int) (Math.random() * 100000);
    }

    measure(sorter, data, "Before warmup ");

    // Warm-up: call sort repeatedly on the small array.
    for (int remaining = 10000; remaining > 0; remaining--) {
        sorter.sort(warmUpData);
    }

    for (int run = 0; run < 4; run++) {
        // Request a collection before timing.
        System.gc();
        // The previous run left the array sorted, so randomise it again.
        int pos = 0;
        while (pos < data.length) {
            data[pos] = (int) (Math.random() * 100000);
            pos++;
        }
        measure(sorter, data, "In loop ");
        // Print one element so the sorted result is actually used.
        System.out.println(data[123]);
    }
}

/**
 * Times a single call to {@code s.sort(a)} and prints {@code msg} followed
 * by the elapsed time in seconds.
 *
 * @param s   the sorter to time
 * @param a   the array handed to {@code s.sort}
 * @param msg label printed in front of the elapsed time
 */
private static void measure(SelectionSort s, int[] a, String msg) {
    // Keep the timestamps as long: converting each nanoTime() value to
    // double before subtracting can round away low-order nanoseconds.
    long start = System.nanoTime();
    s.sort(a);
    long end = System.nanoTime();
    double time = (end - start) / 1000000000.0;
    System.out.println(msg + time);
}

/* Swap a[i] with a[j]. Equal indices are not treated specially; the value is
 * just stored back where it came from. */
static inline void swap(int* a, int i, int j) {
  const int first = a[i];
  const int second = a[j];
  a[i] = second;
  a[j] = first;
}

for (j = i + 1; j < size; j++) {
  if (a[j] < a[lowerElementIndex]) {
    lowerElementIndex = j;
}

    # Branching form of a conditional update (AT&T operand order: source, destination).
    # NOTE(review): presumably the compiled `if (a[j] < a[lowerElementIndex])` test
    # of the inner loop; the register-to-variable mapping is not shown - confirm.
    # Set flags from %ecx - %esi.
    cmpl    %esi, %ecx
    # Signed %ecx >= %esi: skip the two moves below.
    jge .L3
    # Otherwise copy %ecx into %esi ...
    movl    %ecx, %esi
    # ... and sign-extend %edx into %rdi.
    movslq  %edx, %rdi
.L3:

# Branch-free form: conditional moves instead of a jump (AT&T operand order).
# NOTE(review): presumably the same inner-loop comparison compiled with
# different options; the register-to-variable mapping is not shown - confirm.
# Set flags from %esi - %ecx.
cmpl    %ecx, %esi
# If signed %esi < %ecx: %edi = %edx.
cmovl   %edx, %edi
# If signed %esi < %ecx: %ecx = %esi.
cmovl   %esi, %ecx