Java 排序时会出现非常奇怪的效率怪癖
我目前正在学习一门数据结构课程,正如您所料,我们必须做的一件事就是编写一些常见的排序算法。在编写我的插入排序算法时,我注意到我的算法运行速度明显快于我讲师的算法(对于400000个数据点,我的算法大约需要30秒,他的算法大约需要90秒)。我通过电子邮件向他发送了我的代码,当两份代码都在同一台机器上运行时,结果依然如此。我们花了40多分钟,慢慢地把他的排序方法一点点改成我的,直到两者一字不差、完全相同,只剩下一处看似随意的差别。首先,这是我的插入排序代码:Java 排序时会出现非常奇怪的效率怪癖,java,algorithm,performance,sorting,insertion-sort,Java,Algorithm,Performance,Sorting,Insertion Sort,我目前正在学习一门数据结构课程,正如您所料,我们必须做的一件事就是编写一些常见的排序算法。在编写我的插入排序算法时,我注意到我的算法运行速度明显快于我讲师的算法(对于400000个数据点,我的算法大约需要30秒,他的算法大约需要90秒)。我通过电子邮件向他发送了我的代码,当两份代码都在同一台机器上运行时,结果依然如此。我们花了40多分钟,慢慢地把他的排序方法一点点改成我的,直到两者一字不差、完全相同,只剩下一处看似随意的差别。首先,这是我的插入排序代码: public static int[] insertionS
/**
 * Sorts {@code A} in place into ascending order using swap-based insertion sort.
 *
 * @param A the array to sort; must be non-null and contain at least one element
 * @return the same array instance that was passed in, now sorted
 * @throws IllegalArgumentException if {@code A} is {@code null} or has length zero
 */
public static int[] insertionSort(int[] A){
    // Refuse input that is missing or has nothing in it.
    boolean unpopulated = (A == null) || (A.length == 0);
    if (unpopulated) {
        throw new IllegalArgumentException("A is not populated");
    }

    for (int end = 0; end < A.length; end++) {
        // Walk A[end] leftwards, one adjacent swap at a time, until the
        // element on its left is no longer larger than it.
        for (int pos = end; pos > 0 && A[pos - 1] > A[pos]; pos--) {
            // The order of these three statements (read A[pos] first, then
            // overwrite it) is intentionally kept exactly as written.
            int held = A[pos];
            A[pos] = A[pos - 1];
            A[pos - 1] = held;
        }
    }
    return A;
}
我们发现,交换元素的那三行代码就是罪魁祸首:正是因为这三行的写法不同,我的代码运行速度才明显更快。困惑之下,我们运行了 javap -c
来获取一个简单程序的字节码,这个程序只有一个 main
方法,其中包含一个数组声明、一个 int j
的变量声明,以及分别按我的写法和他的写法编写的3行交换代码。以下是我的交换方法的字节码:
Compiled from "me.java"
public class me {
public me();
Code:
0: aload_0
1: invokespecial #1 // Method java/lang/Object."<init>":()V
4: return
public static void main(java.lang.String[]);
Code:
0: sipush 10000
3: newarray int
5: astore_1
6: bipush 10
8: istore_2
9: aload_1
10: iload_2
11: iaload
12: istore_3
13: aload_1
14: iload_2
15: aload_1
16: iload_2
17: iconst_1
18: isub
19: iaload
20: iastore
21: aload_1
22: iload_2
23: iconst_1
24: isub
25: iload_3
26: iastore
27: return
}
它们是作为两个独立的文件、在两个不同的JVM中运行的。TL;DR
你的实验并不可靠,因为有许多变量可能会影响结果。最好使用 Caliper 或 JMH 之类的微基准测试工具。我用这样的工具做了检验:
你的实现和你教授的实现之间的差别可以忽略不计
实验
对于我的实验,我有745038个数据点。我创建了3个测试:你的版本、你讲师的版本,以及JDK自带的 Arrays.sort()
根据结果,你的版本的运行时间为:1419867.808 ns
你讲师的版本为:1429798.824 ns
所以我们说的差距只有约0.01毫秒
讲师的版本只是在多次运行之间的波动更小
JDK 的 Arrays.sort() 慢得更多一些,为1779042.513纳秒,比你的版本慢了约0.36毫秒
下面是我使用 Caliper 做微基准测试的代码
package net.trajano.caliper.test;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.api.VmOptions;
import com.google.caliper.runner.CaliperMain;
/**
 * Caliper micro-benchmark that compares two swap orderings of a swap-based
 * insertion sort with {@link Arrays#sort(int[])}.
 *
 * <p>The forked benchmark JVM is started with {@code -XX:-TieredCompilation}.
 */
@VmOptions("-XX:-TieredCompilation")
public class SortBenchmark {

    /**
     * Unsorted input shared by every benchmark. It is populated once by
     * {@link #setUp()} and must never be sorted in place; each repetition
     * sorts its own copy.
     */
    private static int[] data;

    /**
     * Insertion sort using the question author's swap ordering: {@code A[j]}
     * is saved first, then overwritten.
     *
     * @param A the array to sort in place; must be non-null and non-empty
     * @return the same array instance, sorted ascending
     * @throws IllegalArgumentException if {@code A} is {@code null} or empty
     */
    public static int[] insertionSort(final int[] A) {
        // Check for illegal cases
        if (A == null || A.length == 0) {
            throw new IllegalArgumentException("A is not populated");
        }
        for (int i = 0; i < A.length; i++) {
            int j = i;
            while (j > 0 && A[j - 1] > A[j]) {
                // Statement order matters for this comparison: save A[j] first.
                final int temp = A[j];
                A[j] = A[j - 1];
                A[j - 1] = temp;
                j--;
            }
        }
        return A;
    }

    /**
     * Insertion sort using the instructor's swap ordering: {@code A[j - 1]}
     * is saved first, then overwritten.
     *
     * @param A the array to sort in place; must be non-null and non-empty
     * @return the same array instance, sorted ascending
     * @throws IllegalArgumentException if {@code A} is {@code null} or empty
     */
    public static int[] insertionSortInstructor(final int[] A) {
        // Check for illegal cases
        if (A == null || A.length == 0) {
            throw new IllegalArgumentException("A is not populated");
        }
        for (int i = 0; i < A.length; i++) {
            int j = i;
            while (j > 0 && A[j - 1] > A[j]) {
                // Statement order matters for this comparison: save A[j - 1] first.
                final int temp = A[j - 1];
                A[j - 1] = A[j];
                A[j] = temp;
                j--;
            }
        }
        return A;
    }

    /**
     * Loads the benchmark input by reading the source file as a sequence of
     * big-endian 32-bit integers.
     *
     * @throws IOException if the file cannot be opened or read
     */
    @BeforeExperiment
    void setUp() throws IOException {
        try (final DataInputStream dis = new DataInputStream(
                Files.newInputStream(Paths.get("C:/Program Files/iTunes/iTunes.exe")))) {
            final List<Integer> list = new ArrayList<>();
            while (true) {
                try {
                    list.add(dis.readInt());
                } catch (final EOFException endOfInput) {
                    // readInt() signals end of stream only by throwing; a
                    // trailing partial int (1-3 bytes) is dropped here too.
                    break;
                }
            }
            data = list.stream().mapToInt(Integer::intValue).toArray();
            System.out.println("Data size = " + data.length);
        }
    }

    /**
     * Times {@link #insertionSort(int[])}.
     *
     * @param reps number of repetitions requested by Caliper
     */
    @Benchmark
    public void insertionSort(final int reps) {
        for (int i = 0; i < reps; i++) {
            // Sort a fresh copy: sorting `data` itself would leave every later
            // repetition (and every later benchmark) measuring sorted input.
            insertionSort(data.clone());
        }
    }

    /**
     * Times {@link #insertionSortInstructor(int[])}.
     *
     * @param reps number of repetitions requested by Caliper
     */
    @Benchmark
    public void insertionSortInstructor(final int reps) {
        for (int i = 0; i < reps; i++) {
            // Fresh copy per repetition so the input is always unsorted.
            insertionSortInstructor(data.clone());
        }
    }

    /**
     * Times {@link Arrays#sort(int[])} on the same input as a baseline.
     *
     * @param reps number of repetitions requested by Caliper
     */
    @Benchmark
    public void jdkSort(final int reps) {
        for (int i = 0; i < reps; i++) {
            // Fresh copy per repetition so the input is always unsorted.
            Arrays.sort(data.clone());
        }
    }

    /**
     * Runs this benchmark class through the Caliper command-line runner.
     *
     * @param args command-line arguments forwarded to Caliper
     */
    public static void main(final String[] args) {
        CaliperMain.main(SortBenchmark.class, args);
    }
}
package net.trajano.caliper.test;
导入java.io.DataInputStream;
导入java.io.EOFException;
导入java.io.IOException;
导入java.nio.file.Files;
导入java.nio.file.path;
导入java.util.ArrayList;
导入java.util.array;
导入java.util.List;
实验前导入com.google.caliper.before;
导入com.google.caliper.Benchmark;
导入com.google.caliper.api.VmOptions;
导入com.google.caliper.runner.CaliperMain;
@VmOptions(“-XX:-分层编译”)
公共类分类标号{
公共静态int[]插入排序(最终int[]A){
//检查违法案件
如果(A==null | | A.length==0){
抛出新的IllegalArgumentException(“未填充A”);
}
for(int i=0;i0&&A[j-1]>A[j]){
最终内部温度=A[j-1];
A[j-1]=A[j];
A[j]=温度;
j--;
}
}
返回A;
}
公共静态int[]插入或构造函数(最终int[]A){
//检查违法案件
如果(A==null | | A.length==0){
抛出新的IllegalArgumentException(“未填充A”);
}
for(int i=0;i0&&A[j-1]>A[j]){
最终内部温度=A[j];
A[j]=A[j-1];
A[j-1]=温度;
j--;
}
}
返回A;
}
@实验前
void setUp()引发IOException{
try(final DataInputStream dis=new DataInputStream(
Files.newInputStream(path.get(“C:/Program Files/iTunes/iTunes.exe”)){
最终列表=新的ArrayList();
while(true){
试一试{
list.add(dis.readInt());
}捕获(最终EOFEException e){
打破
}
}
data=list.stream().mapToInt(i->i.toArray();
System.out.println(“数据大小=“+Data.length”);
}
}
//要排序的数据
私有静态int[]数据;
@基准
公共void insertionSort(最终int reps){
对于(int i=0;i
题外话
老实说,JDK反而更慢这个结果让我很惊讶,所以我去看了一下它的实现。JDK似乎会根据阈值在三种排序算法之间选择(合并排序、元素少于286个时的快速排序,以及元素少于47个时的插入排序)
由于我的数据集一开始就相当大,所以首先进行的是合并排序,它具有O(n)的空间复杂度,也就是需要数组的第二份副本。因此,多出来的时间可能来自额外的堆分配。
这是循环展开优化与公共子表达式消除共同作用的结果。根据数组访问指令的顺序不同,JIT在一种情况下可以消除冗余的加载,而在另一种情况下则不能。让我详细解释一下。在这两种情况下,JIT都会把内层循环展开4次迭代。例如,对于你的代码:
while (j > 3) {
if (A[j - 1] > A[j]) {
int temp = A[j];
A[j] = A[j - 1];
A[j - 1] = temp; \
} A[j - 1] loaded immediately after store
if (A[j - 2] > A[j - 1]) { /
int temp = A[j - 1];
A[j - 1] = A[j - 2];
A[j - 2] = temp; \
} A[j - 2] loaded immediately after store
if (A[j - 3] > A[j - 2]) { /
int temp = A[j - 2];
A[j - 2] = A[j - 3];
A[j - 3] = temp; \
} A[j - 3] loaded immediately after store
if (A[j - 4] > A[j - 3]) { /
int temp = A[j - 3];
A[j - 3] = A[j - 4];
A[j - 4] = temp;
}
j -= 4;
}
然后JIT消除了冗余的数组加载,生成的汇编代码如下所示
0x0000000002d53a70: movslq %r11d,%r10
0x0000000002d53a73: lea 0x0(%rbp,%r10,4),%r10
0x0000000002d53a78: mov 0x10(%r10),%ebx ; ebx = A[j]
0x0000000002d53a7c: mov 0xc(%r10),%r9d ; r9d = A[j - 1]
0x0000000002d53a80: cmp %ebx,%r9d ; if (r9d > ebx) {
0x0000000002d53a83: jle 0x0000000002d539f3
0x0000000002d53a89: mov %r9d,0x10(%r10) ; A[j] = r9d
0x0000000002d53a8d: mov %ebx,0xc(%r10) ; A[j - 1] = ebx
; }
0x0000000002d53a91: mov 0x8(%r10),%r9d ; r9d = A[j - 2]
0x0000000002d53a95: cmp %ebx,%r9d ; if (r9d > ebx) {
0x0000000002d53a98: jle 0x0000000002d539f3
0x0000000002d53a9e: mov %r9d,0xc(%r10) ; A[j - 1] = r9d
0x0000000002d53aa2: mov %ebx,0x8(%r10) ; A[j - 2] = ebx
; }
0x0000000002d53aa6: mov 0x4(%r10),%r9d ; r9d = A[j - 3]
0x0000000002d53aaa: cmp %ebx,%r9d ; if (r9d > ebx) {
0x0000000002d53aad: jle 0x0000000002d539f3
0x0000000002d53ab3: mov %r9d,0x8(%r10) ; A[j - 2] = r9d
0x0000000002d53ab7: mov %ebx,0x4(%r10) ; A[j - 3] = ebx
; }
0x0000000002d53abb: mov (%r10),%r8d ; r8d = A[j - 4]
0x0000000002d53abe: cmp %ebx,%r8d ; if (r8d > ebx) {
0x0000000002d53ac1: jle 0x0000000002d539f3
0x0000000002d53ac7: mov %r8d,0x4(%r10) ; A[j - 3] = r8
0x0000000002d53acb: mov %ebx,(%r10) ; A[j - 4] = ebx
; }
0x0000000002d53ace: add $0xfffffffc,%r11d ; j -= 4
0x0000000002d53ad2: cmp $0x3,%r11d ; while (j > 3)
0x0000000002d53ad6: jg 0x0000000002d53a70
循环展开后,讲师的代码看起来会有所不同:
while (j > 3) {
if (A[j - 1] > A[j]) {
int temp = A[j - 1];
A[j - 1] = A[j];
A[j] = temp; <-- another store instruction between A[j - 1] access
}
if (A[j - 2] > A[j - 1]) {
int temp = A[j - 2];
A[j - 2] = A[j - 1];
A[j - 1] = temp;
}
...
请注意,如果在禁用循环展开优化的情况下运行JVM(-XX:LoopUnrollLimit=0
),则两种写法的性能将会相同。
My insertion sort took 37680.0 milliseconds.
Other insertion sort took 86358.0 milliseconds.
package net.trajano.caliper.test;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import com.google.caliper.BeforeExperiment;
import com.google.caliper.Benchmark;
import com.google.caliper.api.VmOptions;
import com.google.caliper.runner.CaliperMain;
/**
 * Caliper micro-benchmark that compares two swap orderings of a swap-based
 * insertion sort with {@link Arrays#sort(int[])}.
 *
 * <p>The forked benchmark JVM is started with {@code -XX:-TieredCompilation}.
 */
@VmOptions("-XX:-TieredCompilation")
public class SortBenchmark {

    /**
     * Unsorted input shared by every benchmark. It is populated once by
     * {@link #setUp()} and must never be sorted in place; each repetition
     * sorts its own copy.
     */
    private static int[] data;

    /**
     * Insertion sort using the question author's swap ordering: {@code A[j]}
     * is saved first, then overwritten.
     *
     * @param A the array to sort in place; must be non-null and non-empty
     * @return the same array instance, sorted ascending
     * @throws IllegalArgumentException if {@code A} is {@code null} or empty
     */
    public static int[] insertionSort(final int[] A) {
        // Check for illegal cases
        if (A == null || A.length == 0) {
            throw new IllegalArgumentException("A is not populated");
        }
        for (int i = 0; i < A.length; i++) {
            int j = i;
            while (j > 0 && A[j - 1] > A[j]) {
                // Statement order matters for this comparison: save A[j] first.
                final int temp = A[j];
                A[j] = A[j - 1];
                A[j - 1] = temp;
                j--;
            }
        }
        return A;
    }

    /**
     * Insertion sort using the instructor's swap ordering: {@code A[j - 1]}
     * is saved first, then overwritten.
     *
     * @param A the array to sort in place; must be non-null and non-empty
     * @return the same array instance, sorted ascending
     * @throws IllegalArgumentException if {@code A} is {@code null} or empty
     */
    public static int[] insertionSortInstructor(final int[] A) {
        // Check for illegal cases
        if (A == null || A.length == 0) {
            throw new IllegalArgumentException("A is not populated");
        }
        for (int i = 0; i < A.length; i++) {
            int j = i;
            while (j > 0 && A[j - 1] > A[j]) {
                // Statement order matters for this comparison: save A[j - 1] first.
                final int temp = A[j - 1];
                A[j - 1] = A[j];
                A[j] = temp;
                j--;
            }
        }
        return A;
    }

    /**
     * Loads the benchmark input by reading the source file as a sequence of
     * big-endian 32-bit integers.
     *
     * @throws IOException if the file cannot be opened or read
     */
    @BeforeExperiment
    void setUp() throws IOException {
        try (final DataInputStream dis = new DataInputStream(
                Files.newInputStream(Paths.get("C:/Program Files/iTunes/iTunes.exe")))) {
            final List<Integer> list = new ArrayList<>();
            while (true) {
                try {
                    list.add(dis.readInt());
                } catch (final EOFException endOfInput) {
                    // readInt() signals end of stream only by throwing; a
                    // trailing partial int (1-3 bytes) is dropped here too.
                    break;
                }
            }
            data = list.stream().mapToInt(Integer::intValue).toArray();
            System.out.println("Data size = " + data.length);
        }
    }

    /**
     * Times {@link #insertionSort(int[])}.
     *
     * @param reps number of repetitions requested by Caliper
     */
    @Benchmark
    public void insertionSort(final int reps) {
        for (int i = 0; i < reps; i++) {
            // Sort a fresh copy: sorting `data` itself would leave every later
            // repetition (and every later benchmark) measuring sorted input.
            insertionSort(data.clone());
        }
    }

    /**
     * Times {@link #insertionSortInstructor(int[])}.
     *
     * @param reps number of repetitions requested by Caliper
     */
    @Benchmark
    public void insertionSortInstructor(final int reps) {
        for (int i = 0; i < reps; i++) {
            // Fresh copy per repetition so the input is always unsorted.
            insertionSortInstructor(data.clone());
        }
    }

    /**
     * Times {@link Arrays#sort(int[])} on the same input as a baseline.
     *
     * @param reps number of repetitions requested by Caliper
     */
    @Benchmark
    public void jdkSort(final int reps) {
        for (int i = 0; i < reps; i++) {
            // Fresh copy per repetition so the input is always unsorted.
            Arrays.sort(data.clone());
        }
    }

    /**
     * Runs this benchmark class through the Caliper command-line runner.
     *
     * @param args command-line arguments forwarded to Caliper
     */
    public static void main(final String[] args) {
        CaliperMain.main(SortBenchmark.class, args);
    }
}
while (j > 3) {
if (A[j - 1] > A[j]) {
int temp = A[j];
A[j] = A[j - 1];
A[j - 1] = temp; \
} A[j - 1] loaded immediately after store
if (A[j - 2] > A[j - 1]) { /
int temp = A[j - 1];
A[j - 1] = A[j - 2];
A[j - 2] = temp; \
} A[j - 2] loaded immediately after store
if (A[j - 3] > A[j - 2]) { /
int temp = A[j - 2];
A[j - 2] = A[j - 3];
A[j - 3] = temp; \
} A[j - 3] loaded immediately after store
if (A[j - 4] > A[j - 3]) { /
int temp = A[j - 3];
A[j - 3] = A[j - 4];
A[j - 4] = temp;
}
j -= 4;
}
0x0000000002d53a70: movslq %r11d,%r10
0x0000000002d53a73: lea 0x0(%rbp,%r10,4),%r10
0x0000000002d53a78: mov 0x10(%r10),%ebx ; ebx = A[j]
0x0000000002d53a7c: mov 0xc(%r10),%r9d ; r9d = A[j - 1]
0x0000000002d53a80: cmp %ebx,%r9d ; if (r9d > ebx) {
0x0000000002d53a83: jle 0x0000000002d539f3
0x0000000002d53a89: mov %r9d,0x10(%r10) ; A[j] = r9d
0x0000000002d53a8d: mov %ebx,0xc(%r10) ; A[j - 1] = ebx
; }
0x0000000002d53a91: mov 0x8(%r10),%r9d ; r9d = A[j - 2]
0x0000000002d53a95: cmp %ebx,%r9d ; if (r9d > ebx) {
0x0000000002d53a98: jle 0x0000000002d539f3
0x0000000002d53a9e: mov %r9d,0xc(%r10) ; A[j - 1] = r9d
0x0000000002d53aa2: mov %ebx,0x8(%r10) ; A[j - 2] = ebx
; }
0x0000000002d53aa6: mov 0x4(%r10),%r9d ; r9d = A[j - 3]
0x0000000002d53aaa: cmp %ebx,%r9d ; if (r9d > ebx) {
0x0000000002d53aad: jle 0x0000000002d539f3
0x0000000002d53ab3: mov %r9d,0x8(%r10) ; A[j - 2] = r9d
0x0000000002d53ab7: mov %ebx,0x4(%r10) ; A[j - 3] = ebx
; }
0x0000000002d53abb: mov (%r10),%r8d ; r8d = A[j - 4]
0x0000000002d53abe: cmp %ebx,%r8d ; if (r8d > ebx) {
0x0000000002d53ac1: jle 0x0000000002d539f3
0x0000000002d53ac7: mov %r8d,0x4(%r10) ; A[j - 3] = r8
0x0000000002d53acb: mov %ebx,(%r10) ; A[j - 4] = ebx
; }
0x0000000002d53ace: add $0xfffffffc,%r11d ; j -= 4
0x0000000002d53ad2: cmp $0x3,%r11d ; while (j > 3)
0x0000000002d53ad6: jg 0x0000000002d53a70
while (j > 3) {
if (A[j - 1] > A[j]) {
int temp = A[j - 1];
A[j - 1] = A[j];
A[j] = temp; <-- another store instruction between A[j - 1] access
}
if (A[j - 2] > A[j - 1]) {
int temp = A[j - 2];
A[j - 2] = A[j - 1];
A[j - 1] = temp;
}
...
0x0000000002b53a00: cmp %r8d,%r10d ; if (r10d > r8d) {
0x0000000002b53a03: jle 0x0000000002b53973
0x0000000002b53a09: mov %r8d,0xc(%rbx) ; A[j - 1] = r8d
0x0000000002b53a0d: mov %r10d,0x10(%rbx) ; A[j] = r10d
; }
0x0000000002b53a11: mov 0xc(%rbx),%r10d ; r10d = A[j - 1]
0x0000000002b53a15: mov 0x8(%rbx),%r9d ; r9d = A[j - 2]
0x0000000002b53a19: cmp %r10d,%r9d ; if (r9d > r10d) {
0x0000000002b53a1c: jle 0x0000000002b53973
0x0000000002b53a22: mov %r10d,0x8(%rbx) ; A[j - 2] = r10d
0x0000000002b53a26: mov %r9d,0xc(%rbx) ; A[j - 1] = r9d
; }
0x0000000002b53a2a: mov 0x8(%rbx),%r8d ; r8d = A[j - 2]
0x0000000002b53a2e: mov 0x4(%rbx),%r10d ; r10d = A[j - 3]