Performance 了解英特尔CPU上的4K混叠';s

Performance 了解英特尔CPU上的4K混叠';s,performance,x86,intel,cpu-architecture,Performance,X86,Intel,Cpu Architecture,我一直在读关于Intel CPU上地址位6到11的模糊性导致加载/存储重叠导致4K混叠的文章。因此,我正在尝试编写各种简单的测试(在i7-3770k、Win7、64位、VS2017上)来专门解决这个问题,以确保我在实践中理解它 我一直在尝试但未能证明行为的第一个测试是: void test4kaalising1() { typedef float Value;//也尝试使用double const uint32\u t ValueCount=1024; 常数uint32\u t OffsetCo

我一直在读关于Intel CPU上地址位6到11的模糊性导致加载/存储重叠导致4K混叠的文章。因此,我正在尝试编写各种简单的测试(在i7-3770k、Win7、64位、VS2017上)来专门解决这个问题,以确保我在实践中理解它

我一直在尝试但未能证明行为的第一个测试是:

void test4kaalising1()
{
typedef float Value;//也尝试使用double
const uint32\u t ValueCount=1024;
常数uint32\u t OffsetCount=256;
const uint32_t TestCount=512;
值*a=(值*)\u对齐\u malloc(值计数*sizeof(值),4096);
值*b=(值*)\u对齐\u malloc(值计数*sizeof(值),4096);
对于(uint32_t i=0;i
其灵感来源于:

因此,我不太清楚为什么这没有显示出由此产生的计时问题?正如我所想的那样,由于无序执行,在循环迭代过程中会发生向/从不明确地址加载的存储

生成的程序集是:

000000013F2510E4  cpuid  
000000013F2510E6  rdtsc  
000000013F2510E8  shl         rdx,20h  
000000013F2510EC  mov         r9d,200h  
000000013F2510F2  or          rax,rdx  
000000013F2510F5  mov         r10,rax  
000000013F2510F8  nop         dword ptr [rax+rax]  
000000013F251100  lea         ebx,[rsi+1]  
000000013F251103  mov         r8d,80h  
000000013F251109  lea         rdx,[r14+8]  
000000013F25110D  nop         dword ptr [rax]  
000000013F251110  mov         rax,rbx  
000000013F251113  lea         ecx,[rbx-1]  
000000013F251116  and         eax,3FFh  
000000013F25111B  lea         rdx,[rdx+20h]  
000000013F25111F  and         ecx,3FFh  
000000013F251125  vmulss      xmm1,xmm6,dword ptr [rdi+rcx*4]  
000000013F25112A  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F25112F  vmovss      dword ptr [rdx-28h],xmm2  
000000013F251134  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251139  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F25113E  vmovss      dword ptr [rdx-24h],xmm2  
000000013F251143  lea         eax,[rbx+1]  
000000013F251146  and         eax,3FFh  
000000013F25114B  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251150  vaddss      xmm2,xmm1,dword ptr [rdx-20h]  
000000013F251155  vmovss      dword ptr [rdx-20h],xmm2  
000000013F25115A  lea         eax,[rbx+2]  
000000013F25115D  and         eax,3FFh  
000000013F251162  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251167  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]  
000000013F25116C  vmovss      dword ptr [rdx-1Ch],xmm2  
000000013F251171  lea         eax,[rbx+3]  
000000013F251174  and         eax,3FFh  
000000013F251179  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F25117E  vaddss      xmm2,xmm1,dword ptr [rdx-18h]  
000000013F251183  vmovss      dword ptr [rdx-18h],xmm2  
000000013F251188  lea         eax,[rbx+4]  
000000013F25118B  and         eax,3FFh  
000000013F251190  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251195  vaddss      xmm2,xmm1,dword ptr [rdx-14h]  
000000013F25119A  vmovss      dword ptr [rdx-14h],xmm2  
000000013F25119F  lea         eax,[rbx+5]  
000000013F2511A2  and         eax,3FFh  
000000013F2511A7  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F2511AC  vaddss      xmm2,xmm1,dword ptr [rdx-10h]  
000000013F2511B1  lea         eax,[rbx+6]  
000000013F2511B4  add         ebx,8  
000000013F2511B7  vmovss      dword ptr [rdx-10h],xmm2  
000000013F2511BC  and         eax,3FFh  
000000013F2511C1  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F2511C6  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]  
000000013F2511CB  vmovss      dword ptr [rdx-0Ch],xmm2  
000000013F2511D0  sub         r8,1  
000000013F2511D4  jne         Test4KAliasing1+0B0h (013F251110h)  
000000013F2511DA  sub         r9,1  
000000013F2511DE  jne         Test4KAliasing1+0A0h (013F251100h)  
000000013F2511E4  rdtsc  
000000013F5110F6  cpuid  
000000013F5110F8  rdtsc  
000000013F5110FA  shl         rdx,20h  
000000013F5110FE  mov         r8d,200h  
000000013F511104  or          rax,rdx  
000000013F511107  mov         r10,rax  
000000013F51110A  nop         word ptr [rax+rax]  
000000013F511110  lea         ebx,[rbp+1]  
000000013F511113  mov         r9d,100h  
000000013F511119  lea         rdx,[r13+8]  
000000013F51111D  nop         dword ptr [rax]  
000000013F511120  mov         rax,rbx  
000000013F511123  lea         ecx,[rbx-1]  
000000013F511126  and         eax,7FFh  
000000013F51112B  lea         rdx,[rdx+20h]  
000000013F51112F  and         ecx,7FFh  
000000013F511135  vmovss      xmm0,dword ptr [rsi+rcx*4]  
000000013F51113A  vaddss      xmm1,xmm0,dword ptr [rdi+rcx*4]  
000000013F51113F  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F511144  vmovss      dword ptr [rdx-28h],xmm2  
000000013F511149  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F51114E  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F511153  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F511158  vmovss      dword ptr [rdx-24h],xmm2  
000000013F51115D  lea         eax,[rbx+1]  
000000013F511160  and         eax,7FFh  
000000013F511165  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F51116A  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F51116F  vaddss      xmm2,xmm1,dword ptr [rdx-20h]  
000000013F511174  vmovss      dword ptr [rdx-20h],xmm2  
000000013F511179  lea         eax,[rbx+2]  
000000013F51117C  and         eax,7FFh  
000000013F511181  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F511186  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F51118B  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]  
000000013F511190  vmovss      dword ptr [rdx-1Ch],xmm2  
000000013F511195  lea         eax,[rbx+3]  
000000013F511198  and         eax,7FFh  
000000013F51119D  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111A2  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111A7  vaddss      xmm2,xmm1,dword ptr [rdx-18h]  
000000013F5111AC  vmovss      dword ptr [rdx-18h],xmm2  
000000013F5111B1  lea         eax,[rbx+4]  
000000013F5111B4  and         eax,7FFh  
000000013F5111B9  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111BE  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111C3  vaddss      xmm2,xmm1,dword ptr [rdx-14h]  
000000013F5111C8  vmovss      dword ptr [rdx-14h],xmm2  
000000013F5111CD  lea         eax,[rbx+5]  
000000013F5111D0  and         eax,7FFh  
000000013F5111D5  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111DA  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111DF  vaddss      xmm2,xmm1,dword ptr [rdx-10h]  
000000013F5111E4  lea         eax,[rbx+6]  
000000013F5111E7  add         ebx,8  
000000013F5111EA  vmovss      dword ptr [rdx-10h],xmm2  
000000013F5111EF  and         eax,7FFh  
000000013F5111F4  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111F9  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111FE  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]  
000000013F511203  vmovss      dword ptr [rdx-0Ch],xmm2  
000000013F511208  sub         r9,1  
000000013F51120C  jne         Test4KAliasing2+0C0h (013F511120h)  
000000013F511212  sub         r8,1  
000000013F511216  jne         Test4KAliasing2+0B0h (013F511110h)  
000000013F51121C  rdtsc  
在网络上,我也看到了各种各样的描述,说底部的12位必须匹配这个别名,而在其他地方只有6到11位?由于最低的6位是缓存线中的字节索引,并且所有内容都是基于缓存线的,那么我会认为它只需要6到11位就可以匹配

编辑:

根据彼得斯的回答,我也尝试过:

a[i] *= 1.234f;
b[j] += 4.321f;
这似乎没有显示问题,并产生:

000000013F6C10E8  cpuid  
000000013F6C10EA  rdtsc  
000000013F6C10EC  shl         rdx,20h  
000000013F6C10F0  mov         ebx,200h  
000000013F6C10F5  or          rax,rdx  
000000013F6C10F8  mov         r9,rax  
000000013F6C10FB  nop         dword ptr [rax+rax]  
000000013F6C1100  lea         edx,[rsi+1]  
000000013F6C1103  mov         r8d,80h  
000000013F6C1109  lea         rcx,[r14+8]  
000000013F6C110D  nop         dword ptr [rax]  
000000013F6C1110  vmulss      xmm1,xmm6,dword ptr [rcx-8]  
000000013F6C1115  vmovss      dword ptr [rcx-8],xmm1  
000000013F6C111A  lea         eax,[rdx-1]  
000000013F6C111D  and         eax,3FFh  
000000013F6C1122  lea         rcx,[rcx+20h]  
000000013F6C1126  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C112B  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C1130  vmulss      xmm1,xmm6,dword ptr [rcx-24h]  
000000013F6C1135  vmovss      dword ptr [rcx-24h],xmm1  
000000013F6C113A  mov         rax,rdx  
000000013F6C113D  and         eax,3FFh  
000000013F6C1142  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C1147  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C114C  vmulss      xmm0,xmm6,dword ptr [rcx-20h]  
000000013F6C1151  lea         eax,[rdx+1]  
000000013F6C1154  and         eax,3FFh  
000000013F6C1159  vmovss      dword ptr [rcx-20h],xmm0  
000000013F6C115E  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C1163  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C1168  vmulss      xmm1,xmm6,dword ptr [rcx-1Ch]  
000000013F6C116D  vmovss      dword ptr [rcx-1Ch],xmm1  
000000013F6C1172  lea         eax,[rdx+2]  
000000013F6C1175  and         eax,3FFh  
000000013F6C117A  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C117F  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C1184  vmulss      xmm1,xmm6,dword ptr [rcx-18h]  
000000013F6C1189  vmovss      dword ptr [rcx-18h],xmm1  
000000013F6C118E  lea         eax,[rdx+3]  
000000013F6C1191  and         eax,3FFh  
000000013F6C1196  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C119B  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11A0  vmulss      xmm1,xmm6,dword ptr [rcx-14h]  
000000013F6C11A5  vmovss      dword ptr [rcx-14h],xmm1  
000000013F6C11AA  lea         eax,[rdx+4]  
000000013F6C11AD  and         eax,3FFh  
000000013F6C11B2  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C11B7  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11BC  vmulss      xmm1,xmm6,dword ptr [rcx-10h]  
000000013F6C11C1  lea         eax,[rdx+5]  
000000013F6C11C4  and         eax,3FFh  
000000013F6C11C9  vmovss      dword ptr [rcx-10h],xmm1  
000000013F6C11CE  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C11D3  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11D8  vmulss      xmm1,xmm6,dword ptr [rcx-0Ch]  
000000013F6C11DD  lea         eax,[rdx+6]  
000000013F6C11E0  add         edx,8  
000000013F6C11E3  and         eax,3FFh  
000000013F6C11E8  vmovss      dword ptr [rcx-0Ch],xmm1  
000000013F6C11ED  vaddss      xmm1,xmm7,dword ptr [rdi+rax*4]  
000000013F6C11F2  vmovss      dword ptr [rdi+rax*4],xmm1  
000000013F6C11F7  sub         r8,1  
000000013F6C11FB  jne         Test4KAliasing1+0B0h (013F6C1110h)  
000000013F6C1201  sub         rbx,1  
000000013F6C1205  jne         Test4KAliasing1+0A0h (013F6C1100h)  
000000013F6C120B  rdtsc 
同样基于Peter提到的链接问题,我尝试了3个数组:

a[i] += b[j] + c[j];
这似乎也没有问题。生成的代码是:

000000013F2510E4  cpuid  
000000013F2510E6  rdtsc  
000000013F2510E8  shl         rdx,20h  
000000013F2510EC  mov         r9d,200h  
000000013F2510F2  or          rax,rdx  
000000013F2510F5  mov         r10,rax  
000000013F2510F8  nop         dword ptr [rax+rax]  
000000013F251100  lea         ebx,[rsi+1]  
000000013F251103  mov         r8d,80h  
000000013F251109  lea         rdx,[r14+8]  
000000013F25110D  nop         dword ptr [rax]  
000000013F251110  mov         rax,rbx  
000000013F251113  lea         ecx,[rbx-1]  
000000013F251116  and         eax,3FFh  
000000013F25111B  lea         rdx,[rdx+20h]  
000000013F25111F  and         ecx,3FFh  
000000013F251125  vmulss      xmm1,xmm6,dword ptr [rdi+rcx*4]  
000000013F25112A  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F25112F  vmovss      dword ptr [rdx-28h],xmm2  
000000013F251134  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251139  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F25113E  vmovss      dword ptr [rdx-24h],xmm2  
000000013F251143  lea         eax,[rbx+1]  
000000013F251146  and         eax,3FFh  
000000013F25114B  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251150  vaddss      xmm2,xmm1,dword ptr [rdx-20h]  
000000013F251155  vmovss      dword ptr [rdx-20h],xmm2  
000000013F25115A  lea         eax,[rbx+2]  
000000013F25115D  and         eax,3FFh  
000000013F251162  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251167  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]  
000000013F25116C  vmovss      dword ptr [rdx-1Ch],xmm2  
000000013F251171  lea         eax,[rbx+3]  
000000013F251174  and         eax,3FFh  
000000013F251179  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F25117E  vaddss      xmm2,xmm1,dword ptr [rdx-18h]  
000000013F251183  vmovss      dword ptr [rdx-18h],xmm2  
000000013F251188  lea         eax,[rbx+4]  
000000013F25118B  and         eax,3FFh  
000000013F251190  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251195  vaddss      xmm2,xmm1,dword ptr [rdx-14h]  
000000013F25119A  vmovss      dword ptr [rdx-14h],xmm2  
000000013F25119F  lea         eax,[rbx+5]  
000000013F2511A2  and         eax,3FFh  
000000013F2511A7  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F2511AC  vaddss      xmm2,xmm1,dword ptr [rdx-10h]  
000000013F2511B1  lea         eax,[rbx+6]  
000000013F2511B4  add         ebx,8  
000000013F2511B7  vmovss      dword ptr [rdx-10h],xmm2  
000000013F2511BC  and         eax,3FFh  
000000013F2511C1  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F2511C6  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]  
000000013F2511CB  vmovss      dword ptr [rdx-0Ch],xmm2  
000000013F2511D0  sub         r8,1  
000000013F2511D4  jne         Test4KAliasing1+0B0h (013F251110h)  
000000013F2511DA  sub         r9,1  
000000013F2511DE  jne         Test4KAliasing1+0A0h (013F251100h)  
000000013F2511E4  rdtsc  
000000013F5110F6  cpuid  
000000013F5110F8  rdtsc  
000000013F5110FA  shl         rdx,20h  
000000013F5110FE  mov         r8d,200h  
000000013F511104  or          rax,rdx  
000000013F511107  mov         r10,rax  
000000013F51110A  nop         word ptr [rax+rax]  
000000013F511110  lea         ebx,[rbp+1]  
000000013F511113  mov         r9d,100h  
000000013F511119  lea         rdx,[r13+8]  
000000013F51111D  nop         dword ptr [rax]  
000000013F511120  mov         rax,rbx  
000000013F511123  lea         ecx,[rbx-1]  
000000013F511126  and         eax,7FFh  
000000013F51112B  lea         rdx,[rdx+20h]  
000000013F51112F  and         ecx,7FFh  
000000013F511135  vmovss      xmm0,dword ptr [rsi+rcx*4]  
000000013F51113A  vaddss      xmm1,xmm0,dword ptr [rdi+rcx*4]  
000000013F51113F  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F511144  vmovss      dword ptr [rdx-28h],xmm2  
000000013F511149  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F51114E  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F511153  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F511158  vmovss      dword ptr [rdx-24h],xmm2  
000000013F51115D  lea         eax,[rbx+1]  
000000013F511160  and         eax,7FFh  
000000013F511165  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F51116A  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F51116F  vaddss      xmm2,xmm1,dword ptr [rdx-20h]  
000000013F511174  vmovss      dword ptr [rdx-20h],xmm2  
000000013F511179  lea         eax,[rbx+2]  
000000013F51117C  and         eax,7FFh  
000000013F511181  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F511186  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F51118B  vaddss      xmm2,xmm1,dword ptr [rdx-1Ch]  
000000013F511190  vmovss      dword ptr [rdx-1Ch],xmm2  
000000013F511195  lea         eax,[rbx+3]  
000000013F511198  and         eax,7FFh  
000000013F51119D  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111A2  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111A7  vaddss      xmm2,xmm1,dword ptr [rdx-18h]  
000000013F5111AC  vmovss      dword ptr [rdx-18h],xmm2  
000000013F5111B1  lea         eax,[rbx+4]  
000000013F5111B4  and         eax,7FFh  
000000013F5111B9  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111BE  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111C3  vaddss      xmm2,xmm1,dword ptr [rdx-14h]  
000000013F5111C8  vmovss      dword ptr [rdx-14h],xmm2  
000000013F5111CD  lea         eax,[rbx+5]  
000000013F5111D0  and         eax,7FFh  
000000013F5111D5  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111DA  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111DF  vaddss      xmm2,xmm1,dword ptr [rdx-10h]  
000000013F5111E4  lea         eax,[rbx+6]  
000000013F5111E7  add         ebx,8  
000000013F5111EA  vmovss      dword ptr [rdx-10h],xmm2  
000000013F5111EF  and         eax,7FFh  
000000013F5111F4  vmovss      xmm0,dword ptr [rsi+rax*4]  
000000013F5111F9  vaddss      xmm1,xmm0,dword ptr [rdi+rax*4]  
000000013F5111FE  vaddss      xmm2,xmm1,dword ptr [rdx-0Ch]  
000000013F511203  vmovss      dword ptr [rdx-0Ch],xmm2  
000000013F511208  sub         r9,1  
000000013F51120C  jne         Test4KAliasing2+0C0h (013F511120h)  
000000013F511212  sub         r8,1  
000000013F511216  jne         Test4KAliasing2+0B0h (013F511110h)  
000000013F51121C  rdtsc  
继Peter对其答案的评论/更新之后,我尝试:

a[i] *= 1.234f;
b[i] += 4.321f;
这并没有显示出问题所在。注意:我试图改变I的偏移量,j=I+偏移量从零偏移量开始,之前的大多数尝试都是为了看看什么偏移量可以缓解问题,如果我能找到它的话。(由于x86已经生锈,我仍在这里深入了解反汇编以了解地址生成)

典型的正时运行:

a[i] *= 1.234f;
b[i] += 4.321f;
是:

然而:我认为我犯了一个错误,现在我发现:

a[i] *= 1.234f;
b[j] += 4.321f;
现在,典型的计时运行是:

time: 2794      offset: 0
time: 2737      offset: 1
time: 2655      offset: 2
time: 2748      offset: 3
time: 2605      offset: 4
time: 2730      offset: 5
time: 2665      offset: 6
time: 2703      offset: 7
time: 2571      offset: 8
time: 2558      offset: 9
time: 2213      offset: 10
time: 2200      offset: 11
time: 2325      offset: 12
time: 2200      offset: 13
time: 2200      offset: 14
time: 2264      offset: 15
time: 2264      offset: 16
time: 2355      offset: 17
time: 2348      offset: 18
time: 2262      offset: 19
time: 2260      offset: 20
time: 2262      offset: 21
time: 2260      offset: 22
time: 2490      offset: 23
time: 2261      offset: 24
time: 2260      offset: 25
time: 2255      offset: 26
time: 2261      offset: 27
time: 2263      offset: 28
time: 2260      offset: 29
time: 2260      offset: 30
time: 2262      offset: 31
time: 2264      offset: 32
time: 2355      offset: 33
time: 2266      offset: 34
time: 2270      offset: 35
time: 2260      offset: 36
time: 2268      offset: 37
time: 2260      offset: 38
time: 2260      offset: 39
time: 2262      offset: 40
time: 2260      offset: 41
time: 2259      offset: 42
time: 2260      offset: 43
time: 2260      offset: 44
time: 2255      offset: 45
time: 2260      offset: 46
time: 2265      offset: 47
time: 2263      offset: 48
time: 2355      offset: 49
time: 2293      offset: 50
time: 2204      offset: 51
time: 2323      offset: 52
time: 2200      offset: 53
time: 2200      offset: 54
time: 2460      offset: 55
time: 2200      offset: 56

偏移量越大,相差约20%。

底部的12位是位
[11:0]
。第11位是第12位,因为我们从0开始计数

CPU通过字节粒度检测加载/存储别名,而不仅仅是加载是否访问与旧存储相同的缓存线。存储到
array[1]
不会降低
array[2]
的加载速度;这对性能来说真的很糟糕,因为在数组中循环并一次对每个元素进行RMWing是一种非常常见的模式。(没有软件管道在存储位置之前加载多个元素。)

因此,我认为您在这里没有遇到问题,因为您只是从4k页面中的同一偏移量加载后存储到一个位置。如果您执行类似于此简单循环的操作(不需要额外跨步或偏移到另一个额外页面,只需在不同页面中使用两个数组即可。)

(i=0;i{ a[i]*=1.234; b[i]+=4.321;//从我们刚才写的同一偏移量加载,但在另一页中 } 编译器在加载
b
之前将存储到
a
的asm生成,您可能会遇到问题,因为
a
b
相对于4k页面具有相同的对齐方式

(如果编译器证明
A!=b
,则可以在存储之前执行这两种加载,或者在运行执行此操作的循环版本之前发出要检查的代码。如果编译器通过向量宽度乘以展开因子检查重叠,则可以使用自动矢量化和/或展开。)

这并不是一个完美的例子,但是让
b
的加载依赖于
a
的存储应该会使无序执行至少努力隐藏那么多延迟


另一种创建4k别名的简单方法是memcpy,从
src=srcpage
dst=dstpage+16
或其他什么东西,当然srcpage和dstpage都是对齐的。对
dst[i]
的存储类似于
dstpage[i+16]
(以字节为单位,而不是以C元素大小为单位),因此存储
dst[i]
将发生在从
src[i+16]
加载之前(按程序顺序)。当循环到达该
i
值时,负载将被4k别名阻止


请参阅@HadiBrais对包括IvyBridge(如i7-3770k)在内的CPU进行性能分析的示例。

您似乎已经使用选项/O2和/arch:AVX编译了代码。编译器已将内部循环展开8次,您可以看到有8个序列
vmulss/vaddss/vmovss
a
b
数组的地址分别存储在寄存器
rdx
rdi
中。第一条指令加载单个元素并将其乘以常数,第二条指令将结果添加到另一个数组中的对应元素,第三条指令将结果存储到同一位置。考虑两个这样的序列:

000000013F251125  vmulss      xmm1,xmm6,dword ptr [rdi+rcx*4]  
000000013F25112A  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F25112F  vmovss      dword ptr [rdx-28h],xmm2  
000000013F251134  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251139  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F25113E  vmovss      dword ptr [rdx-24h],xmm2  
rcx
rax
初始化为zer
000000013F251125  vmulss      xmm1,xmm6,dword ptr [rdi+rcx*4]  
000000013F25112A  vaddss      xmm2,xmm1,dword ptr [rdx-28h]  
000000013F25112F  vmovss      dword ptr [rdx-28h],xmm2  
000000013F251134  vmulss      xmm1,xmm6,dword ptr [rdi+rax*4]  
000000013F251139  vaddss      xmm2,xmm1,dword ptr [rdx-24h]  
000000013F25113E  vmovss      dword ptr [rdx-24h],xmm2  
load 0x2000
load 0x1000
store 0x1000
load 0x2004
load 0x1004
store 0x1004
a[i] *= 1.234f;
b[j] += 4.321f;
load 0x1000
store 0x1000
load 0x2000
store 0x1000