Performance x86:长循环携带的依赖链。为什么是 13 个周期?
(标签:performance, optimization, x86)
我修改了 Agner Fog 示例中的代码,使其依赖性更强:
; First variant (per the update below, this one actually runs at ~9c/iter):
; both multiplies read the xmm1 produced by the PREVIOUS iteration's addsd,
; so they can start (almost) in parallel.
movsd xmm2, [x]        ; xmm2 = x
movsd xmm1, [one]      ; xmm1 = 1.0 (running power x^i)
xorps xmm0, xmm0       ; xmm0 = 0.0 (accumulator; not updated in this variant)
mov eax, coeff         ; eax -> first coefficient
L1:
movsd xmm3, [eax]      ; load c[i]; independent of the FP chain
mulsd xmm3, xmm1       ; A: c[i] * x^i   (reads last iteration's xmm1)
mulsd xmm1, xmm2       ; B: x^(i+1)      (also reads last iteration's xmm1)
addsd xmm1, xmm3       ; C: folds A back into the power chain ->
                       ;    loop-carried chain = mul(5c) + add(3c)
add eax, 8             ; advance to next double
cmp eax, coeff_end     ; end of array?
jb L1                  ; loop while below coeff_end
现在每次迭代大约需要13个周期,但我不知道为什么会有这么多。
请帮我理解
(更新) 我很抱歉。是的,@Peter Cordes确实是对的——事实上,每次迭代需要9个周期。误会是我自己造成的。我错过了两段类似的代码(指令交换),13个周期的代码如下:
; 13-cycle variant: mulsd xmm1,xmm2 now comes FIRST, so mulsd xmm3,xmm1
; reads the freshly-updated xmm1 of THIS iteration -- the addsd then depends
; on both multiplies in series:
; loop-carried chain = mul(5c) + mul(5c) + add(3c) = 13c.
movsd xmm2, [x]        ; xmm2 = x
movsd xmm1, [one]      ; xmm1 = 1.0 (running power)
xorps xmm0, xmm0       ; xmm0 = 0.0 (unused accumulator)
mov eax, coeff         ; eax -> first coefficient
L1:
movsd xmm3, [eax]      ; load c[i] (off the critical path)
mulsd xmm1, xmm2       ; B: x^(i+1), depends on last iteration's xmm1
mulsd xmm3, xmm1       ; A: c[i] * x^(i+1) -- depends on B from THIS iteration
addsd xmm1, xmm3       ; C: depends on both A and B -> fully serialized chain
add eax, 8
cmp eax, coeff_end
jb L1
对于我来说,它在Core2 E6600上每9c运行一次迭代,预计:
movsd xmm3, [eax] ; independent, depends only on eax
A: mulsd xmm3, xmm1 ; 5c: depends on xmm1:C from last iteration
B: mulsd xmm1, xmm2 ; 5c: depends on xmm1:C from last iteration
C: addsd xmm1, xmm3 ; 3c: depends on xmm1:B from THIS iteration (and xmm3:A from this iteration)
当迭代 i 的 xmm1:C 准备就绪时,下一次迭代可以开始计算:
- A:在 5 个周期内产生 xmm3:A
- B:在 5 个周期内产生 xmm1:B(但存在资源冲突:在 Core2 或 IvyBridge 上,这两个乘法不能在同一个周期内开始;只有 Haswell 及更高版本可以)
这两者都必须在下一次迭代的 C 运行之前完成。因此循环携带的依赖链是 5+3 个周期,再加 1 个周期的资源冲突——它阻止两个乘法在同一个周期内开始。
以预期速度运行的测试代码: 当阵列为8B*128*1024时,这将降低到每11c迭代一次。如果您使用更大的阵列进行测试,而不是围绕您发布的内容使用重复循环,那么这就是为什么您会看到更高的延迟 如果负载延迟到达,CPU将无法“赶上”,因为它会延迟循环承载的依赖链。如果负载只在从循环承载链分叉的依赖链中需要,那么管道可以更容易地吸收偶尔的缓慢负载。因此,一些循环比其他循环对内存延迟更敏感
; Standalone NASM/x86-64 Linux test harness for the dependency-chain loop.
; Repeats the inner loop 10000 times over a 24K-qword array so the
; per-iteration cost can be measured with performance counters.
default REL                             ; RIP-relative addressing by default
%macro IACA_start 0                     ; IACA start marker (analysis only)
mov ebx, 111
db 0x64, 0x67, 0x90
%endmacro
%macro IACA_end 0                       ; IACA end marker
mov ebx, 222
db 0x64, 0x67, 0x90
%endmacro
global _start
_start:
movsd xmm2, [x]                         ; xmm2 = x
movsd xmm1, [one]                       ; xmm1 = 1.0 = x^0 (running power)
xorps xmm0, xmm0                        ; xmm0 = 0.0 (sum; not updated here)
mov ecx, 10000                          ; outer repeat count
outer_loop:
mov eax, coeff                          ; rewind coefficient pointer each pass
IACA_start ; outside the loop
ALIGN 32 ; this matters on Core2, .78 insn per cycle vs. 0.63 without
L1:
movsd xmm3, [eax]                       ; load c[i] (independent of FP chain)
mulsd xmm3, xmm1                        ; c[i] * x^i  (reads last iter's xmm1)
mulsd xmm1, xmm2                        ; x^(i+1)     (reads last iter's xmm1)
addsd xmm1, xmm3                        ; loop-carried chain: mul(5c)+add(3c)
add eax, 8                              ; next double
cmp eax, coeff_end                      ; end of array?
jb L1
IACA_end
dec ecx                                 ; next outer repetition
jnz outer_loop
;mov eax, 1
;int 0x80 ; exit() for 32bit code
xor edi, edi                            ; exit status = 0
mov eax, 231 ; exit_group(0). __NR_exit = 60.
syscall
section .data
x:                                      ; NOTE(review): x and one label the SAME
one: dq 1.0                             ; qword, so x == 1.0 and x^i stays 1.0
section .bss
coeff: resq 24*1024 ; 6 * L1 size. Doesn't run any faster when it fits in L1 (resb)
coeff_end:
实验测试
IACA 同意:在 IvB 上每次迭代 9 个周期(不计算 ALIGN 填充的 nop):
对于我来说,它在Core2 E6600上每9c运行一次迭代,预计:
movsd xmm3, [eax] ; independent, depends only on eax
A: mulsd xmm3, xmm1 ; 5c: depends on xmm1:C from last iteration
B: mulsd xmm1, xmm2 ; 5c: depends on xmm1:C from last iteration
C: addsd xmm1, xmm3 ; 3c: depends on xmm1:B from THIS iteration (and xmm3:A from this iteration)
当迭代 i 的 xmm1:C 准备就绪时,下一次迭代可以开始计算:
- A:在 5 个周期内产生 xmm3:A
- B:在 5 个周期内产生 xmm1:B(但存在资源冲突:在 Core2 或 IvyBridge 上,这两个乘法不能在同一个周期内开始;只有 Haswell 及更高版本可以)
这两者都必须在下一次迭代的 C 运行之前完成。因此循环携带的依赖链是 5+3 个周期,再加 1 个周期的资源冲突——它阻止两个乘法在同一个周期内开始。
以预期速度运行的测试代码: 当阵列为8B*128*1024时,这将降低到每11c迭代一次。如果您使用更大的阵列进行测试,而不是围绕您发布的内容使用重复循环,那么这就是为什么您会看到更高的延迟 如果负载延迟到达,CPU将无法“赶上”,因为它会延迟循环承载的依赖链。如果负载只在从循环承载链分叉的依赖链中需要,那么管道可以更容易地吸收偶尔的缓慢负载。因此,一些循环比其他循环对内存延迟更敏感
; Standalone NASM/x86-64 Linux test harness for the dependency-chain loop.
; Repeats the inner loop 10000 times over a 24K-qword array so the
; per-iteration cost can be measured with performance counters.
default REL                             ; RIP-relative addressing by default
%macro IACA_start 0                     ; IACA start marker (analysis only)
mov ebx, 111
db 0x64, 0x67, 0x90
%endmacro
%macro IACA_end 0                       ; IACA end marker
mov ebx, 222
db 0x64, 0x67, 0x90
%endmacro
global _start
_start:
movsd xmm2, [x]                         ; xmm2 = x
movsd xmm1, [one]                       ; xmm1 = 1.0 = x^0 (running power)
xorps xmm0, xmm0                        ; xmm0 = 0.0 (sum; not updated here)
mov ecx, 10000                          ; outer repeat count
outer_loop:
mov eax, coeff                          ; rewind coefficient pointer each pass
IACA_start ; outside the loop
ALIGN 32 ; this matters on Core2, .78 insn per cycle vs. 0.63 without
L1:
movsd xmm3, [eax]                       ; load c[i] (independent of FP chain)
mulsd xmm3, xmm1                        ; c[i] * x^i  (reads last iter's xmm1)
mulsd xmm1, xmm2                        ; x^(i+1)     (reads last iter's xmm1)
addsd xmm1, xmm3                        ; loop-carried chain: mul(5c)+add(3c)
add eax, 8                              ; next double
cmp eax, coeff_end                      ; end of array?
jb L1
IACA_end
dec ecx                                 ; next outer repetition
jnz outer_loop
;mov eax, 1
;int 0x80 ; exit() for 32bit code
xor edi, edi                            ; exit status = 0
mov eax, 231 ; exit_group(0). __NR_exit = 60.
syscall
section .data
x:                                      ; NOTE(review): x and one label the SAME
one: dq 1.0                             ; qword, so x == 1.0 and x^i stays 1.0
section .bss
coeff: resq 24*1024 ; 6 * L1 size. Doesn't run any faster when it fits in L1 (resb)
coeff_end:
实验测试
IACA 同意:在 IvB 上每次迭代 9 个周期(不计算 ALIGN 填充的 nop):
使用我在上面评论中建议的更改——把 addsd 改为 addsd xmm0, xmm3——就可以把代码改写为使用全宽(packed)寄存器,性能提高一倍。
大致思路:
ones 的初始值需要是:double ones[2] = { 1.0, x };
并且我们需要把 x 替换为 x2:double x2[2] = { x * x, x * x };
如果系数的个数为奇数,则补一个 0 使其成为偶数;
同时,把指针增量改为 16。
这是我得到的测试结果。我做了很多试验,选择了时间最好的试验,并通过100次迭代延长了试验时间。std是C版本,dbl是您的版本,qed是“宽”版本: 这是在i7 920@2.67 GHz上完成的 我认为,如果您获取经过的数字并将其转换,您将看到您的版本比您想象的要快
我提前为切换到AT&T语法而道歉,因为我很难让汇编程序以另一种方式工作。再一次,对不起。另外,我正在使用linux,所以我使用了
RDI 和 RSI
寄存器来传递系数指针。如果你在windows上,ABI是不同的,你必须为此进行调整
我做了一个C版本并将其分解。它实际上与您的代码完全相同,只是稍微重新排列了非xmm指令,我在下面添加了这些指令
我相信我发布了所有的文件,所以如果你愿意,你可以在你的系统上运行它
以下是原始代码:
# xmmloop/dbl.s -- implement using single double
.globl dbl
# dbl -- compute result using a single (scalar) double per iteration
#
# Evaluates sum of c[i]*x^i with a running power of x.
#
# arguments:
# rdi -- pointer to coeff vector
# rsi -- pointer to coeff vector end
# returns: xmm0 = polynomial sum (SysV: double returned in xmm0)
# note: add/cmp are scheduled right after the load; mulsd/addsd do not
# write RFLAGS, so the cmp result survives to the jb at the bottom
dbl:
	movsd	x(%rip),%xmm2		# get x value
	movsd	one(%rip),%xmm1		# get ones
	xorps	%xmm0,%xmm0		# sum = 0
dbl_loop:
	movsd	(%rdi),%xmm3		# c[i]
	add	$8,%rdi			# increment to next vector element
	cmp	%rsi,%rdi		# done yet?
	mulsd	%xmm1,%xmm3		# c[i]*x^i
	mulsd	%xmm2,%xmm1		# x^(i+1)
	addsd	%xmm3,%xmm0		# sum += c[i]*x^i
	jb	dbl_loop		# no, loop
	retq
下面是更改为使用
movapd
等的代码:
# xmmloop/qed.s -- implement using packed doubles (two per iteration)
.globl qed
# qed -- compute result using packed doubles
#
# Lane 0 accumulates even-index terms and lane 1 odd-index terms:
# xmm1 starts as [1, x] and is stepped by [x^2, x^2] each iteration,
# then the two lane sums are combined after the loop.
#
# arguments:
# rdi -- pointer to coeff vector (16-byte aligned, even element count)
# rsi -- pointer to coeff vector end
# returns: xmm0 = polynomial sum
qed:
	movapd	x2(%rip),%xmm2		# get x^2 value
	movapd	one(%rip),%xmm1		# get [1,x]
	xorpd	%xmm4,%xmm4		# sum = 0
qed_loop:
	movapd	(%rdi),%xmm3		# c[i]
	add	$16,%rdi		# increment to next coefficient
	cmp	%rsi,%rdi		# done yet?
	mulpd	%xmm1,%xmm3		# c[i]*x^i
	mulpd	%xmm2,%xmm1		# x^(i+2)
	addpd	%xmm3,%xmm4		# sum += c[i]*x^i
	jb	qed_loop		# no, loop
	movapd	%xmm4,rtn_loop(%rip)	# save intermediate DEBUG
	movapd	%xmm4,%xmm0		# get lower sum
	shufpd	$1,%xmm4,%xmm4		# get upper value into lower half
	movapd	%xmm4,rtn_shuf(%rip)	# save intermediate DEBUG
	addsd	%xmm4,%xmm0		# add upper sum to lower
	movapd	%xmm0,rtn_add(%rip)	# save intermediate DEBUG
	retq
下面是代码的C版本:
// xmmloop/std -- compute result using C code
#include <xmmloop.h>
// std -- compute result using C
//
// Evaluates the polynomial sum(c[i] * x^i) over the coefficients in
// [cur, ep), keeping a running power of x (not Horner's rule), so it
// mirrors the structure of the asm versions. Reads the globals x[] and
// one[] from xmmloop.h; one[0] is presumably 1.0 -- set by the driver.
//
// cur -- pointer to the first coefficient
// ep  -- pointer one past the last coefficient
double
std(const double *cur,const double *ep)
{
double xt;   // the variable x
double xn;   // running power x^i (starts at x^0)
double ci;   // current term c[i]*x^i
double sum;  // accumulator
xt = x[0];
xn = one[0];
sum = 0;
for (; cur < ep; ++cur) {
ci = *cur; // get c[i]
ci *= xn; // c[i]*x^i
xn *= xt; // x^(i+1)
sum += ci; // sum += c[i]*x^i
}
return sum;
}
(上面代码的中文注释版——原页面的机器翻译把标识符也翻译坏了,这里按原代码修复:)
// xmmloop/std —— 使用 C 代码计算结果
#include <xmmloop.h>
// std —— 使用 C 计算结果
double
std(const double *cur, const double *ep)
{
    double xt;
    double xn;
    double ci;
    double sum;
    xt = x[0];
    xn = one[0];
    sum = 0;
    for (; cur < ep; ++cur) {
        ci = *cur;   // 取 c[i]
        ci *= xn;    // c[i]*x^i
        xn *= xt;    // x^(i+1)
        sum += ci;   // sum += c[i]*x^i
    }
    return sum;
}
以下是我使用的测试程序:
// xmmloop/xmmloop -- test program
#define _XMMLOOP_GLO_
#include <xmmloop.h>
// tvget -- read CLOCK_REALTIME as a double: whole seconds plus the
// nanosecond field folded into the fraction (high-precision timestamp)
double
tvget(void)
{
    struct timespec now;

    clock_gettime(CLOCK_REALTIME, &now);
    return (double) now.tv_nsec / 1e9 + (double) now.tv_sec;
}
// timeit -- benchmark proc and report its best time.
// Runs proc opt_I times per trial, for opt_T trials, keeping the minimum
// elapsed wall-clock time (best-of-N filters out scheduling noise), then
// prints the last result together with that best time.
void
timeit(fnc_p proc,double *cofptr,double *cofend,const char *tag)
{
    double best = 1e9;      // sentinel: any real timing beats this
    double result = 0;      // last value returned by proc

    for (int trial = 1; trial <= opt_T; ++trial) {
        double start = tvget();
        for (int rep = 1; rep <= opt_I; ++rep)
            result = proc(cofptr,cofend);
        double elapsed = tvget() - start;
        if (elapsed < best)
            best = elapsed;
    }

    printf("%s: %.15e (ELAP: %.9f)\n",tag,result,best);
}
// main -- main program
//
// Usage: xmmloop [-C<count>] [-R<seed>] [-T<trials>] [-I<iters>]
//   -C  number of coefficients (default 100; rounded up to even)
//   -R  random seed (default: time(NULL); printed for reproducibility)
//   -T  timing trials (default 100)
//   -I  iterations per trial (default 100)
// Fills a random coefficient array, initializes the x/one/x2 globals the
// std/dbl/qed kernels read, times all three, and dumps qed's debug slots.
int
main(int argc,char **argv)
{
char *cp;
double *cofptr;   // coefficient array base
double *cofend;   // one past the last coefficient
double *cur;
double val;
long rseed;       // random seed (0 = not given on command line)
int cnt;          // coefficient count (0 = not given)
--argc;           // skip argv[0]
++argv;
rseed = 0;
cnt = 0;
for (; argc > 0; --argc, ++argv) {  // parse -X<value> style options
cp = *argv;
if (*cp != '-')
break;
switch (cp[1]) {
case 'C':  // coefficient count
cp += 2;
cnt = strtol(cp,&cp,10);
break;
case 'R':  // random seed
cp += 2;
rseed = strtol(cp,&cp,10);
break;
case 'T':  // number of timing trials
cp += 2;
opt_T = (*cp != 0) ? strtol(cp,&cp,10) : 1;
break;
case 'I':  // iterations per trial
cp += 2;
opt_I = (*cp != 0) ? strtol(cp,&cp,10) : 1;
break;
}
}
if (rseed == 0)  // default seed: current time
rseed = time(NULL);
srand48(rseed);
printf("R=%ld\n",rseed);
if (cnt == 0)
cnt = 100;
if (cnt & 1)  // qed consumes coefficients in pairs -> force even count
++cnt;
printf("C=%d\n",cnt);
if (opt_T == 0)
opt_T = 100;
printf("T=%d\n",opt_T);
if (opt_I == 0)
opt_I = 100;
printf("I=%d\n",opt_I);
cofptr = malloc(sizeof(double) * cnt);
cofend = &cofptr[cnt];
val = drand48();
for (; val < 3; val += 1.0);  // bump x into [3,4) -- presumably to avoid a tiny x; TODO confirm
x[0] = val;       // [x, x] for the scalar kernels
x[1] = val;
DMP(x);
one[0] = 1.0;     // [1, x]: starting powers of qed's two lanes
one[1] = val;
DMP(one);
val *= val;       // x^2: per-lane step for qed
x2[0] = val;
x2[1] = val;
DMP(x2);
for (cur = cofptr; cur < cofend; ++cur) {  // random coefficients in [0, 1e3)
val = drand48();
val *= 1e3;
*cur = val;
}
timeit(std,cofptr,cofend,"std");  // C reference
timeit(dbl,cofptr,cofend,"dbl");  // scalar asm
timeit(qed,cofptr,cofend,"qed");  // packed (SIMD) asm
DMP(rtn_loop);
DMP(rtn_shuf);
DMP(rtn_add);
return 0;
}
(测试程序的中文注释版——原页面的机器翻译把标识符也翻译坏了,这里按原代码修复;此处在 timeit 开头被截断:)
// xmmloop/xmmloop —— 测试程序
#define _XMMLOOP_GLO_
#include <xmmloop.h>
// tvget —— 获取高精度时间
double
tvget(void)
{
    struct timespec ts;
    double sec;
    clock_gettime(CLOCK_REALTIME, &ts);
    sec = ts.tv_nsec;
    sec /= 1e9;
    sec += ts.tv_sec;
    return sec;
}
// timeit —— 获得最佳时间
void
timeit(fnc_p proc, double *cofptr, double *cofend, const char *tag)
{
    double tvbest;
    double tvbeg;
// xmmloop/std -- compute result using C code
#include <xmmloop.h>
// std -- compute result using C
//
// Evaluates the polynomial sum(c[i] * x^i) over the coefficients in
// [cur, ep), keeping a running power of x (not Horner's rule), so it
// mirrors the structure of the asm versions. Reads the globals x[] and
// one[] from xmmloop.h; one[0] is presumably 1.0 -- set by the driver.
//
// cur -- pointer to the first coefficient
// ep  -- pointer one past the last coefficient
double
std(const double *cur,const double *ep)
{
double xt;   // the variable x
double xn;   // running power x^i (starts at x^0)
double ci;   // current term c[i]*x^i
double sum;  // accumulator
xt = x[0];
xn = one[0];
sum = 0;
for (; cur < ep; ++cur) {
ci = *cur; // get c[i]
ci *= xn; // c[i]*x^i
xn *= xt; // x^(i+1)
sum += ci; // sum += c[i]*x^i
}
return sum;
}
// xmmloop/xmmloop -- test program
#define _XMMLOOP_GLO_
#include <xmmloop.h>
// tvget -- get high precision time
// Returns CLOCK_REALTIME as a double: whole seconds plus the nanosecond
// field folded into the fraction.
double
tvget(void)
{
struct timespec ts;
double sec;
clock_gettime(CLOCK_REALTIME,&ts);
sec = ts.tv_nsec;  // nanoseconds first ...
sec /= 1e9;        // ... scaled to seconds ...
sec += ts.tv_sec;  // ... plus whole seconds
return sec;
}
// timeit -- get best time
// Runs proc opt_I times per trial for opt_T trials; keeps the minimum
// elapsed time (best-of-N filters out scheduling noise) and prints it
// together with the last result so the work cannot be silently skipped.
void
timeit(fnc_p proc,double *cofptr,double *cofend,const char *tag)
{
double tvbest;  // minimum elapsed time over all trials
double tvbeg;   // trial start timestamp
double tvdif;   // elapsed time of current trial
double sum;     // last value returned by proc
sum = 0;
tvbest = 1e9;   // sentinel: any real timing beats this
for (int trycnt = 1; trycnt <= opt_T; ++trycnt) {
tvbeg = tvget();
for (int iter = 1; iter <= opt_I; ++iter)
sum = proc(cofptr,cofend);
tvdif = tvget();
tvdif -= tvbeg;
if (tvdif < tvbest)
tvbest = tvdif;
}
printf("%s: %.15e (ELAP: %.9f)\n",tag,sum,tvbest);
}
// main -- main program
//
// Usage: xmmloop [-C<count>] [-R<seed>] [-T<trials>] [-I<iters>]
//   -C  number of coefficients (default 100; rounded up to even)
//   -R  random seed (default: time(NULL); printed for reproducibility)
//   -T  timing trials (default 100)
//   -I  iterations per trial (default 100)
// Fills a random coefficient array, initializes the x/one/x2 globals the
// std/dbl/qed kernels read, times all three, and dumps qed's debug slots.
int
main(int argc,char **argv)
{
char *cp;
double *cofptr;   // coefficient array base
double *cofend;   // one past the last coefficient
double *cur;
double val;
long rseed;       // random seed (0 = not given on command line)
int cnt;          // coefficient count (0 = not given)
--argc;           // skip argv[0]
++argv;
rseed = 0;
cnt = 0;
for (; argc > 0; --argc, ++argv) {  // parse -X<value> style options
cp = *argv;
if (*cp != '-')
break;
switch (cp[1]) {
case 'C':  // coefficient count
cp += 2;
cnt = strtol(cp,&cp,10);
break;
case 'R':  // random seed
cp += 2;
rseed = strtol(cp,&cp,10);
break;
case 'T':  // number of timing trials
cp += 2;
opt_T = (*cp != 0) ? strtol(cp,&cp,10) : 1;
break;
case 'I':  // iterations per trial
cp += 2;
opt_I = (*cp != 0) ? strtol(cp,&cp,10) : 1;
break;
}
}
if (rseed == 0)  // default seed: current time
rseed = time(NULL);
srand48(rseed);
printf("R=%ld\n",rseed);
if (cnt == 0)
cnt = 100;
if (cnt & 1)  // qed consumes coefficients in pairs -> force even count
++cnt;
printf("C=%d\n",cnt);
if (opt_T == 0)
opt_T = 100;
printf("T=%d\n",opt_T);
if (opt_I == 0)
opt_I = 100;
printf("I=%d\n",opt_I);
cofptr = malloc(sizeof(double) * cnt);
cofend = &cofptr[cnt];
val = drand48();
for (; val < 3; val += 1.0);  // bump x into [3,4) -- presumably to avoid a tiny x; TODO confirm
x[0] = val;       // [x, x] for the scalar kernels
x[1] = val;
DMP(x);
one[0] = 1.0;     // [1, x]: starting powers of qed's two lanes
one[1] = val;
DMP(one);
val *= val;       // x^2: per-lane step for qed
x2[0] = val;
x2[1] = val;
DMP(x2);
for (cur = cofptr; cur < cofend; ++cur) {  // random coefficients in [0, 1e3)
val = drand48();
val *= 1e3;
*cur = val;
}
timeit(std,cofptr,cofend,"std");  // C reference
timeit(dbl,cofptr,cofend,"dbl");  // scalar asm
timeit(qed,cofptr,cofend,"qed");  // packed (SIMD) asm
DMP(rtn_loop);
DMP(rtn_shuf);
DMP(rtn_add);
return 0;
}
// xmmloop/xmmloop.h -- common control
#ifndef _xmmloop_xmmloop_h_
#define _xmmloop_xmmloop_h_
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Exactly one translation unit (the test program) defines _XMMLOOP_GLO_,
// so the globals below are defined there and 'extern' everywhere else.
#ifdef _XMMLOOP_GLO_
#define EXTRN_XMMLOOP /**/
#else
#define EXTRN_XMMLOOP extern
#endif
// movapd in qed requires 16-byte-aligned operands
#define XMMALIGN __attribute__((aligned(16)))
EXTRN_XMMLOOP int opt_T;                     // number of timing trials
EXTRN_XMMLOOP int opt_I;                     // iterations per trial
EXTRN_XMMLOOP double x[2] XMMALIGN;          // [x, x]
EXTRN_XMMLOOP double x2[2] XMMALIGN;         // [x*x, x*x] -- qed's per-lane step
EXTRN_XMMLOOP double one[2] XMMALIGN;        // [1.0, x] -- qed's starting powers
EXTRN_XMMLOOP double rtn_loop[2] XMMALIGN;   // qed debug: raw vector sum
EXTRN_XMMLOOP double rtn_shuf[2] XMMALIGN;   // qed debug: after shufpd
EXTRN_XMMLOOP double rtn_add[2] XMMALIGN;    // qed debug: final combined sum
// DMP -- print both lanes of a double[2] global
#define DMP(_sym) \
printf(#_sym ": %.15e %.15e\n",_sym[0],_sym[1]);
typedef double (*fnc_p)(const double *cofptr,const double *cofend);
double std(const double *cofptr,const double *cofend);  // C reference
double dbl(const double *cofptr,const double *cofend);  // scalar asm
double qed(const double *cofptr,const double *cofend);  // packed asm
#endif