Arrays 如何优化循环中的嵌套索引
标签:arrays, nested
我在C中有一个非常简单的循环:
/* Indexed update: add d * value[i] to the beta element selected by index[i]. */
for (i=0; i < len; ++i) {
beta[index[i]] += d * value[i];
}
for (i=0; i < len; ++i) {
beta[index[i]] += d * value[i];
}
在这个循环中,beta和value是double数组,而index是整数数组。beta本身可以是一个很长的数组(可能有数百万个元素),但len通常要短得多,比如说,是beta长度的5%。当然,所有数组都是相互独立的。我们还可以假设index中没有两个条目是相同的。让我烦恼的是,无论我做什么,似乎都无济于事。到目前为止,我已经尝试过使用restrict关键字、指定#pragma ivdep、手动展开、预取(尽管最后两项我可能没有用上合适的展开因子/预取提前量),甚至尝试使用MKL先收集(gather)要更新的值,用daxpy进行更新,然后再分散(scatter)写回结果
有什么建议可以这样做,使循环尽可能快?我的平台是英特尔linux
谢谢,
--拉西
以下是完整的代码:
#include <stdlib.h>
#include <math.h>
/*
 * Test driver for the indexed update loop.
 *
 * Usage: a.out <betasize> <len>
 *   betasize -- number of elements in beta (must be > 0)
 *   len      -- number of distinct beta entries to update; must not exceed
 *               betasize, otherwise the random picker below can never find
 *               enough free slots and would loop forever
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on missing/invalid arguments or
 * when an allocation fails.
 */
int main(int argc, char *argv[])
{
    if (argc < 3)
        return EXIT_FAILURE;

    int betasize = atoi(argv[1]);
    int len = atoi(argv[2]);

    if (betasize <= 0 || len < 0 || len > betasize)
        return EXIT_FAILURE;

    double *beta = calloc(betasize, sizeof(double));
    double *value = malloc(len*sizeof(double));
    int *index = malloc(len*sizeof(int));
    int i;
    const double d = 2.5;

    /* malloc(0) may legitimately return NULL, so only insist when len > 0 */
    if (beta == NULL || (len > 0 && (value == NULL || index == NULL))) {
        free(index);
        free(value);
        free(beta);
        return EXIT_FAILURE;
    }

    /* randomly pick len entries; beta doubles as the "already taken" marker */
    for (i=0; i < len; ++i) {
        while (1) {
            const int ind = floor(drand48()*betasize);
            if (beta[ind] == 0) {
                value[i] = drand48();
                index[i] = ind;
                beta[ind] = 1;
                break;
            }
        }
    }

    /* Now the loop to be optimized */
    for (i=0; i < len; ++i) {
        beta[index[i]] += d * value[i];
    }

    free(index);
    free(value);
    free(beta);

    return EXIT_SUCCESS;
}
#包括
#包括
int main(int argc,char*argv[])
{
int betasize=atoi(argv[1]);
int len=atoi(argv[2]);
double*beta=calloc(betasize,sizeof(double));
double*value=malloc(len*sizeof(double));
int*index=malloc(len*sizeof(int));
int i;
常数双d=2.5;
/*随机选取len条目*/
对于(i=0;i
在要优化的循环两侧放置您最喜欢的计时代码,并以“a.out 100000000 1000000”的方式运行。您也可以使用更小的数组,只是那样更难计时。另外请注意,随着第二个数字越来越接近第一个数字,随机索引的生成速度将会变慢 :-)……但在我的用例中,第二个数字通常不超过第一个数字的1%,很少超过5%。
好的,我已经对您的循环运行了基准测试。我把它移到一个单独的函数中,并创建了10个类似的函数,每个函数都有细微的变化。您的原始版本是alg_0a——即参考版本。其他版本是去掉某些东西的各种组合(例如,使用固定数字代替value[i])。其思路是:当您删除某些内容[对于真正的算法来说仍然是必需的]并且删除后性能显著提高时,被删除的内容就是性能瓶颈[“热点”]。要查看的算法是alg_0*。查看每个函数中的循环,看看省略了什么
在alg_1*中,我试图通过将index和value数组组合成一个结构来获得更好的缓存性能/局部性。这实际上没有什么影响,因为index/value是按顺序访问的,所以它们具有良好的缓存性能[即使是分开存放]并且结构无法在这方面改进。因此,您可以跳过alg_1*的数据
表现最差的[因为它做的工作最多]是您的原始alg_0a。稍微好一点的是alg_0e,它从index和value数组中读取,但按顺序存储到beta数组中。更好的是alg_0d,它做的和alg_0e一样,只是它存储到beta中的固定索引处
唯一真正重要的是写入beta时的访问模式。由于index数组中存放的是用于访问beta的随机索引,这会导致beta数组的缓存性能较差
index数组中的实际测试数据[随机]扭曲了结果[和/或使结果无效]。在实际程序中,如果数组中确实包含半随机索引,那它就是所谓的“热点”
是否有更好的模型来生成更具代表性且可能对缓存更友好的index数组?也就是说,真正的访问可能更像 1,2,3,4 … 99,100,101,102 … 6000,6001,6002 …
如果不知道value和index是如何创建的以及何时创建的,就很难进行推测。value和index是否总是同时创建?
从某种意义上说,遍历index/value的循环构成了一个应用于beta的更新“时间表”。如果可以按照beta索引对该时间表进行排序,则可能会得到更连续的beta访问
beta数组比index/value大。beta的不同实现是否会更好(例如链表或稀疏矩阵)?也就是说,我们希望尽可能使对beta的访问对缓存友好
通常,我可能建议使用openmp来并行化此操作。但是,从我运行的测试来看,这没有什么帮助,因为访问模式是随机的,而且测试似乎显示循环是受内存/缓存限制的,而不是受CPU/计算限制的
现在我想起来了,这看起来像是散点图或蒙特卡罗算法
“颠倒”这个问题可能是有益的。下面是一个片段:
/* Inverted form: walk beta in order and rewrite only the entries that need it.
   beta_needs_change() and beta_new_value() are placeholders, not defined here. */
for (betaidx = 0; betaidx < betasize; ++betaidx) {
if (beta_needs_change(betaidx))
beta[betaidx] = beta_new_value(betaidx);
}
现在,beta必须以完全不同的方式读取/使用。`value`可能必须以不同的方式构建。但是,如果您可以将占程序CPU使用率40%的部分提速6倍,那么为此重新编码可能是值得的。作为一个积极的副作用,程序的其余部分也可能从重组中受益
这是m
/* Sequential form: beta[idx] is updated from value[idx], with no index array. */
for (idx = 0; idx < len; ++idx)
beta[idx] += d * value[idx];
#include <stdlib.h>
#include <math.h>
#define _BETALOOP_GLO_
#include <ovrlib/ovrlib.h>
#include <betaloop/bncdef.h>
// FREEME -- free the memory held by a pointer lvalue and reset it to NULL
// _ptr must be a modifiable lvalue; it is evaluated twice, so do not pass an
// expression with side effects. free(NULL) is a no-op, so no NULL guard is
// needed. Wrapped in do/while(0) so it behaves as a single statement.
#define FREEME(_ptr) \
    do { \
        free(_ptr); \
        (_ptr) = NULL; \
    } while (0)
// betaidx_t -- position within (or size of) the beta array
typedef int betaidx_t;
// validx_t -- position within (or length of) the value/index/pair arrays
typedef int validx_t;
// pair_t -- one update record holding value and target together (used by alg_1*)
typedef struct {
double value; // value to be scaled and added
betaidx_t index; // beta position this record updates
double *beta; // direct pointer to the same beta element (set to &beta[index])
} pair_t;
// ctl_t -- all data for one benchmark run (filled by datagen_*, freed by datarls_0)
typedef struct {
betaidx_t betasize; // range value for beta
validx_t vlen; // length of value array
double *beta; // destination array, betasize elements
double *value; // source values, vlen elements
betaidx_t *index; // beta position for each value, vlen elements
pair_t *pair; // value/index records, vlen elements (only datagen_0b allocates this)
} ctl_t;
// datagen_0a -- randomly pick len entries
// Resets *ctl, allocates beta (zeroed), value and index, then fills value/index
// with len random values aimed at len distinct beta positions. beta itself is
// the "already taken" marker: a chosen position is set to 1. ctl->pair is not
// allocated here (it stays zeroed). Only the pick loop is inside the timed
// region. Never terminates if len > betasize.
void
datagen_0a(ctl_t *ctl,int betasize,int len)
{
    double *betabase;
    double *valbase;
    betaidx_t *idxbase;
    validx_t slot;
    betaidx_t pick;

    memset(ctl,0,sizeof(ctl_t));
    ctl->betasize = betasize;
    ctl->vlen = len;

    betabase = calloc(betasize,sizeof(double));
    ctl->beta = betabase;

    valbase = malloc(len * sizeof(double));
    ctl->value = valbase;

    idxbase = malloc(len * sizeof(int));
    ctl->index = idxbase;

    BNCBEG(datagen_0a);

    slot = 0;
    while (slot < len) {
        pick = floor(drand48() * betasize);
        pick %= betasize;

        // position already used -- draw again
        if (betabase[pick] != 0)
            continue;

        valbase[slot] = drand48();
        idxbase[slot] = pick;
        betabase[pick] = 1;
        ++slot;
    }

    BNCEND(datagen_0a);
}
// datagen_0b -- randomly pick len entries
// Resets *ctl, allocates beta (zeroed), value, index and pair, then fills them
// with len random values aimed at len distinct beta positions. Positions
// already taken are tracked in a separate byte array through the BTV* macros
// (defined elsewhere -- presumably a bit vector; confirm against the header).
// value[i] and pair[i].value hold the SAME number, so the alg_0* and alg_1*
// kernels operate on identical data.
// Only the pick loop is inside the timed region. Never terminates if
// len > betasize.
// NOTE(review): apart from the pair NULL test, allocation results are not checked.
void
datagen_0b(ctl_t *ctl,betaidx_t betasize,int len)
{
    double *beta;
    double *value;
    double curval;
    betaidx_t *index;
    byte *btv;
    pair_t *pair;
    validx_t validx;
    betaidx_t betaidx;

    memset(ctl,0,sizeof(ctl_t));
    ctl->betasize = betasize;
    ctl->vlen = len;

    beta = calloc(betasize,sizeof(double));
    ctl->beta = beta;

    value = malloc(len * sizeof(double));
    ctl->value = value;

    index = malloc(len * sizeof(*index));
    ctl->index = index;

    pair = malloc(len * sizeof(pair_t));
    ctl->pair = pair;

    btv = calloc(BTVSIZE(betasize),sizeof(byte));

    BNCBEG(datagen_0b);

    for (validx = 0; validx < len; ++validx) {
        while (1) {
            betaidx = floor(drand48() * betasize);
            betaidx %= betasize;

            if (! BTVTST(btv,betaidx)) {
                BTVSET(btv,betaidx);

                // one draw feeds both layouts (was two different draws)
                curval = drand48();
                value[validx] = curval;
                index[validx] = betaidx;

                if (pair != NULL) {
                    pair[validx].value = curval;
                    pair[validx].index = betaidx;
                    pair[validx].beta = &beta[betaidx];
                }

                beta[betaidx] = 1;
                break;
            }
        }
    }

    BNCEND(datagen_0b);

    free(btv);
}
// datarls_0 -- release allocated memory
// Frees every array owned by *ctl and clears the pointers, so a second call
// is harmless.
void
datarls_0(ctl_t *ctl)
{
    free(ctl->beta);
    ctl->beta = NULL;

    free(ctl->value);
    ctl->value = NULL;

    free(ctl->index);
    ctl->index = NULL;

    free(ctl->pair);
    ctl->pair = NULL;
}
// fixed_index -- get fixed beta index
// Draws random positions until one is found whose partner with the low bit
// set is still inside beta; callers flip the low bit (index ^= 1) on every
// iteration, so both positions must be valid.
betaidx_t
fixed_index(ctl_t *ctl)
{
    betaidx_t cand;

    do {
        cand = floor(drand48() * ctl->betasize);
        cand %= ctl->betasize;
    } while ((cand | 1) >= ctl->betasize);

    return cand;
}
// alg_0a -- Now the loop to be optimized
// Reference kernel: reads index and value in order, updates beta at the
// position given by index.
void
alg_0a(ctl_t *ctl)
{
    const double d = 2.5;
    double *dst;
    double *src;
    betaidx_t *map;
    validx_t cnt;
    validx_t i;

    BNCBEG(alg_0a);

    dst = ctl->beta;
    src = ctl->value;
    map = ctl->index;
    cnt = ctl->vlen;

    for (i = 0; i < cnt; ++i) {
        dst[map[i]] += d * src[i];
    }

    BNCEND(alg_0a);
}
// alg_0b -- null destination
// Reads value in order but accumulates into a local instead of the beta
// array; neither index nor beta is touched. Returns the accumulated sum.
double
alg_0b(ctl_t *ctl)
{
    const double d = 2.5;
    double acc;
    double *src;
    validx_t cnt;
    validx_t i;

    BNCBEG(alg_0b -- betanull);

    acc = 0.0;
    src = ctl->value;
    cnt = ctl->vlen;

    for (i = 0; i < cnt; ++i) {
        acc += d * src[i];
    }

    BNCEND(alg_0b);

    return acc;
}
// alg_0c -- fixed destination
// Reads value in order; stores alternate between two adjacent beta positions
// (the low bit of the position is flipped after every store). The index
// array is not read. The position is chosen before the timed region starts.
void
alg_0c(ctl_t *ctl)
{
    const double d = 2.5;
    double *dst;
    double *src;
    betaidx_t spot;
    validx_t cnt;
    validx_t i;

    spot = fixed_index(ctl);

    BNCBEG(alg_0c -- betafixed);

    dst = ctl->beta;
    src = ctl->value;
    cnt = ctl->vlen;

    for (i = 0; i < cnt; ++i) {
        dst[spot] += d * src[i];
        spot ^= 1;
    }

    BNCEND(alg_0c);
}
// alg_0d -- fixed destination with index array fetch
// Like alg_0c (stores alternate between two adjacent beta positions), but the
// index array is also read on every iteration; its entries are summed only so
// that the fetch has a use.
// Returns the index sum reduced modulo the width of unsigned int. The sum is
// kept unsigned because adding this many indices overflows int, and signed
// overflow is undefined behaviour.
betaidx_t
alg_0d(ctl_t *ctl)
{
    double *beta;
    double *value;
    betaidx_t *idxptr;
    betaidx_t index;
    validx_t validx;
    validx_t len;
    const double d = 2.5;
    unsigned int totidx;

    index = fixed_index(ctl);

    BNCBEG(alg_0d -- beta_fixed_index);

    beta = ctl->beta;
    value = ctl->value;
    idxptr = ctl->index;
    len = ctl->vlen;
    totidx = 0;

    for (validx = 0; validx < len; ++validx, index ^= 1) {
        totidx += (unsigned int) idxptr[validx];
        beta[index] += d * value[validx];
    }

    BNCEND(alg_0d);

    return (betaidx_t) totidx;
}
// alg_0e -- sequential destination with index array fetch
// Reads index and value in order, but stores go to consecutive beta positions
// (0, 1, 2, ... wrapping at betasize) instead of the positions named by
// index; the index entries are summed only so that the fetch has a use.
// Returns the index sum reduced modulo the width of unsigned int. The sum is
// kept unsigned because adding this many indices overflows int, and signed
// overflow is undefined behaviour.
betaidx_t
alg_0e(ctl_t *ctl)
{
    double *beta;
    double *value;
    betaidx_t *idxptr;
    betaidx_t index;
    validx_t validx;
    validx_t len;
    const double d = 2.5;
    unsigned int totidx;

    BNCBEG(alg_0e -- beta_seq_index);

    index = 0;
    beta = ctl->beta;
    value = ctl->value;
    idxptr = ctl->index;
    len = ctl->vlen;
    totidx = 0;

    for (validx = 0; validx < len; ++validx) {
        totidx += (unsigned int) idxptr[validx];
        beta[index] += d * value[validx];
        index = (index + 1) % ctl->betasize;
    }

    BNCEND(alg_0e);

    return (betaidx_t) totidx;
}
// alg_0f -- null source
// Updates beta at the positions named by index, but adds the same scalar
// every time; the value array is not read. The scalar is drawn before the
// timed region starts.
void
alg_0f(ctl_t *ctl)
{
    const double d = 2.5;
    double *dst;
    double addend;
    betaidx_t *map;
    validx_t cnt;
    validx_t i;

    addend = drand48();

    BNCBEG(alg_0f -- nullsrc);

    dst = ctl->beta;
    map = ctl->index;
    cnt = ctl->vlen;

    for (i = 0; i < cnt; ++i) {
        dst[map[i]] += d * addend;
    }

    BNCEND(alg_0f);
}
// alg_1a -- use pair struct with index
// Same update as alg_0a, but index and value are read from one pair_t record
// instead of two separate arrays. The loop is bounded by a counter while the
// record pointer advances alongside it.
void
alg_1a(ctl_t *ctl)
{
double *beta;
validx_t validx;
validx_t len;
const pair_t *pair;
const double d = 2.5;
BNCBEG(alg_1a -- pair);
beta = ctl->beta;
len = ctl->vlen;
pair = ctl->pair;
for (validx = 0; validx < len; ++validx, ++pair)
beta[pair->index] += d * pair->value;
BNCEND(alg_1a);
}
// alg_1b -- use pair struct with epair
// Same update as alg_1a, but the loop is bounded by an end pointer (epair)
// instead of a separate counter.
void
alg_1b(ctl_t *ctl)
{
double *beta;
const pair_t *pair;
const pair_t *epair;
const double d = 2.5;
BNCBEG(alg_1b -- epair);
beta = ctl->beta;
pair = ctl->pair;
// one past the last record
epair = pair + ctl->vlen;
for (; pair < epair; ++pair)
beta[pair->index] += d * pair->value;
BNCEND(alg_1b);
}
// alg_1c -- use pair struct, epair, and beta pointer
// Same as alg_1b, but the destination is reached through the pointer stored
// in each record (pair->beta) rather than by indexing the beta array.
void
alg_1c(ctl_t *ctl)
{
const pair_t *pair;
const pair_t *epair;
const double d = 2.5;
BNCBEG(alg_1c -- betap);
pair = ctl->pair;
// one past the last record
epair = pair + ctl->vlen;
for (; pair < epair; ++pair)
*pair->beta += d * pair->value;
BNCEND(alg_1c);
}
// alg_1d -- fixed destination with index array fetch
// pair_t counterpart of alg_0d: stores alternate between two adjacent beta
// positions, while each record's index is still read and summed so that the
// fetch has a use.
// Returns the index sum reduced modulo the width of unsigned int. The sum is
// kept unsigned because adding this many indices overflows int, and signed
// overflow is undefined behaviour.
betaidx_t
alg_1d(ctl_t *ctl)
{
    double *beta;
    const pair_t *pair;
    const pair_t *epair;
    const double d = 2.5;
    betaidx_t index;
    unsigned int totidx;

    index = fixed_index(ctl);

    BNCBEG(alg_1d -- beta_fixed_index);

    beta = ctl->beta;
    pair = ctl->pair;
    epair = pair + ctl->vlen;
    totidx = 0;

    for (; pair < epair; ++pair, index ^= 1) {
        totidx += (unsigned int) pair->index;
        beta[index] += d * pair->value;
    }

    BNCEND(alg_1d);

    return (betaidx_t) totidx;
}
// alg_1e -- sequential destination with index array fetch
// pair_t counterpart of alg_0e: stores go to consecutive beta positions
// (wrapping at betasize), while each record's index is still read and summed
// so that the fetch has a use.
// Returns the index sum reduced modulo the width of unsigned int. The sum is
// kept unsigned because adding this many indices overflows int, and signed
// overflow is undefined behaviour.
betaidx_t
alg_1e(ctl_t *ctl)
{
    double *beta;
    const pair_t *pair;
    const pair_t *epair;
    const double d = 2.5;
    betaidx_t index;
    unsigned int totidx;

    BNCBEG(alg_1e -- beta_seq_index);

    beta = ctl->beta;
    pair = ctl->pair;
    epair = pair + ctl->vlen;
    totidx = 0;
    index = 0;

    for (; pair < epair; ++pair) {
        totidx += (unsigned int) pair->index;
        beta[index] += d * pair->value;
        index = (index + 1) % ctl->betasize;
    }

    BNCEND(alg_1e);

    return (betaidx_t) totidx;
}
// dotest -- do test
// Generates one data set of the given size, runs every kernel three times
// over it, releases the data and dumps the collected timings.
void
dotest(int betasize,int len)
{
    ctl_t ctl;
    int pass;

    printf("\n");
    printf("dotest: %d %d\n",betasize,len);

    BNCBEG(dotest);

    // datagen_0b is the active generator; datagen_0a is kept as the alternative
#if 1
    datagen_0b(&ctl,betasize,len);
#else
    datagen_0a(&ctl,betasize,len);
#endif

    for (pass = 0; pass < 3; ++pass) {
        alg_0a(&ctl);
        alg_0b(&ctl);
        alg_0c(&ctl);
        alg_0d(&ctl);
        alg_0e(&ctl);
        alg_0f(&ctl);

        alg_1a(&ctl);
        alg_1b(&ctl);
        alg_1c(&ctl);
        alg_1d(&ctl);
        alg_1e(&ctl);
    }

    datarls_0(&ctl);

    BNCEND(dotest);

    bncdmpa("dotest",1);
}
// main -- main program
// Runs the benchmark at three sizes; in each run len is 1% of betasize.
// Command-line arguments are ignored.
int
main(int argc,char **argv)
{
    static const struct {
        int betasize;
        int len;
    } runs[] = {
        { 100000000, 1000000 },
        { 500000000, 5000000 },
        { 1000000000, 10000000 },
    };
    unsigned int runidx;

    (void) argc;
    (void) argv;

    bncatt(betaloop_bnc);

    for (runidx = 0; runidx < sizeof(runs) / sizeof(runs[0]); ++runidx)
        dotest(runs[runidx].betasize,runs[runidx].len);

    return 0;
}
17:39:35.550606012 NEWDAY 11/18/15
17:39:35.550606012 ph: starting 13162 ...
17:39:35.551221132 ph: ARGV ovrgo ...
17:39:36 ovrgo: SDIR /home/cae/preserve/ovrbnc/betaloop
17:39:37 ovrgo: /home/cae/preserve/ovrstk/gen/betaloop/betaloop
bnctst: BEST bncmin=21 bncmax=-1 skipcnt=1000
bnctst: AVG tot=0.000000000
bnctst: SKP tot=0.000023607 avg=0.000000023 cnt=1,000
bnctst: BNCDMP min=0.000000000 max=0.000000000
dotest: 100000000 1000000
dotest: BNCDMP alg_0a tot=0.087135098 avg=0.029045032 cnt=3
dotest: BNCDMP min=0.028753797 max=0.029378731
dotest: BNCDMP alg_0b -- betanull tot=0.003669541 avg=0.001223180 cnt=3
dotest: BNCDMP min=0.001210105 max=0.001242112
dotest: BNCDMP alg_0c -- betafixed tot=0.005472318 avg=0.001824106 cnt=3
dotest: BNCDMP min=0.001815115 max=0.001830939
dotest: BNCDMP alg_0d -- beta_fixed_index tot=0.005654055 avg=0.001884685 cnt=3
dotest: BNCDMP min=0.001883760 max=0.001885919
dotest: BNCDMP alg_0e -- beta_seq_index tot=0.025247095 avg=0.008415698 cnt=3
dotest: BNCDMP min=0.008410631 max=0.008423921
dotest: BNCDMP alg_0f -- nullsrc tot=0.085769224 avg=0.028589741 cnt=3
dotest: BNCDMP min=0.028477846 max=0.028683057
dotest: BNCDMP alg_1a -- pair tot=0.090740003 avg=0.030246667 cnt=3
dotest: BNCDMP min=0.030003776 max=0.030385588
dotest: BNCDMP alg_1b -- epair tot=0.093591309 avg=0.031197103 cnt=3
dotest: BNCDMP min=0.030324733 max=0.032524565
dotest: BNCDMP alg_1c -- betap tot=0.091931228 avg=0.030643742 cnt=3
dotest: BNCDMP min=0.030357306 max=0.031191412
dotest: BNCDMP alg_1d -- beta_fixed_index tot=0.007939126 avg=0.002646375 cnt=3
dotest: BNCDMP min=0.002508210 max=0.002853244
dotest: BNCDMP alg_1e -- beta_seq_index tot=0.025939159 avg=0.008646386 cnt=3
dotest: BNCDMP min=0.008606238 max=0.008683529
dotest: BNCDMP datagen_0b tot=0.136931619
dotest: BNCDMP dotest tot=0.956365745
dotest: 500000000 5000000
dotest: BNCDMP alg_0a tot=0.737332506 avg=0.245777502 cnt=3
dotest: BNCDMP min=0.244778177 max=0.247548555
dotest: BNCDMP alg_0b -- betanull tot=0.018095312 avg=0.006031770 cnt=3
dotest: BNCDMP min=0.005912708 max=0.006225743
dotest: BNCDMP alg_0c -- betafixed tot=0.028059365 avg=0.009353121 cnt=3
dotest: BNCDMP min=0.009303443 max=0.009407530
dotest: BNCDMP alg_0d -- beta_fixed_index tot=0.029024875 avg=0.009674958 cnt=3
dotest: BNCDMP min=0.009550901 max=0.009752188
dotest: BNCDMP alg_0e -- beta_seq_index tot=0.127149609 avg=0.042383203 cnt=3
dotest: BNCDMP min=0.042218860 max=0.042529218
dotest: BNCDMP alg_0f -- nullsrc tot=0.724878907 avg=0.241626302 cnt=3
dotest: BNCDMP min=0.240794352 max=0.242174302
dotest: BNCDMP alg_1a -- pair tot=0.764044535 avg=0.254681511 cnt=3
dotest: BNCDMP min=0.253329522 max=0.256864373
dotest: BNCDMP alg_1b -- epair tot=0.769463084 avg=0.256487694 cnt=3
dotest: BNCDMP min=0.254830714 max=0.258763409
dotest: BNCDMP alg_1c -- betap tot=0.765345462 avg=0.255115154 cnt=3
dotest: BNCDMP min=0.254364352 max=0.256134647
dotest: BNCDMP alg_1d -- beta_fixed_index tot=0.039104441 avg=0.013034813 cnt=3
dotest: BNCDMP min=0.012103513 max=0.014354033
dotest: BNCDMP alg_1e -- beta_seq_index tot=0.130221038 avg=0.043407012 cnt=3
dotest: BNCDMP min=0.043143231 max=0.043752516
dotest: BNCDMP datagen_0b tot=2.060880641
dotest: BNCDMP dotest tot=6.611719277
dotest: 1000000000 10000000
dotest: BNCDMP alg_0a tot=1.726930574 avg=0.575643524 cnt=3
dotest: BNCDMP min=0.575218786 max=0.576291884
dotest: BNCDMP alg_0b -- betanull tot=0.035615393 avg=0.011871797 cnt=3
dotest: BNCDMP min=0.011820026 max=0.011948646
dotest: BNCDMP alg_0c -- betafixed tot=0.056452922 avg=0.018817640 cnt=3
dotest: BNCDMP min=0.018590739 max=0.019195537
dotest: BNCDMP alg_0d -- beta_fixed_index tot=0.057788343 avg=0.019262781 cnt=3
dotest: BNCDMP min=0.019061426 max=0.019560949
dotest: BNCDMP alg_0e -- beta_seq_index tot=0.253575597 avg=0.084525199 cnt=3
dotest: BNCDMP min=0.084169403 max=0.084902168
dotest: BNCDMP alg_0f -- nullsrc tot=1.718326633 avg=0.572775544 cnt=3
dotest: BNCDMP min=0.571082648 max=0.575134694
dotest: BNCDMP alg_1a -- pair tot=1.792905583 avg=0.597635194 cnt=3
dotest: BNCDMP min=0.590378177 max=0.603253947
dotest: BNCDMP alg_1b -- epair tot=1.797667694 avg=0.599222564 cnt=3
dotest: BNCDMP min=0.589916620 max=0.609757778
dotest: BNCDMP alg_1c -- betap tot=1.794606586 avg=0.598202195 cnt=3
dotest: BNCDMP min=0.593164605 max=0.604739478
dotest: BNCDMP alg_1d -- beta_fixed_index tot=0.073755595 avg=0.024585198 cnt=3
dotest: BNCDMP min=0.024126694 max=0.025124542
dotest: BNCDMP alg_1e -- beta_seq_index tot=0.261664945 avg=0.087221648 cnt=3
dotest: BNCDMP min=0.086277263 max=0.087966703
dotest: BNCDMP datagen_0b tot=4.160519571
dotest: BNCDMP dotest tot=14.607990774
17:39:59.970197677 ph: complete (ELAPSED: 00:00:24.418215274)