C++ 提高正弦/余弦和大型阵列的计算速度_C++_C_Visual C++_Visual C++ 2010

C++ 提高正弦/余弦和大型阵列的计算速度

c++ c visual-c++

C++ 提高正弦/余弦和大型阵列的计算速度,c++,c,visual-c++,visual-c++-2010,C++,C,Visual C++,Visual C++ 2010,对于信号处理，我需要计算相对较大的C数组，如下面的代码部分所示。到目前为止，这一切都很顺利，但不幸的是，实现速度很慢。“calibdata”的大小约为150k，需要针对不同的频率/相位进行计算。有没有办法显著提高速度？在MATLAB中使用逻辑索引进行同样的操作要快得多我已经试过了：使用正弦的泰勒近似：无显著改进使用std:：vector，也没有显著的改进代码： double phase_func（双*校准数据、长尺寸、双*频率标度、双fs、双相位、int currentcarrier

对于信号处理，我需要计算相对较大的C数组，如下面的代码部分所示。到目前为止，这一切都很顺利，但不幸的是，实现速度很慢。“calibdata”的大小约为150k，需要针对不同的频率/相位进行计算。有没有办法显著提高速度？在MATLAB中使用逻辑索引进行同样的操作要快得多

我已经试过了：

使用正弦的泰勒近似：无显著改进
使用 std::vector：同样没有显著的改进

代码：

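double phase_func(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier){
    double result=0;
    for (int i = 0; i < size; i++){
        result += calibdata[i] * cos ( (2 * PI*freqscale[currentcarrier] * i / fs) + (phase*(PI / 180.) - (PI / 2.)) );
    }
    result = fabs(result / size);
    return result;
}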


致以最良好的祝愿，
Thomas
您可以尝试使用基于复指数的余弦定义：

$$\cos(x) = \frac{e^{jx} + e^{-jx}}{2}$$

其中 $j^2 = -1$

预先存储 exp((2*PI*freqscale[currentcarrier]/fs)*j) 和 exp(phase*j)。这样，cos(…) 的求值在 for 循环中就化为几次乘法和加法，而 sin()、cos() 和 exp() 只需调用几次。
实现如下：
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <complex.h>
#include <time.h> 

/* Pi to full double precision. The previous 12-decimal value
 * (3.141592653589) carried a relative error of about 2.5e-13 into every
 * angle derived from it. */
#define PI   3.14159265358979323846

/*
 * Scratch buffer that phase_func2 reuses between calls.
 *   expo - heap array of precomputed complex exponentials.
 *   size - number of elements currently allocated in expo.
 */
typedef struct cos_plan {
    double complex *expo;
    int size;
} cos_plan;

/*
 * Averaged correlation of calibdata with a cosine reference.
 *
 * Computes |(1/size) * sum_{i=0}^{size-1} calibdata[i] *
 *           cos(2*PI*freqscale[currentcarrier]*i/fs + phase*PI/180 - PI/2)|.
 *
 * calibdata      - samples; elements 0..size-1 are read.
 * size           - number of samples.
 * freqscale      - frequency table; only freqscale[currentcarrier] is read.
 * fs             - divisor applied to the frequency term.
 * phase          - phase offset, scaled by PI/180 (i.e. given in degrees).
 * currentcarrier - index into freqscale.
 *
 * Returns the absolute value of the averaged sum, or 0.0 when size <= 0.
 */
double phase_func(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier){
    if (size <= 0) {
        return 0.0;   /* nothing to average; avoids evaluating 0/0 */
    }

    /* Loop invariants, grouped exactly as in the per-sample expression so the
     * floating-point results are unchanged. */
    const double omega  = 2 * PI*freqscale[currentcarrier];
    const double offset = phase*(PI / 180.) - (PI / 2.);

    double result = 0;
    /* The index has the same type as size: an int counter would overflow
     * before reaching sizes above INT_MAX. */
    for (long i = 0; i < size; i++) {
        result += calibdata[i] * cos((omega * i / fs) + offset);
    }

    return fabs(result / size);
}

/*
 * Same quantity as phase_func, computed without a cos() call per sample.
 *
 * With theta = 2*PI*freqscale[currentcarrier]/fs and w = cos(theta) + I*sin(theta),
 * plan->expo[i] is filled with w^i by multiplying two earlier entries whose
 * indices add up to i. The per-sample cosine is then
 *     cos(i*theta + phi) = Re(w^i)*cos(phi) - Im(w^i)*sin(phi),
 * with phi = phase*PI/180 - PI/2.
 *
 * plan - scratch buffer; grown with realloc when it holds fewer than `size`
 *        entries. The process exits with status 1 if that allocation fails.
 *
 * Returns |sum / size|, or 0.0 when size <= 0 (the plan is left untouched).
 */
double phase_func2(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier, cos_plan* plan){
    if (size <= 0) {
        return 0.0;   /* no samples: never write into a possibly empty buffer */
    }

    const double phasesin = sin(phase*(PI / 180.) - (PI / 2.));
    const double phasecos = cos(phase*(PI / 180.) - (PI / 2.));

    if (plan->size < size) {
        double complex *grown = realloc(plan->expo, (size_t)size * sizeof *grown);
        if (grown == NULL) { fprintf(stderr,"realloc failed\n"); exit(1); }
        plan->expo = grown;
        /* NOTE(review): the field is an int; sizes above INT_MAX do not fit. */
        plan->size = (int)size;
    }

    plan->expo[0] = 1;
    if (size > 1) {
        /* expo[1] only exists when at least two entries were requested */
        const double theta = 2 * PI*freqscale[currentcarrier]/fs;
        plan->expo[1] = cos(theta) + sin(theta)*I;
    }
    for (long i = 2; i < size; i++) {
        const long half = i / 2;
        /* i == half + half for even i, half + (half + 1) for odd i */
        plan->expo[i] = plan->expo[half] * plan->expo[half + (i % 2)];
    }

    double result = 0;
    for (long i = 0; i < size; i++) {
        /* creal/cimag are the accessors matching double complex; the long
         * double variants forced a needless promotion. */
        const double coss = creal(plan->expo[i])*phasecos - cimag(plan->expo[i])*phasesin;
        result += calibdata[i] * coss;
    }

    return fabs(result / size);
}

/*
 * Benchmark driver: builds a synthetic calibdata array, runs phase_func and
 * phase_func2 on it with carrier index 13, and prints each result together
 * with the CPU time reported by clock().
 */
int main(){
    /* problem size and inputs */
    long n=100000000;
    double* calibdata=malloc(n*sizeof(double));
    if(calibdata==NULL){fprintf(stderr,"malloc failed\n");exit(1);}

    int freqnb=42;
    double* freqscale=malloc(freqnb*sizeof(double));
    if(freqscale==NULL){fprintf(stderr,"malloc failed\n");exit(1);}
    for (int i = 0; i < freqnb; i++){
        freqscale[i]=i*i*0.007+i;
    }

    double fs=n;
    double phase=0.05;

    /* populate calibdata from x = i/n */
    for (long i = 0; i < n; i++){
        double x=i/((double)n);
        calibdata[i]=x*x-x+0.007/(x+3.0);
    }

    /* time the direct implementation */
    clock_t t;
    t = clock();
    double res=phase_func(calibdata,n, freqscale, fs, phase, 13);
    t = clock() - t;

    printf("first call got %g in %g seconds.\n",res,((float)t)/CLOCKS_PER_SEC);

    /* Pre-size the plan. Because plan.size is set to n, phase_func2 will not
     * reallocate, so a failed allocation has to be caught here. */
    cos_plan plan;
    plan.expo=malloc(n*sizeof(double complex));
    if(plan.expo==NULL){fprintf(stderr,"malloc failed\n");exit(1);}
    plan.size=n;

    /* time the complex-exponential implementation */
    t = clock();
    res=phase_func2(calibdata,n, freqscale, fs, phase, 13,&plan);
    t = clock() - t;

    printf("second call got %g in %g seconds.\n",res,((float)t)/CLOCKS_PER_SEC);

    /* cleanup */
    free(plan.expo);
    free(calibdata);
    free(freqscale);

    return 0;
}

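#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <complex.h>
#include <time.h>
#define PI   3.141592653589
typedef struct cos_plan{
    double complex* expo;
    int size;
}cos_plan;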
双相位函数（双*校准数据、长尺寸、双*频率标度、双fs、双相位、int电流载波）{
double result=0；//初始化
对于（int i=0；i尺寸expo，尺寸*尺寸of（双复数））；
如果（tmp==NULL）{fprintf（stderr，“realloc失败”\n”）；退出（1）；}
计划->世博会=tmp；
计划->尺寸=尺寸；
}
计划->世博会[0]=1；
//计划->博览会[1]=exp（2*I*PI*freqscale[currentcarrier]/fs）；
计划->博览会[1]=cos（2*PI*frequescale[currentcharrier]/fs）+sin（2*PI*frequescale[currentcharrier]/fs）*I；
//printf（“%g%g\n”、creall（计划->博览会[1]）、cimagl（计划->博览会[1]）；
对于（INTI=2；iexpo[i]=plan->expo[i/2]*plan->expo[i/2]；
}否则{
计划->世博会[i]=计划->世博会[i/2]*计划->世博会[i/2+1]；
}
}
//计算结果
double result=0；//初始化
对于（int i=0；iexpo[i]*phaseexp+conj（计划->博览会[i]）*phaseexpm）；
双coss=creall（计划->世博会[i]）*phasecos cimagl（计划->世博会[i]）*phasesin；
//printf（“i%d cos%g\n”，i，cos）；
结果+=calibdata[i]*coss；
}
结果=晶圆厂（结果/尺寸）；
返回结果；
}
int main（）{
//参数
长n=100000000；
double*calibdata=malloc（n*sizeof（double））；
如果（calibdata==NULL）{fprintf（stderr，“malloc失败\n”）；退出（1）；}
int-freqnb=42；
double*freqscale=malloc（freqnb*sizeof（double））；
如果（freqscale==NULL）{fprintf（stderr，“malloc failed\n”）；退出（1）；}
对于（int i=0；i

使用 gcc main.c -o main -std=c99 -lm -Wall -O3 编译。使用您提供的代码，在我的计算机上 size=100000000 时需要 8 秒，而建议的解决方案只需要 1.5 秒。这并不令人印象深刻，但也不容忽视。
所提出的解决方案不在 for 循环中调用 cos 或 sin，实际上只有乘法和加法。瓶颈要么是内存带宽，要么是平方求幂中的条件测试和内存访问（最有可能是前者，因为我额外添加了一个复数数组）。
有关c中的复数，请参见：




如果问题在于内存带宽，则需要并行化……此时直接计算 cos 会更容易。如果 freqscale[currentcarrier]/fs 是一个整数，还可以进一步简化。您的问题非常接近离散余弦变换的计算，当前的技巧则接近离散傅里叶变换，而 FFTW 库非常擅长计算这些变换。
请注意，由于有效位数丢失（loss of significance），当前代码可能会产生不准确的结果：res
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <complex.h>
#include <time.h> 

/* Pi to full double precision. The previous 12-decimal value
 * (3.141592653589) carried a relative error of about 2.5e-13 into every
 * angle derived from it. */
#define PI   3.14159265358979323846

/*
 * Scratch buffer that phase_func2 reuses between calls.
 *   expo - heap array of precomputed complex exponentials.
 *   size - number of elements currently allocated in expo.
 */
typedef struct cos_plan {
    double complex *expo;
    int size;
} cos_plan;

/*
 * Averaged correlation of calibdata with a cosine reference.
 *
 * Computes |(1/size) * sum_{i=0}^{size-1} calibdata[i] *
 *           cos(2*PI*freqscale[currentcarrier]*i/fs + phase*PI/180 - PI/2)|.
 *
 * calibdata      - samples; elements 0..size-1 are read.
 * size           - number of samples.
 * freqscale      - frequency table; only freqscale[currentcarrier] is read.
 * fs             - divisor applied to the frequency term.
 * phase          - phase offset, scaled by PI/180 (i.e. given in degrees).
 * currentcarrier - index into freqscale.
 *
 * Returns the absolute value of the averaged sum, or 0.0 when size <= 0.
 */
double phase_func(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier){
    if (size <= 0) {
        return 0.0;   /* nothing to average; avoids evaluating 0/0 */
    }

    /* Loop invariants, grouped exactly as in the per-sample expression so the
     * floating-point results are unchanged. */
    const double omega  = 2 * PI*freqscale[currentcarrier];
    const double offset = phase*(PI / 180.) - (PI / 2.);

    double result = 0;
    /* The index has the same type as size: an int counter would overflow
     * before reaching sizes above INT_MAX. */
    for (long i = 0; i < size; i++) {
        result += calibdata[i] * cos((omega * i / fs) + offset);
    }

    return fabs(result / size);
}

/*
 * Same quantity as phase_func, computed without a cos() call per sample.
 *
 * With theta = 2*PI*freqscale[currentcarrier]/fs and w = cos(theta) + I*sin(theta),
 * plan->expo[i] is filled with w^i by multiplying two earlier entries whose
 * indices add up to i. The per-sample cosine is then
 *     cos(i*theta + phi) = Re(w^i)*cos(phi) - Im(w^i)*sin(phi),
 * with phi = phase*PI/180 - PI/2.
 *
 * plan - scratch buffer; grown with realloc when it holds fewer than `size`
 *        entries. The process exits with status 1 if that allocation fails.
 *
 * Returns |sum / size|, or 0.0 when size <= 0 (the plan is left untouched).
 */
double phase_func2(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier, cos_plan* plan){
    if (size <= 0) {
        return 0.0;   /* no samples: never write into a possibly empty buffer */
    }

    const double phasesin = sin(phase*(PI / 180.) - (PI / 2.));
    const double phasecos = cos(phase*(PI / 180.) - (PI / 2.));

    if (plan->size < size) {
        double complex *grown = realloc(plan->expo, (size_t)size * sizeof *grown);
        if (grown == NULL) { fprintf(stderr,"realloc failed\n"); exit(1); }
        plan->expo = grown;
        /* NOTE(review): the field is an int; sizes above INT_MAX do not fit. */
        plan->size = (int)size;
    }

    plan->expo[0] = 1;
    if (size > 1) {
        /* expo[1] only exists when at least two entries were requested */
        const double theta = 2 * PI*freqscale[currentcarrier]/fs;
        plan->expo[1] = cos(theta) + sin(theta)*I;
    }
    for (long i = 2; i < size; i++) {
        const long half = i / 2;
        /* i == half + half for even i, half + (half + 1) for odd i */
        plan->expo[i] = plan->expo[half] * plan->expo[half + (i % 2)];
    }

    double result = 0;
    for (long i = 0; i < size; i++) {
        /* creal/cimag are the accessors matching double complex; the long
         * double variants forced a needless promotion. */
        const double coss = creal(plan->expo[i])*phasecos - cimag(plan->expo[i])*phasesin;
        result += calibdata[i] * coss;
    }

    return fabs(result / size);
}

/*
 * Benchmark driver: builds a synthetic calibdata array, runs phase_func and
 * phase_func2 on it with carrier index 13, and prints each result together
 * with the CPU time reported by clock().
 */
int main(){
    /* problem size and inputs */
    long n=100000000;
    double* calibdata=malloc(n*sizeof(double));
    if(calibdata==NULL){fprintf(stderr,"malloc failed\n");exit(1);}

    int freqnb=42;
    double* freqscale=malloc(freqnb*sizeof(double));
    if(freqscale==NULL){fprintf(stderr,"malloc failed\n");exit(1);}
    for (int i = 0; i < freqnb; i++){
        freqscale[i]=i*i*0.007+i;
    }

    double fs=n;
    double phase=0.05;

    /* populate calibdata from x = i/n */
    for (long i = 0; i < n; i++){
        double x=i/((double)n);
        calibdata[i]=x*x-x+0.007/(x+3.0);
    }

    /* time the direct implementation */
    clock_t t;
    t = clock();
    double res=phase_func(calibdata,n, freqscale, fs, phase, 13);
    t = clock() - t;

    printf("first call got %g in %g seconds.\n",res,((float)t)/CLOCKS_PER_SEC);

    /* Pre-size the plan. Because plan.size is set to n, phase_func2 will not
     * reallocate, so a failed allocation has to be caught here. */
    cos_plan plan;
    plan.expo=malloc(n*sizeof(double complex));
    if(plan.expo==NULL){fprintf(stderr,"malloc failed\n");exit(1);}
    plan.size=n;

    /* time the complex-exponential implementation */
    t = clock();
    res=phase_func2(calibdata,n, freqscale, fs, phase, 13,&plan);
    t = clock() - t;

    printf("second call got %g in %g seconds.\n",res,((float)t)/CLOCKS_PER_SEC);

    /* cleanup */
    free(plan.expo);
    free(calibdata);
    free(freqscale);

    return 0;
}

/*
 * Variant of phase_func that advances the angle by a fixed step per sample
 * instead of recomputing it from the index.
 *
 * The angle starts at phase*PI/180 - PI/2 and grows by
 * 2*PI*freqscale[currentcarrier]/fs each iteration. Every addition rounds,
 * so for large sizes the running angle can drift from start + i*delta.
 *
 * Returns |sum / size|, or 0.0 when size <= 0.
 */
double phase_func(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier)
{
    if (size <= 0)
    {
        return 0.0;   /* nothing to average; avoids evaluating 0/0 */
    }

    double result = 0;
    double angle = phase * (PI / 180) - (PI / 2);
    const double delta = 2 * PI * freqscale[currentcarrier] / fs;
    /* long index to match size; an int counter overflows above INT_MAX */
    for (long i = 0; i < size; i++)
    {
        result += calibdata[i] * cos( angle );
        angle += delta;
    }
    return fabs(result / size);
}

register double result = 0.0;
register unsigned int i = 0U;
for (i = 0; i < size; i += 2)
{
    register double cos_angle1 = /* ... */;
    register double cos_angle2 = /* ... */;
    result += calibdata[i + 0] * cos_angle1;
    result += calibdata[i + 1] * cos_angle2;
}

cosine(x - π/2) == sine(x)

// double phase_func(double* calibdata, long size, 
//     double* freqscale, double fs, double phase, int currentcarrier) {
double phase_func(const double* restrict calibdata, long size, 
    const double* restrict freqscale, double fs, double phase, int currentcarrier) {

// result += calibdata[i] * cos(...
result += calibdata[i] * cosf(...

/* Incremental form: the angle is advanced by a fixed step after each sample
 * rather than being recomputed from the index i. */
double angle_delta = ...;
double angle_current = ...;
for (int i = 0; i < size; i++) {
  result += calibdata[i] * cos(angle_current);
  angle_current += angle_delta;
}

/*
 * Sine-based variant: since cos(x - PI/2) == sin(x), each term is
 * calibdata[i] * sin(i*angle_delta + phase*PI/180).
 *
 * Samples are visited from the last to the first, with the angle stepped
 * down by angle_delta each time. The single-precision sinf call is kept from
 * the original, so accuracy is limited to roughly float precision.
 *
 * Returns |sum / size|, or 0.0 when size == 0.
 */
double phase_func2(const double* restrict calibdata, size_t size,
    const double* restrict freqscale, double fs, double phase,
    size_t currentcarrier) {

  if (size == 0) {
    return 0.0;  /* avoids the unsigned wrap in size - 1 and a 0/0 result */
  }

  double result = 0.0;
  const double angle_delta = 2.0 * PI * freqscale[currentcarrier] / fs;
  double angle_current = angle_delta * (size - 1) + phase * (PI / 180);
  size_t i = size;
  while (i) {
    /* Added, not subtracted: the sign only mattered before fabs, and the
     * identity above has no minus sign. */
    result += calibdata[--i] * sinf(angle_current);
    angle_current -= angle_delta;
  }
  return fabs(result / size);
}

// Direct cosine sum with the loop marked for OpenMP: iterations are shared
// between threads and the per-thread partial sums are combined by the
// reduction clause. Returns |sum / size|.
double phase_func3(double* calibdata, const int size, const double* freqscale, 
    const double fs, const double phase, const size_t currentcarrier)
{
    constexpr double PI = 3.141592653589;

    // Loop invariants, grouped as in the per-sample expression.
    const double omega = 2 * PI*freqscale[currentcarrier];
    const double offset = phase*(PI / 180.0) - (PI / 2.0);

    double acc = 0.0;

#pragma omp parallel for reduction(+: acc)
    for (int k = 0; k < size; ++k) {
        acc += calibdata[k] * cos((omega * k / fs) + offset);
    }

    return fabs(acc / size);
}