C++ 提高正弦/余弦和大型阵列的计算速度
对于信号处理,我需要计算相对较大的C数组,如下面的代码部分所示。到目前为止,这一切都很顺利,但不幸的是,实现速度很慢。“calibdata”的大小约为150k,需要针对不同的频率/相位进行计算。有没有办法显著提高速度?在MATLAB中使用逻辑索引进行同样的操作要快得多 我已经试过了:C++ 提高正弦/余弦和大型阵列的计算速度,c++,c,visual-c++,visual-c++-2010,C++,C,Visual C++,Visual C++ 2010,对于信号处理,我需要计算相对较大的C数组,如下面的代码部分所示。到目前为止,这一切都很顺利,但不幸的是,实现速度很慢。“calibdata”的大小约为150k,需要针对不同的频率/相位进行计算。有没有办法显著提高速度?在MATLAB中使用逻辑索引进行同样的操作要快得多 我已经试过了: 使用正弦的泰勒近似:无显著改进 使用std::vector,也没有显著的改进 代码: double phase_func(双*校准数据、长尺寸、双*频率标度、双fs、双相位、int currentcarrier
- 使用正弦的泰勒近似:无显著改进
- 使用std::vector,也没有显著的改进
double phase_func(双*校准数据、长尺寸、双*频率标度、双fs、双相位、int currentcarrier){
对于(int i=0;i
致以最良好的祝愿,
Thomas
您可以尝试使用基于复指数的余弦定义:cos(x) = (exp(j*x) + exp(-j*x)) / 2,其中 j^2 = -1。
预先计算并存储 exp((2*PI*freqscale[currentcarrier]/fs)*j) 和 exp(phase*j)。这样,for 循环中对 cos(…) 的求值就简化为几次乘法和加法,而 sin()、cos() 和 exp() 只需调用几次。
执行情况如下:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <complex.h>
#include <time.h>
#define PI 3.141592653589
/*
 * Reusable scratch table handed to phase_func2().
 *
 * expo - heap buffer of complex values; phase_func2() overwrites it with
 *        successive powers of the per-sample rotation factor and may
 *        realloc() it when it is too small.
 * size - number of entries the buffer is recorded as holding.
 */
struct cos_plan {
    double complex *expo;
    int size;
};
typedef struct cos_plan cos_plan;
/*
 * Reference implementation: correlates calibdata with a cosine at the
 * selected carrier frequency, calling cos() once per sample.
 *
 * Computes
 *   | (1/size) * sum_{i=0}^{size-1} calibdata[i] * cos(2*PI*f*i/fs + phi) |
 * with f = freqscale[currentcarrier] and phi = phase*(PI/180) - PI/2.
 *
 * calibdata      - input samples; elements 0 .. size-1 are read
 * size           - number of samples (size == 0 divides 0 by 0 and yields NaN)
 * freqscale      - table of frequencies, indexed by currentcarrier
 * fs             - divisor of the per-sample angle (presumably the sampling
 *                  rate - confirm with the caller)
 * phase          - phase offset; multiplied by PI/180, so presumably degrees
 * currentcarrier - index into freqscale
 *
 * Returns the absolute value of the averaged sum.
 */
double phase_func(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier){
    /* Loop-invariant factors, grouped exactly as in the per-sample
     * expression ((2*PI*f)*i)/fs + phi so every term keeps the same value. */
    const double omega = 2 * PI*freqscale[currentcarrier];
    const double offset = phase*(PI / 180.) - (PI / 2.);
    double result = 0;

    /* The counter has the same type as size: an int counter would overflow
     * (undefined behaviour) before reaching a size larger than INT_MAX. */
    for (long i = 0; i < size; i++) {
        result += calibdata[i] * cos(omega * i / fs + offset);
    }
    return fabs(result / size);
}
/*
 * Computes the same quantity as phase_func() without a cos() call per sample.
 *
 * A table expo[i] = w^i, with w = exp(j*2*PI*f/fs) and
 * f = freqscale[currentcarrier], is built using complex multiplications
 * only. The cosine of sample i is then
 *   Re(expo[i] * exp(j*phi)) = Re(expo[i])*cos(phi) - Im(expo[i])*sin(phi)
 * with phi = phase*(PI/180) - PI/2.
 *
 * calibdata      - input samples; elements 0 .. size-1 are read
 * size           - number of samples (size == 0 divides 0 by 0 and yields NaN)
 * freqscale      - table of frequencies, indexed by currentcarrier
 * fs             - divisor of the per-sample angle
 * phase          - phase offset; multiplied by PI/180, so presumably degrees
 * currentcarrier - index into freqscale
 * plan           - scratch table; plan->expo is grown with realloc() when
 *                  plan->size < size and is overwritten on every call
 *
 * Returns the absolute value of the averaged sum. Terminates the process
 * with exit(1) if the table cannot be grown.
 */
double phase_func2(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier, cos_plan* plan){
    const double phase_offset = phase*(PI / 180.) - (PI / 2.);
    const double phasesin = sin(phase_offset);
    const double phasecos = cos(phase_offset);

    if (plan->size < size) {
        double complex *tmp = realloc(plan->expo, size*sizeof(double complex));
        if (tmp == NULL) { fprintf(stderr,"realloc failed\n"); exit(1); }
        plan->expo = tmp;
        /* NOTE(review): plan->size is an int, so a size above INT_MAX is
         * narrowed here; the field type would have to change to fix that. */
        plan->size = size;
    }

    /* Seed only the entries that exist: writing expo[0] / expo[1]
     * unconditionally overruns the buffer when size is 0 or 1. */
    if (size > 0) {
        plan->expo[0] = 1;
    }
    if (size > 1) {
        const double step = 2 * PI*freqscale[currentcarrier]/fs;
        plan->expo[1] = cos(step) + sin(step)*I;
    }

    /* w^i = w^(i/2) * w^(i - i/2). For even i both factors are expo[i/2];
     * for odd i the second factor is expo[i/2 + 1]. Both are already filled. */
    for (long i = 2; i < size; i++) {
        plan->expo[i] = plan->expo[i / 2] * plan->expo[i - i / 2];
    }

    double result = 0;
    for (long i = 0; i < size; i++) {
        /* creal()/cimag() are the accessors for double complex; the
         * creall()/cimagl() variants are meant for long double complex. */
        const double coss = creal(plan->expo[i])*phasecos - cimag(plan->expo[i])*phasesin;
        result += calibdata[i] * coss;
    }
    return fabs(result / size);
}
/*
 * Benchmark driver: fills a synthetic calibdata array and a small frequency
 * table, then times phase_func() against phase_func2() on the same input
 * (carrier index 13) and prints both results with their CPU time.
 *
 * Returns 0 on success; exits with status 1 if an allocation fails.
 */
int main(void){
    //the parameters
    long n = 100000000;
    double *calibdata = malloc(n * sizeof *calibdata);
    if (calibdata == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); }

    int freqnb = 42;
    double *freqscale = malloc(freqnb * sizeof *freqscale);
    if (freqscale == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); }
    for (int i = 0; i < freqnb; i++) {
        freqscale[i] = i*i*0.007+i;
    }

    double fs = n;
    double phase = 0.05;

    //populate calibdata; the index is a long because n is a long
    for (long i = 0; i < n; i++) {
        const double x = i/((double)n);
        calibdata[i] = x*x-x+0.007/(x+3.0);
    }

    //time the reference implementation
    clock_t t = clock();
    double res = phase_func(calibdata, n, freqscale, fs, phase, 13);
    t = clock() - t;
    printf("first call got %g in %g seconds.\n",res,((double)t)/CLOCKS_PER_SEC);

    //prepare the scratch table; this is the largest allocation of the program,
    //so it is checked like the others instead of being dereferenced blindly
    cos_plan plan;
    plan.expo = malloc(n * sizeof *plan.expo);
    if (plan.expo == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); }
    plan.size = n;

    //time the table-based implementation
    t = clock();
    res = phase_func2(calibdata, n, freqscale, fs, phase, 13, &plan);
    t = clock() - t;
    printf("second call got %g in %g seconds.\n",res,((double)t)/CLOCKS_PER_SEC);

    //cleaning
    free(plan.expo);
    free(calibdata);
    free(freqscale);
    return 0;
}
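#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <complex.h>
#include <time.h>
#define PI 3.141592653589
typedef struct cos_plan{
double complex* expo;
int size;
}cos_plan;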
双相位函数(双*校准数据、长尺寸、双*频率标度、双fs、双相位、int电流载波){
double result=0;//初始化
对于(int i=0;i尺寸expo,尺寸*尺寸of(双复数));
如果(tmp==NULL){fprintf(stderr,“realloc失败”\n”);退出(1);}
计划->世博会=tmp;
计划->尺寸=尺寸;
}
计划->世博会[0]=1;
//计划->博览会[1]=exp(2*I*PI*freqscale[currentcarrier]/fs);
计划->博览会[1]=cos(2*PI*frequescale[currentcharrier]/fs)+sin(2*PI*frequescale[currentcharrier]/fs)*I;
//printf(“%g%g\n”、creall(计划->博览会[1])、cimagl(计划->博览会[1]);
对于(INTI=2;iexpo[i]=plan->expo[i/2]*plan->expo[i/2];
}否则{
计划->世博会[i]=计划->世博会[i/2]*计划->世博会[i/2+1];
}
}
//计算结果
double result=0;//初始化
对于(int i=0;iexpo[i]*phaseexp+conj(计划->博览会[i])*phaseexpm);
双coss=creall(计划->世博会[i])*phasecos cimagl(计划->世博会[i])*phasesin;
//printf(“i%d cos%g\n”,i,cos);
结果+=calibdata[i]*coss;
}
结果=晶圆厂(结果/尺寸);
返回结果;
}
int main(){
//参数
长n=100000000;
double*calibdata=malloc(n*sizeof(double));
如果(calibdata==NULL){fprintf(stderr,“malloc失败\n”);退出(1);}
int-freqnb=42;
double*freqscale=malloc(freqnb*sizeof(double));
如果(freqscale==NULL){fprintf(stderr,“malloc failed\n”);退出(1);}
对于(int i=0;i
使用 gcc main.c -o main -std=c99 -lm -Wall -O3 编译。使用您提供的代码,在我的计算机上 size=100000000 时需要 8 秒,而建议的解决方案只需要 1.5 秒。这并不令人印象深刻,但也不容忽视
所提出的解决方案不需要在 for 循环中调用 cos 或 sin,实际上只有乘法和加法。瓶颈要么是内存带宽,要么是平方求幂过程中的条件判断和内存访问(最有可能是前者,因为我额外增加了一个复数数组)
有关c中的复数,请参见:
cos
的计算会更容易,如果 freqscale[currentcarrier]/fs 是一个整数。您的问题与这类变换的计算非常接近:目前的技巧接近离散傅里叶变换,而 FFTW 库非常擅长计算这些变换
请注意,由于有效数字丢失(loss of significance),当前代码可能会产生不准确的结果:res
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <complex.h>
#include <time.h>
#define PI 3.141592653589
/*
 * Reusable scratch table handed to phase_func2().
 *
 * expo - heap buffer of complex values; phase_func2() overwrites it with
 *        successive powers of the per-sample rotation factor and may
 *        realloc() it when it is too small.
 * size - number of entries the buffer is recorded as holding.
 */
struct cos_plan {
    double complex *expo;
    int size;
};
typedef struct cos_plan cos_plan;
/*
 * Reference implementation: correlates calibdata with a cosine at the
 * selected carrier frequency, calling cos() once per sample.
 *
 * Computes
 *   | (1/size) * sum_{i=0}^{size-1} calibdata[i] * cos(2*PI*f*i/fs + phi) |
 * with f = freqscale[currentcarrier] and phi = phase*(PI/180) - PI/2.
 *
 * calibdata      - input samples; elements 0 .. size-1 are read
 * size           - number of samples (size == 0 divides 0 by 0 and yields NaN)
 * freqscale      - table of frequencies, indexed by currentcarrier
 * fs             - divisor of the per-sample angle (presumably the sampling
 *                  rate - confirm with the caller)
 * phase          - phase offset; multiplied by PI/180, so presumably degrees
 * currentcarrier - index into freqscale
 *
 * Returns the absolute value of the averaged sum.
 */
double phase_func(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier){
    /* Loop-invariant factors, grouped exactly as in the per-sample
     * expression ((2*PI*f)*i)/fs + phi so every term keeps the same value. */
    const double omega = 2 * PI*freqscale[currentcarrier];
    const double offset = phase*(PI / 180.) - (PI / 2.);
    double result = 0;

    /* The counter has the same type as size: an int counter would overflow
     * (undefined behaviour) before reaching a size larger than INT_MAX. */
    for (long i = 0; i < size; i++) {
        result += calibdata[i] * cos(omega * i / fs + offset);
    }
    return fabs(result / size);
}
/*
 * Computes the same quantity as phase_func() without a cos() call per sample.
 *
 * A table expo[i] = w^i, with w = exp(j*2*PI*f/fs) and
 * f = freqscale[currentcarrier], is built using complex multiplications
 * only. The cosine of sample i is then
 *   Re(expo[i] * exp(j*phi)) = Re(expo[i])*cos(phi) - Im(expo[i])*sin(phi)
 * with phi = phase*(PI/180) - PI/2.
 *
 * calibdata      - input samples; elements 0 .. size-1 are read
 * size           - number of samples (size == 0 divides 0 by 0 and yields NaN)
 * freqscale      - table of frequencies, indexed by currentcarrier
 * fs             - divisor of the per-sample angle
 * phase          - phase offset; multiplied by PI/180, so presumably degrees
 * currentcarrier - index into freqscale
 * plan           - scratch table; plan->expo is grown with realloc() when
 *                  plan->size < size and is overwritten on every call
 *
 * Returns the absolute value of the averaged sum. Terminates the process
 * with exit(1) if the table cannot be grown.
 */
double phase_func2(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier, cos_plan* plan){
    const double phase_offset = phase*(PI / 180.) - (PI / 2.);
    const double phasesin = sin(phase_offset);
    const double phasecos = cos(phase_offset);

    if (plan->size < size) {
        double complex *tmp = realloc(plan->expo, size*sizeof(double complex));
        if (tmp == NULL) { fprintf(stderr,"realloc failed\n"); exit(1); }
        plan->expo = tmp;
        /* NOTE(review): plan->size is an int, so a size above INT_MAX is
         * narrowed here; the field type would have to change to fix that. */
        plan->size = size;
    }

    /* Seed only the entries that exist: writing expo[0] / expo[1]
     * unconditionally overruns the buffer when size is 0 or 1. */
    if (size > 0) {
        plan->expo[0] = 1;
    }
    if (size > 1) {
        const double step = 2 * PI*freqscale[currentcarrier]/fs;
        plan->expo[1] = cos(step) + sin(step)*I;
    }

    /* w^i = w^(i/2) * w^(i - i/2). For even i both factors are expo[i/2];
     * for odd i the second factor is expo[i/2 + 1]. Both are already filled. */
    for (long i = 2; i < size; i++) {
        plan->expo[i] = plan->expo[i / 2] * plan->expo[i - i / 2];
    }

    double result = 0;
    for (long i = 0; i < size; i++) {
        /* creal()/cimag() are the accessors for double complex; the
         * creall()/cimagl() variants are meant for long double complex. */
        const double coss = creal(plan->expo[i])*phasecos - cimag(plan->expo[i])*phasesin;
        result += calibdata[i] * coss;
    }
    return fabs(result / size);
}
/*
 * Benchmark driver: fills a synthetic calibdata array and a small frequency
 * table, then times phase_func() against phase_func2() on the same input
 * (carrier index 13) and prints both results with their CPU time.
 *
 * Returns 0 on success; exits with status 1 if an allocation fails.
 */
int main(void){
    //the parameters
    long n = 100000000;
    double *calibdata = malloc(n * sizeof *calibdata);
    if (calibdata == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); }

    int freqnb = 42;
    double *freqscale = malloc(freqnb * sizeof *freqscale);
    if (freqscale == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); }
    for (int i = 0; i < freqnb; i++) {
        freqscale[i] = i*i*0.007+i;
    }

    double fs = n;
    double phase = 0.05;

    //populate calibdata; the index is a long because n is a long
    for (long i = 0; i < n; i++) {
        const double x = i/((double)n);
        calibdata[i] = x*x-x+0.007/(x+3.0);
    }

    //time the reference implementation
    clock_t t = clock();
    double res = phase_func(calibdata, n, freqscale, fs, phase, 13);
    t = clock() - t;
    printf("first call got %g in %g seconds.\n",res,((double)t)/CLOCKS_PER_SEC);

    //prepare the scratch table; this is the largest allocation of the program,
    //so it is checked like the others instead of being dereferenced blindly
    cos_plan plan;
    plan.expo = malloc(n * sizeof *plan.expo);
    if (plan.expo == NULL) { fprintf(stderr,"malloc failed\n"); exit(1); }
    plan.size = n;

    //time the table-based implementation
    t = clock();
    res = phase_func2(calibdata, n, freqscale, fs, phase, 13, &plan);
    t = clock() - t;
    printf("second call got %g in %g seconds.\n",res,((double)t)/CLOCKS_PER_SEC);

    //cleaning
    free(plan.expo);
    free(calibdata);
    free(freqscale);
    return 0;
}
/*
 * Variant of phase_func() that keeps a running angle instead of recomputing
 * 2*PI*f*i/fs for every sample.
 *
 * The angle starts at phase*(PI/180) - PI/2 and is advanced by
 * delta = 2*PI*freqscale[currentcarrier]/fs after each sample, so the loop
 * body contains one cos() call, one multiply and two additions.
 *
 * calibdata      - input samples; elements 0 .. size-1 are read
 * size           - number of samples (size == 0 divides 0 by 0 and yields NaN)
 * freqscale      - table of frequencies, indexed by currentcarrier
 * fs             - divisor of the per-sample angle step
 * phase          - phase offset; multiplied by PI/180, so presumably degrees
 * currentcarrier - index into freqscale
 *
 * Returns the absolute value of the averaged sum.
 */
double phase_func(double* calibdata, long size, double* freqscale, double fs, double phase, int currentcarrier)
{
    const double delta = 2 * PI * freqscale[currentcarrier] / fs;
    double angle = phase * (PI / 180) - (PI / 2);
    double result = 0;

    /* The counter has the same type as size: an int counter would overflow
     * (undefined behaviour) before reaching a size larger than INT_MAX. */
    for (long i = 0; i < size; i++) {
        result += calibdata[i] * cos(angle);
        /* NOTE(review): the angle is built by repeated addition, so its
         * rounding error can grow with i - check against the direct form. */
        angle += delta;
    }
    return fabs(result / size);
}
/* Sketch of the accumulation loop unrolled by two: each pass handles the
 * samples at i and i + 1. The two cosine values are left as placeholders
 * and must be filled in before this compiles. */
register double result = 0.0;
register unsigned int i = 0U;
/* NOTE(review): the loop steps by 2 but only tests i < size, so for an odd
 * size the last pass reads calibdata[size]; a tail case looks necessary. */
for (i = 0; i < size; i += 2)
{
register double cos_angle1 = /* ... */;
register double cos_angle2 = /* ... */;
result += calibdata[i + 0] * cos_angle1;
result += calibdata[i + 1] * cos_angle2;
}
cosine(x - π/2) == sine(x)
// Before/after fragment 1: the commented-out lines show the previous
// signature; the live lines add const and restrict to both array parameters.
// double phase_func(double* calibdata, long size,
// double* freqscale, double fs, double phase, int currentcarrier) {
double phase_func(const double* restrict calibdata, long size,
const double* restrict freqscale, double fs, double phase, int currentcarrier) {
// Before/after fragment 2: cos() is swapped for the float-precision cosf();
// the argument is elided ("...") in this sketch.
// result += calibdata[i] * cos(...
result += calibdata[i] * cosf(...
/* Sketch: keep a running angle and advance it by a fixed step after each
 * sample, so the loop body needs no per-sample multiplication by i.
 * The two initial values are elided ("...") and must be supplied. */
double angle_delta = ...;
double angle_current = ...;
for (int i = 0; i < size; i++) {
result += calibdata[i] * cos(angle_current);
/* advance to the angle of the next sample */
angle_current += angle_delta;
}
/*
 * Running-angle variant that walks the samples from last to first and uses
 * the single-precision sinf() in place of cos().
 *
 * The angle starts at step*(size - 1) + phase*(PI/180), with
 * step = 2*PI*freqscale[currentcarrier]/fs, and is reduced by step after
 * each sample. Every product is subtracted from the sum; the sign of the
 * total is discarded by the final fabs().
 *
 * calibdata      - input samples; elements size-1 .. 0 are read, in that order
 * size           - number of samples; it is unsigned, so size - 1 wraps
 *                  around when size is 0
 * freqscale      - table of frequencies, indexed by currentcarrier
 * fs             - divisor of the per-sample angle step
 * phase          - phase offset; multiplied by PI/180, so presumably degrees
 * currentcarrier - index into freqscale
 *
 * Returns the absolute value of the averaged sum.
 */
double phase_func2(const double* restrict calibdata, size_t size,
const double* restrict freqscale, double fs, double phase,
size_t currentcarrier) {
    double step = 2.0 * PI * freqscale[currentcarrier] / fs;
    double angle = step * (size - 1) + phase * (PI / 180);
    double sum = 0.0;

    /* k runs size-1, size-2, ..., 0; the angle is stepped back after each use */
    for (size_t k = size; k-- > 0; angle -= step) {
        sum -= calibdata[k] * sinf(angle);
    }

    return fabs(sum / size);
}
/*
 * OpenMP variant of the direct formula: each iteration evaluates
 *   calibdata[i] * cos(2*PI*f*i/fs + phase*(PI/180) - PI/2)
 * with f = freqscale[currentcarrier], and the products are combined with a
 * "+" reduction. Without OpenMP enabled the pragma is ignored and the loop
 * runs serially.
 *
 * calibdata      - input samples; elements 0 .. size-1 are read
 * size           - number of samples
 * freqscale      - table of frequencies, indexed by currentcarrier
 * fs             - divisor of the per-sample angle
 * phase          - phase offset; multiplied by PI/180, so presumably degrees
 * currentcarrier - index into freqscale
 *
 * Returns the absolute value of the averaged sum.
 */
double phase_func3(double* calibdata, const int size, const double* freqscale,
const double fs, const double phase, const size_t currentcarrier)
{
    constexpr double PI = 3.141592653589;
    double accumulated = 0.0;

#pragma omp parallel for reduction(+: accumulated)
    for (int sample = 0; sample < size; ++sample) {
        const double angle =
            (2 * PI*freqscale[currentcarrier] * sample / fs) + (phase*(PI / 180.0) - (PI / 2.0));
        accumulated += calibdata[sample] * cos(angle);
    }

    return fabs(accumulated / size);
}