Performance c代码从嵌套for循环开始运行速度变慢_Performance_Loops_For Loop_Nested

Performance c代码从嵌套for循环开始运行速度变慢

performance loops for-loop

Performance c代码从嵌套for循环开始运行速度变慢,performance,loops,for-loop,nested,Performance,Loops,For Loop,Nested,我的c程序运行速度很慢（现在大约40秒，没有并行化）。我尝试过使用openmp，这大大降低了计时，但我希望使用简单自然的方法，使我的代码运行得更快，而不是使用并行for循环。代码的基本结构是将一些命令行参数作为输入，然后将这些输入保存为变量。然后，它使用math.h库和complex.h库递归计算一个名为Rplus1的变量。代码的问题以及它占用大部分时间的地方是在循环嵌套的底部。我的目标是让整个代码在5秒内运行，但到目前为止，它在40秒内运行，而不使用并行for循环。请帮忙 #include "

我的 C 程序运行得很慢（未并行化时目前大约需要 40 秒）。我试过 OpenMP，它大幅缩短了运行时间，但我希望用简单、自然的方法让代码本身运行得更快，而不是依赖并行 for 循环。代码的基本结构是：读取若干命令行参数并保存为变量，然后使用 math.h 和 complex.h 库递推计算一个名为 Rplus1 的变量。问题出在代码末尾的嵌套循环，绝大部分时间都耗在那里。我的目标是让整个程序在 5 秒内运行完毕，但目前在不使用并行 for 循环的情况下需要 40 秒。请帮忙。

#include "time.h"
#include "stdio.h"
#include "stdlib.h"
#include "complex.h"
#include "math.h"
#include "string.h"
#include "unistd.h"
#include "omp.h"
#define PI 3.14159265


/* Side length, in pixels, of the square diffraction pattern. */
enum { GRID = 1001 };

/* Number of element columns in the coefficient and dispersion tables. */
enum { N_ELEMENTS = 10 };

/*
 * Read exactly `count` doubles from the binary file `path` into `dst`.
 *
 * Returns 0 on success. On failure (file cannot be opened, or it holds
 * fewer than `count` doubles) prints a diagnostic to stderr and returns -1.
 */
static int read_doubles(const char *path, double *dst, size_t count)
{
    FILE *fp = fopen(path, "rb");
    if (fp == NULL) {
        perror(path);
        return -1;
    }

    size_t got = fread(dst, sizeof *dst, count, fp);
    fclose(fp);

    if (got != count) {
        fprintf(stderr, "%s: expected %zu doubles, read %zu\n", path, count, got);
        return -1;
    }
    return 0;
}

/*
 * Computes the Rplus1 recurrence over a GRID x GRID diffraction pattern and
 * prints two wall-clock timings (setup only, and setup + recurrence).
 *
 * Command line arguments, in order:
 *   [theta] [number of layers in superlattice]
 *   [material_1] [lat const_1] [number of unit cells_1] ...
 *   [material_N] [lat const_N] [number of unit cells_N]
 *   [Log/Linear] [number of repeating superlattice layers] [yes/no]
 *
 * theta is given in degrees, lattice constants in Angstrom.  A final "yes"
 * adds one layer (the substrate) to the layer count.  Layer i (1-based)
 * reads argv[3*i], argv[3*i+1] and argv[3*i+2].  The Log/Linear and
 * repeat-count arguments are not used by this function.
 *
 * Returns EXIT_SUCCESS on success, EXIT_FAILURE on bad arguments, unreadable
 * table files or allocation failure.
 */
int main (int argc, char *argv[])
{
    if (argc < 8) {
        fprintf(stderr,
                "usage: theta n_layers {material lat_const n_cells}... "
                "Log|Linear n_repeats yes|no\n");
        return EXIT_FAILURE;
    }

    double start1 = omp_get_wtime();

    /* Largest layer count whose last argument, argv[3*N+2], still exists. */
    const int max_layers = (argc - 3) / 3;

    int N;
    if (sscanf(argv[2], "%d", &N) != 1 || N < 1 || N > max_layers) {
        fprintf(stderr, "invalid number of layers: %s\n", argv[2]);
        return EXIT_FAILURE;
    }
    if (strcmp(argv[argc-1], "yes") == 0) { /* substrate included: one more layer */
        N = N + 1;
    }
    if (N > max_layers) {
        fprintf(stderr, "not enough layer arguments for %d layers\n", N);
        return EXIT_FAILURE;
    }

    /* Speed of light, Planck's constant, photon energy. */
    const double c_light = 299792458;
    const double h_pl = 4.135667516e-15;
    const double E = 10e3;
    const double r_0 = 2.818e-15; /* prefactor of g and g_0 */
    const double lam = (h_pl*c_light)/E;

    double angle; /* angle of incidence, converted to radians below */
    if (sscanf(argv[1], "%lf", &angle) != 1) {
        fprintf(stderr, "invalid angle of incidence: %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    angle = angle*(PI/180.0);

    /* Coefficient tables: small and fixed-size, so they live on the stack. */
    double table[9][N_ELEMENTS];
    double table2[2][N_ELEMENTS];
    if (read_doubles("/home/vhosts/xraydev.engr.wisc.edu/data/coef_table.bin",
                     &table[0][0], 9 * N_ELEMENTS) != 0 ||
        read_doubles("/home/vhosts/xraydev.engr.wisc.edu/data/dispersioncs.bin",
                     &table2[0][0], 2 * N_ELEMENTS) != 0) {
        return EXIT_FAILURE;
    }

    /* Scattering factor of every element: four a*exp(-b*k_z^2) terms plus
     * a constant from table row 8 and the complex dispersion correction. */
    double complex SF_table[N_ELEMENTS];
    const double k_z = (sin(angle)/lam)*1e-10;

    for (int e = 0; e < N_ELEMENTS; e++) {
        double complex sf = 0;
        for (int t = 0; t < 4; t++) {
            const double a = table[2*t][e];
            const double b = table[2*t+1][e];
            sf = sf + a * exp(-b*k_z*k_z);
        }
        SF_table[e] = sf + table[8][e] + table2[0][e] + table2[1][e]*I;
    }

    /*
     * Per-layer quantities.  Nothing below depends on the pixel position,
     * so the material lookup and the structure factors are evaluated once
     * per layer instead of once per pixel.
     */
    double lat_const[N];    /* out-of-plane lattice constant, metres */
    double unit_cells[N];   /* number of unit cells in the layer */
    double complex layer_g[N];
    double complex layer_g0[N];
    double layer_phi[N];

    const double mm = 4.0;
    const double l = 4;
    const double complex e_l = cexp(I*PI*l);
    const double complex e_half_l = cexp(I*PI*l/2);

    for (int i = 1; i < N+1; i++) {
        const char *material = argv[i*3];
        double complex F_hkl, F_0;

        if (sscanf(argv[i*3+1], "%lf", &lat_const[i-1]) != 1 ||
            sscanf(argv[i*3+2], "%lf", &unit_cells[i-1]) != 1) {
            fprintf(stderr, "layer %d: invalid lattice constant or unit cell count\n", i);
            return EXIT_FAILURE;
        }
        lat_const[i-1] = lat_const[i-1]*1e-10; /* Angstrom -> metres */

        /* Table columns 2, 3 and 4 are paired with the atomic numbers
         * 31, 33 and 13 (presumably Ga, As and Al -- TODO confirm). */
        if (strcmp(material, "GaAs") == 0) {
            F_hkl = (2+2*e_l)*(SF_table[2]+SF_table[3]*e_half_l);
            F_0 = 0.5*8.0*(31 + table2[0][2] + table2[1][2]*I)
                + 0.5*8.0*(33 + table2[0][3] + table2[1][3]*I);
        } else if (strcmp(material, "AlGaAs") == 0) {
            F_hkl = (2+2*e_l)*((0.76*SF_table[2]+ 0.24*SF_table[4])+SF_table[3]*e_half_l);
            F_0 = 0.24*4.0*(13 + table2[0][4] + table2[1][4]*I)
                + 0.76*4.0*(31 + table2[0][2] + table2[1][2]*I)
                + 4.0*(33 + table2[0][3] + table2[1][3]*I);
        } else {
            /* Previously an unknown material left g and g_0 uninitialized. */
            fprintf(stderr, "layer %d: unsupported material: %s\n", i, material);
            return EXIT_FAILURE;
        }

        /* NOTE(review): the original wrote cos(2*angle[m][n]) although angle
         * is a scalar; the scalar is used here.  If a per-pixel angle is
         * intended, compute g inside the pixel fill loop below instead. */
        layer_g[i-1] = 2*r_0*F_hkl/mm/lat_const[i-1]*cos(2*angle);
        layer_g0[i-1] = 2*r_0*F_0/mm/lat_const[i-1];
        layer_phi[i-1] = 2*PI*lat_const[i-1]*sin(angle)/lam;
    }

    /* Per-pixel inputs of the recurrence, one GRID x GRID plane per layer.
     * sizeof *ptr keeps the size arithmetic in size_t. */
    double complex (*g)[GRID][GRID] = malloc((size_t)N * sizeof *g);
    double complex (*g_0)[GRID][GRID] = malloc((size_t)N * sizeof *g_0);
    double (*phi)[GRID][GRID] = malloc((size_t)N * sizeof *phi);
    /* calloc zero-fills, which replaces the explicit Rplus1 = 0.0 loops. */
    double complex (*Rplus1)[GRID] = calloc(GRID, sizeof *Rplus1);

    if (g == NULL || g_0 == NULL || phi == NULL || Rplus1 == NULL) {
        fprintf(stderr, "out of memory\n");
        free(g);
        free(g_0);
        free(phi);
        free(Rplus1);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < N; i++) {
        for (int m = 0; m < GRID; m++) {
            for (int n = 0; n < GRID; n++) {
                g[i][m][n] = layer_g[i];
                g_0[i][m][n] = layer_g0[i];
                phi[i][m][n] = layer_phi[i];
            }
        }
    }

    double stop1 = omp_get_wtime();

    /*
     * Recurrence over the unit cells of each layer.  Every pixel evolves
     * independently of the others, so the unit-cell loop is innermost: each
     * pixel still sees the same sequence of updates (layers in order, cells
     * in order), but its inputs are loaded and its cell-invariant terms are
     * computed once per layer rather than once per unit cell.
     *
     * As in the original loop, the recurrence starts at layer index 1.
     * NOTE(review): layer 0 is parsed but never enters the recurrence --
     * confirm this is intended.
     *
     * Rplus1 starts at 0, so the very first division by it relies on the
     * implementation's handling of complex division by zero; do not build
     * with -ffast-math or similar without revisiting this.
     */
    for (int i = 1; i < N; i++) {
        const double cells = unit_cells[i];

        for (int m = 0; m < GRID; m++) {
            for (int n = 0; n < GRID; n++) {
                const double complex neg_ig = -I*g[i][m][n];
                const double complex ig = I*g[i][m][n];
                const double complex numer = (1-I*g_0[i][m][n])*(1-I*g_0[i][m][n]);
                const double complex phase = cos(-2*phi[i][m][n])+I*sin(-2*phi[i][m][n]);
                double complex r = Rplus1[m][n];

                for (int j = 0; j < cells; j++) {
                    r = neg_ig + numer/(ig + phase/r);
                }
                Rplus1[m][n] = r;
            }
        }
    }

    double stop2 = omp_get_wtime();

    double elapsed1 = (double)(stop1 - start1); /* setup only */
    double elapsed2 = (double)(stop2 - start1); /* setup + recurrence */
    printf("main() through before diffraction function took %f seconds to run\n\n",elapsed1);
    printf("main() through after diffraction function took %f seconds to run\n\n",elapsed2);

    free(g);
    free(g_0);
    free(phi);
    free(Rplus1);
    return EXIT_SUCCESS;
}

#包括“time.h”
#包括“stdio.h”
#包括“stdlib.h”
#包括“complex.h”
#包括“math.h”
#包括“string.h”
#包括“unistd.h”
#包括“omp.h”
#定义PI 3.14159265
int main（int argc，char*argv[]）{
如果（argc>=8）{
double start1=omp_get_wtime（）；
//命令行参数按以下顺序对齐：[θ][超晶格层数][材料层1][横向常数层1][单元单元单元数层1][材料层2][横向常数层2][单元数层2][材料层][横向常数层][单元单元数层][对数/线性][重复超晶格层数][是/否]
int N；
sscanf（argv[2]，“%d”，&N）；//由第二个输入参数指定的超晶格层数
如果（strcmp（argv[argc-1]，“yes”）==0）//如果包括基板，则向N变量再添加一层
{
N=N+1；
}
整数合计；
sscanf（argv[argc-2]、“%d”、&total）；//由倒数第二个参数指定的重复超晶格层数
双层[N][6]，水平角[1001]，垂直角[1001]；
双复数（*F_hkl）[1001][1001]=malloc（N*1001*1001*sizeof（复数双复数）），（*F_0）[1001][1001]=malloc（N*1001*1001*sizeof（复数双复数双复数）），（*g[1001]；//此数组将保存为波束谱中每个波向量选择的所有材料的单位胞结构因子
双实数，real2，lam，c_light=299792458，h_pl=4.135667516e-15，E=10e3，r_0=2.818e-15，Lccd=1.013；//通过计算和常数、光速、普朗克常数、光子能量和探测器到样本的距离来保存值的几个变量
双角度；
双复数z；//只是一个在整个计算过程中保存复数的变量
int i，j，m，n，t；//通过数组索引的整数
lam=（高亮度*低亮度）/E；
sscanf（argv[1]，“%lf”，&angle）；//第一个参数是入射角，请读取
角度=角度*（PI/180.0）；
角度2=-角度；
double（*table）[10]=malloc（10*9*sizeof（double））；//此数组包含用于计算以下原子散射因子的所有系数
double（*表2）[10]=malloc（10*2*sizeof（double））；
FILE*datfile1=fopen（“/home/vhosts/xraydev.engr.wisc.edu/data/coef_table.bin”，“rb”）；//读取包含所有系数的二进制文件
fread（表，尺寸（双），90，数据文件1）；
fclose（datfile1）；
文件*datfile2=fopen（“/home/vhosts/xraydev.engr.wisc.edu/data/dispersions.bin”，“rb”）；
fread（表2，尺寸（双），20，数据文件2）；
fclose（datfile2）；
//计算所有元素的散射因子
双a，b；
双k_z=（sin（角度）/lam）*1e-10；//包含SF的角度依赖性，但由于近似，忽略了0.24度发散
对于（i=0；我认为您需要问一个比“请帮助”更具体的问题。如果您将malloc更改为calloc，您可以摆脱将数组设置为零的两个for循环。我不确定这将使您获得多大的速度。您是否使用-O3开关编译？您好，wdudzik，我认为calloc将数组设置为零，malloc只是分配内存？所以，这不是反过来吗？我想你需要问一个比“请帮助”更具体的问题。如果你将malloc改为calloc，你可以去掉两个将数组设置为零的for循环。我不确定这会给你带来多大的速度。你是用-O3开关编译的吗？您好，wdudzik，我想calloc会将数组设置为零和malloc只是分配内存？所以不是相反吗？