快速及;C语言中的高效最小二乘拟合算法?

快速及;C语言中的高效最小二乘拟合算法?,c,algorithm,least-squares,C,Algorithm,Least Squares,我试图在两个数据数组上实现线性最小二乘拟合:时间与振幅。到目前为止,我知道的唯一技术是测试(y=m*x+b)中所有可能的m和b点,然后找出最适合我的数据的组合,以便误差最小。然而,我认为迭代这么多的组合有时是无用的,因为它测试了所有的东西。有什么我不知道的加速过程的技术吗?谢谢 有有效的最小二乘拟合算法;有关详细信息,请参阅。还有一些库可以为您实现算法,可能比简单的实现更有效;这是一个例子,但也有其他更宽松的许可证。请参阅本手册的第1节。本节将2D线性回归表示为矩阵乘法练习。只要您的数据表现良好

我试图在两个数据数组上实现线性最小二乘拟合:时间与振幅。到目前为止,我知道的唯一技术是测试(y=m*x+b)中所有可能的m和b点,然后找出最适合我的数据的组合,以便误差最小。然而,我认为迭代这么多的组合有时是无用的,因为它测试了所有的东西。有什么我不知道的加速过程的技术吗?谢谢

有有效的最小二乘拟合算法;有关详细信息,请参阅。还有一些库可以为您实现算法,可能比简单的实现更有效;这是一个例子,但也有其他更宽松的许可证。

请参阅本手册的第1节。本节将2D线性回归表示为矩阵乘法练习。只要您的数据表现良好,这种技术应该允许您快速开发最小二乘拟合


根据数据的大小,可以通过代数方法将矩阵乘法简化为一组简单的方程,从而避免编写matmult()函数。(请预先警告,对于超过4或5个数据点,这是完全不切实际的!)

尝试此代码。它将
y=mx+b
与您的(x,y)数据相匹配

linreg
的参数为

linreg(int n, REAL x[], REAL y[], REAL* b, REAL* m, REAL* r)

n = number of data points
x,y  = arrays of data
*b = output intercept
*m  = output slope
*r = output correlation coefficient (can be NULL if you don't want it)
成功时返回值为0=失败时为0

这是密码

#include "linreg.h"
#include <stdlib.h>
#include <math.h>                           /* math functions */

//#define REAL float
#define REAL double


inline static REAL sqr(REAL x) {
    return x*x;
}


int linreg(int n, const REAL x[], const REAL y[], REAL* m, REAL* b, REAL* r){
    REAL   sumx = 0.0;                      /* sum of x     */
    REAL   sumx2 = 0.0;                     /* sum of x**2  */
    REAL   sumxy = 0.0;                     /* sum of x * y */
    REAL   sumy = 0.0;                      /* sum of y     */
    REAL   sumy2 = 0.0;                     /* sum of y**2  */

    for (int i=0;i<n;i++){ 
        sumx  += x[i];       
        sumx2 += sqr(x[i]);  
        sumxy += x[i] * y[i];
        sumy  += y[i];      
        sumy2 += sqr(y[i]); 
    } 

    REAL denom = (n * sumx2 - sqr(sumx));
    if (denom == 0) {
        // singular matrix. can't solve the problem.
        *m = 0;
        *b = 0;
        if (r) *r = 0;
            return 1;
    }

    *m = (n * sumxy  -  sumx * sumy) / denom;
    *b = (sumy * sumx2  -  sumx * sumxy) / denom;
    if (r!=NULL) {
        *r = (sumxy - sumx * sumy / n) /    /* compute correlation coeff */
              sqrt((sumx2 - sqr(sumx)/n) *
              (sumy2 - sqr(sumy)/n));
    }

    return 0; 
}
这是输出 这是Excel绘图和线性拟合(用于验证)。 所有值都与上面的C代码完全一致(注意C代码返回
r
,而Excel返回
r**2

来自《数字配方:科学计算的艺术》(15.2)将数据拟合成直线:

线性回归: 考虑将一组N个数据点(xi,yi)拟合到直线模型的问题:

假定不确定性:与每个彝相关的SigMi和席氏(因变量的值)是精确已知的。为了测量模型与数据的吻合程度,我们使用卡方函数,在这种情况下,卡方函数为:

将上述方程式最小化,以确定a和b。这是通过找到上述方程对a和b的导数,将它们等于零,然后求解a和b。然后我们估计a和b估计中可能的不确定性,因为数据中的测量误差显然会在确定这些参数时引入一些不确定性。此外,我们还必须估计数据的拟合优度 模型没有这一估计,我们丝毫没有迹象表明模型中的参数a和b有任何意义

以下结构执行上述计算:

struct Fitab {
// Object for fitting a straight line y = a + b*x to a set of 
// points (xi, yi), with or without available
// errors sigma i . Call one of the two constructors to calculate the fit.
// The answers are then available as the variables:
// a, b, siga, sigb, chi2, and either q or sigdat.
int ndata;
double a, b, siga, sigb, chi2, q, sigdat; // Answers.
vector<double> &x, &y, &sig;
// Constructor. 
Fitab(vector<double> &xx, vector<double> &yy, vector<double> &ssig)
    : ndata(xx.size()), x(xx), y(yy), sig(ssig), chi2(0.), q(1.), sigdat(0.) 
  {
    // Given a set of data points x[0..ndata-1], y[0..ndata-1] 
    // with individual standard deviations sig[0..ndata-1], 
    // sets a,b and their respective probable uncertainties
    // siga and sigb, the chi-square: chi2, and the goodness-of-fit
    //  probability: q  
    Gamma gam;
    int i;
    double ss=0., sx=0., sy=0., st2=0., t, wt, sxoss; b=0.0; 

    for (i=0;i < ndata; i++) { // Accumulate sums ...
        wt = 1.0 / SQR(sig[i]); //...with weights
        ss += wt;
        sx += x[i]*wt;
        sy += y[i]*wt;
    }
    sxoss = sx/ss;

    for (i=0; i < ndata; i++) {
        t = (x[i]-sxoss) / sig[i];
        st2 += t*t;
        b += t*y[i]/sig[i];
    }
    b /= st2; // Solve for a, b, sigma-a, and simga-b.
    a = (sy-sx*b) / ss;
    siga = sqrt((1.0+sx*sx/(ss*st2))/ss);
    sigb = sqrt(1.0/st2); // Calculate chi2.
    for (i=0;i<ndata;i++) chi2 += SQR((y[i]-a-b*x[i])/sig[i]);

    if (ndata>2) q=gam.gammq(0.5*(ndata-2),0.5*chi2); // goodness of fit
  }
// Constructor. 
Fitab(vector<double> &xx, vector<double> &yy)
    : ndata(xx.size()), x(xx), y(yy), sig(xx), chi2(0.), q(1.), sigdat(0.) 
  {
    // As above, but without known errors (sig is not used). 
    // The uncertainties siga and sigb are estimated by assuming
    // equal errors for all points, and that a straight line is
    // a good fit. q is returned as 1.0, the normalization of chi2
    // is to unit standard deviation on all points, and sigdat
    // is set to the estimated error of each point.
    int i;
    double ss,sx=0.,sy=0.,st2=0.,t,sxoss;
    b=0.0; // Accumulate sums ...
    for (i=0; i < ndata; i++) {
        sx += x[i]; // ...without weights.
        sy += y[i];
    }
    ss = ndata;
    sxoss = sx/ss;
    for (i=0;i < ndata; i++) {
        t = x[i]-sxoss;
        st2 += t*t;
        b += t*y[i];
    }
    b /= st2;  // Solve for a, b, sigma-a, and sigma-b.
    a = (sy-sx*b)/ss;
    siga=sqrt((1.0+sx*sx/(ss*st2))/ss);
    sigb=sqrt(1.0/st2); // Calculate chi2.
    for (i=0;i<ndata;i++) chi2 += SQR(y[i]-a-b*x[i]);

    if (ndata > 2) sigdat=sqrt(chi2/(ndata-2)); 
    // For unweighted data evaluate typical
    // sig using chi2, and adjust
    // the standard deviations.
    siga *= sigdat;
    sigb *= sigdat;
  }
};  
stuct gaule18

struct Gauleg18 {
// Abscissas and weights for Gauss-Legendre quadrature.
   static const Int ngau = 18;
   static const Doub y[18];
   static const Doub w[18];
};

const Doub Gauleg18::y[18] = {0.0021695375159141994,
   0.011413521097787704,0.027972308950302116,0.051727015600492421,
   0.082502225484340941, 0.12007019910960293,0.16415283300752470,
   0.21442376986779355, 0.27051082840644336, 0.33199876341447887,
   0.39843234186401943, 0.46931971407375483, 0.54413605556657973,
   0.62232745288031077, 0.70331500465597174, 0.78649910768313447,
   0.87126389619061517, 0.95698180152629142};

const Doub Gauleg18::w[18] = {0.0055657196642445571,
   0.012915947284065419,0.020181515297735382,0.027298621498568734,
   0.034213810770299537,0.040875750923643261,0.047235083490265582,
   0.053244713977759692,0.058860144245324798,0.064039797355015485
   0.068745323835736408,0.072941885005653087,0.076598410645870640,
   0.079687828912071670,0.082187266704339706,0.084078218979661945,
   0.085346685739338721,0.085983275670394821};
最后是fuinction
Gamma::invgamp()

double Gamma::invgammap(double p,double a){
//对于介于0和1之间的参数P,返回x,使得P(a,x)=P。
int j;
双x,err,t,u,pp,lna1,afac,a1=a-1;
const double EPS=1.e-8;//精度是EPS的平方。
gln=gammln(a);
如果(a=1.)返回最大值(100.,a+100.*sqrt(a));
如果(第1页){
lna1=对数(a1);
afac=exp(a1*(lna1-1.)-gln);
pp=(p<0.5)?p:1.-p;
t=sqrt(-2.*log(pp));
x=(2.30753+t*0.27061)/(1.+t*(0.99229+t*0.04481))-t;
如果(p<0.5)x=-x;
x=MAX(1.e-3,a*pow(1.-1./(9.*a)-x/(3.*sqrt(a)),3));
}否则{
t=1.0-a*(0.253+a*0.12);和(6.2.9)。
如果(p对于(j=0;j上面的原始示例在斜率和偏移量方面对我来说效果很好,但我很难使用corr coef。也许我的括号的工作方式与假定的优先级不一样?无论如何,在其他网页的帮助下,我最终得到了与Excel中的线性趋势线匹配的值。我想使用Mark Lak共享我的代码ata的变量名。希望这有帮助

double slope = ((n * sumxy) - (sumx * sumy )) / denom;
double intercept = ((sumy * sumx2) - (sumx * sumxy)) / denom;
double term1 = ((n * sumxy) - (sumx * sumy));
double term2 = ((n * sumx2) - (sumx * sumx));
double term3 = ((n * sumy2) - (sumy * sumy));
double term23 = (term2 * term3);
double r2 = 1.0;
if (fabs(term23) > MIN_DOUBLE)  // Define MIN_DOUBLE somewhere as 1e-9 or similar
    r2 = (term1 * term1) / term23;

据我所知,求解最小二乘法最快、最有效的方法是从参数向量中减去(梯度)/(二阶梯度)(二阶梯度=即海森曲线的对角线)

以下是直觉:

假设你想在单个参数上优化最小二乘法。这相当于找到抛物线的顶点。然后,对于任何随机初始参数x0,损失函数的顶点位于x0-f(1)/f(2)。这是因为将-f(1)/f(2)与x相加将始终使导数f(1)为零


旁注:在Tensorflow中实现这一点,解决方案出现在w0-f(1)/f(2)/(权重数),但我不确定这是由于Tensorflow还是其他原因。

作为一项任务,我必须用C编写一个使用RMSE损失函数的简单线性回归。该程序是动态的,您可以输入自己的值并选择自己的损失函数,目前仅限于均方根误差。但首先是算法我使用的三卤甲烷:

现在代码…您需要gnuplot来显示图表,sudo apt安装gnuplot

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>

#define BUFFSIZE 64
#define MAXSIZE 100

static double vector_x[MAXSIZE] = {0};
static double vector_y[MAXSIZE] = {0};
static double vector_predict[MAXSIZE] = {0};

static double max_x;
static double max_y;
static double mean_x;
static double mean_y;
static double teta_0_intercept;
static double teta_1_grad;
static double RMSE;
static double r_square;
static double prediction;

static char intercept[BUFFSIZE];
static char grad[BUFFSIZE];
static char xrange[BUFFSIZE];
static char yrange[BUFFSIZE];
static char lossname_RMSE[BUFFSIZE] = "Simple Linear Regression using RMSE'";

static char cmd_gnu_0[BUFFSIZE] = "set title '";
static char cmd_gnu_1[BUFFSIZE] = "intercept = ";
static char cmd_gnu_2[BUFFSIZE] = "grad = ";
static char cmd_gnu_3[BUFFSIZE] = "set xrange [0:";
static char cmd_gnu_4[BUFFSIZE] = "set yrange [0:";
static char cmd_gnu_5[BUFFSIZE] = "f(x) = (grad * x) + intercept";
static char cmd_gnu_6[BUFFSIZE] = "plot f(x), 'data.temp' with points pointtype 7";

static char const *commands_gnuplot[] = {
    cmd_gnu_0,
    cmd_gnu_1,
    cmd_gnu_2,
    cmd_gnu_3,
    cmd_gnu_4,
    cmd_gnu_5,
    cmd_gnu_6,
};

static size_t size;

static void user_input()
{
    printf("Enter x,y vector size, MAX = 100\n");
    scanf("%lu", &size);
    if (size > MAXSIZE) {
        printf("Wrong input size is too big\n");
        user_input();
    }
    printf("vector's size is %lu\n", size);

    size_t i;
    for (i = 0; i < size; i++) {
        printf("Enter vector_x[%ld] values\n", i);
        scanf("%lf", &vector_x[i]);
    }

    for (i = 0; i < size; i++) {
        printf("Enter vector_y[%ld] values\n", i);
        scanf("%lf", &vector_y[i]);
    }
}

static void display_vector()
{
    size_t i;
    for (i = 0; i < size; i++){
        printf("vector_x[%lu] = %lf\t", i, vector_x[i]);
        printf("vector_y[%lu] = %lf\n", i, vector_y[i]);
    }
}

static void concatenate(char p[], char q[]) {
   int c;
   int d;
   c = 0;

   while (p[c] != '\0') {
      c++;
   }
   d = 0;

   while (q[d] != '\0') {
      p[c] = q[d];
      d++;
      c++;
   }
   p[c] = '\0';
}

static void compute_mean_x_y()
{
    size_t i;
    double tmp_x = 0.0;
    double tmp_y = 0.0;
    for (i = 0; i < size; i++) {
        tmp_x += vector_x[i];
        tmp_y += vector_y[i];
    }

    mean_x = tmp_x / size;
    mean_y = tmp_y / size;

    printf("mean_x = %lf\n", mean_x);
    printf("mean_y = %lf\n", mean_y);
}

static void compute_teta_1_grad()
{
    double numerator = 0.0;
    double denominator = 0.0;
    double tmp1 = 0.0;
    double tmp2 = 0.0;
    size_t i;

    for (i = 0; i < size; i++) {
        numerator += (vector_x[i] - mean_x) * (vector_y[i] - mean_y);
    }

    for (i = 0; i < size; i++) {
        tmp1 = vector_x[i] - mean_x;
        tmp2 = tmp1 * tmp1;
        denominator += tmp2;
    }

    teta_1_grad = numerator / denominator;
    printf("teta_1_grad = %lf\n", teta_1_grad);
}

static void compute_teta_0_intercept()
{
    teta_0_intercept = mean_y - (teta_1_grad * mean_x);
    printf("teta_0_intercept = %lf\n", teta_0_intercept);
}

static void compute_prediction()
{
    size_t i;
    for (i = 0; i < size; i++) {
        vector_predict[i] = teta_0_intercept + (teta_1_grad * vector_x[i]);
        printf("y^[%ld] = %lf\n", i, vector_predict[i]);
    }
    printf("\n");
}

static void compute_RMSE()
{
    compute_prediction();
    double error = 0;
    size_t i;
    for (i = 0; i < size; i++) {
        error = (vector_predict[i] - vector_y[i]) * (vector_predict[i] - vector_y[i]);
        printf("error y^[%ld] =  %lf\n", i, error);
        RMSE += error;
    }
    /* mean */
    RMSE = RMSE / size;
    /* square root mean */
    RMSE = sqrt(RMSE);
    printf("\nRMSE = %lf\n", RMSE);
}

static void compute_loss_function()
{
    int input = 0;
    printf("Which loss function do you want to use?\n");
    printf(" 1 - RMSE\n");
    scanf("%d", &input);
    switch(input) {
        case 1:
            concatenate(cmd_gnu_0, lossname_RMSE);
            compute_RMSE();
            printf("\n");
            break;
        default:
            printf("Wrong input try again\n");
            compute_loss_function(size);
    }
}

static void compute_r_square(size_t size)
{
    double num_err = 0.0;
    double den_err = 0.0;
    size_t i;

    for (i = 0; i < size; i++) {
        num_err += (vector_y[i] - vector_predict[i]) * (vector_y[i] - vector_predict[i]);
        den_err += (vector_y[i] - mean_y) * (vector_y[i] - mean_y);
    }
    r_square = 1 - (num_err/den_err);
    printf("R_square = %lf\n", r_square);
}

static void compute_predict_for_x()
{
    double x = 0.0;
    printf("Please enter x value\n");
    scanf("%lf", &x);
    prediction = teta_0_intercept + (teta_1_grad * x);
    printf("y^ if x = %lf -> %lf\n",x, prediction);
}

static void compute_max_x_y()
{
    size_t i;
    double tmp1= 0.0;
    double tmp2= 0.0;

    for (i = 0; i < size; i++) {
        if (vector_x[i] > tmp1) {
            tmp1 = vector_x[i];
            max_x = vector_x[i];

        }
        if (vector_y[i] > tmp2) {
            tmp2 = vector_y[i];
            max_y = vector_y[i];
        }
    }
    printf("vector_x max value %lf\n", max_x);
    printf("vector_y max value %lf\n", max_y);
}

static void display_model_line()
{
    sprintf(intercept, "%0.7lf", teta_0_intercept);
    sprintf(grad, "%0.7lf", teta_1_grad);
    sprintf(xrange, "%0.7lf", max_x + 1);
    sprintf(yrange, "%0.7lf", max_y + 1);

    concatenate(cmd_gnu_1, intercept);
    concatenate(cmd_gnu_2, grad);
    concatenate(cmd_gnu_3, xrange);
    concatenate(cmd_gnu_3, "]");
    concatenate(cmd_gnu_4, yrange);
    concatenate(cmd_gnu_4, "]");

    printf("grad = %s\n", grad);
    printf("intercept = %s\n", intercept);
    printf("xrange = %s\n", xrange);
    printf("yrange = %s\n", yrange);

    printf("cmd_gnu_0: %s\n", cmd_gnu_0);
    printf("cmd_gnu_1: %s\n", cmd_gnu_1);
    printf("cmd_gnu_2: %s\n", cmd_gnu_2);
    printf("cmd_gnu_3: %s\n", cmd_gnu_3);
    printf("cmd_gnu_4: %s\n", cmd_gnu_4);
    printf("cmd_gnu_5: %s\n", cmd_gnu_5);
    printf("cmd_gnu_6: %s\n", cmd_gnu_6);

    /* print plot */
    FILE *gnuplot_pipe = (FILE*)popen("gnuplot -persistent", "w");
    FILE *temp = (FILE*)fopen("data.temp", "w");

    /* create data.temp */
    size_t i;
    for (i = 0; i < size; i++)
    {
        fprintf(temp, "%f %f \n", vector_x[i], vector_y[i]);
    }

    /* display gnuplot */
    for (i = 0; i < 7; i++)
    {
        fprintf(gnuplot_pipe, "%s \n", commands_gnuplot[i]);
    }
}

int main(void)
{
    printf("===========================================\n");
    printf("INPUT DATA\n");
    printf("===========================================\n");
    user_input();
    display_vector();
    printf("\n");

    printf("===========================================\n");
    printf("COMPUTE MEAN X:Y, TETA_1 TETA_0\n");
    printf("===========================================\n");
    compute_mean_x_y();
    compute_max_x_y();
    compute_teta_1_grad();
    compute_teta_0_intercept();
    printf("\n");

    printf("===========================================\n");
    printf("COMPUTE LOSS FUNCTION\n");
    printf("===========================================\n");
    compute_loss_function();

    printf("===========================================\n");
    printf("COMPUTE R_square\n");
    printf("===========================================\n");
    compute_r_square(size);
    printf("\n");

    printf("===========================================\n");
    printf("COMPUTE y^ according to x\n");
    printf("===========================================\n");
    compute_predict_for_x();
    printf("\n");

    printf("===========================================\n");
    printf("DISPLAY LINEAR REGRESSION\n");
    printf("===========================================\n");
    display_model_line();
    printf("\n");

    return 0;
}
#包括
#包括
#包括
#包括
#包括
#定义大小64
#定义最大尺寸100
静态双向量_x[MAXSIZE]={0};
静态双向量_y[MAXSIZE]={0};
静态双向量_predict[MAXSIZE]={0};
静态双最大值x;
静态双最大值;
静态双平均值;
静态双平均;
静态双teta_0_截距;
静态双teta_1_梯度;
静态双RMSE;
静态双r_平方;
静态双重预测;
静态字符截距[BUFFSIZE];
静态字符梯度[BUFFSIZE];
静态字符xrange[BUFFSIZE];
静态字符范围[BUFFSIZE];
静态char lossname_RMSE[BUFFSIZE]=“使用RMSE的简单线性回归”;
静态字符cmd_gnu_0[BUFFSIZE]=“设置标题”;
静态字符cmd_gnu_1
struct Gauleg18 {
// Abscissas and weights for Gauss-Legendre quadrature.
   static const Int ngau = 18;
   static const Doub y[18];
   static const Doub w[18];
};

const Doub Gauleg18::y[18] = {0.0021695375159141994,
   0.011413521097787704,0.027972308950302116,0.051727015600492421,
   0.082502225484340941, 0.12007019910960293,0.16415283300752470,
   0.21442376986779355, 0.27051082840644336, 0.33199876341447887,
   0.39843234186401943, 0.46931971407375483, 0.54413605556657973,
   0.62232745288031077, 0.70331500465597174, 0.78649910768313447,
   0.87126389619061517, 0.95698180152629142};

const Doub Gauleg18::w[18] = {0.0055657196642445571,
   0.012915947284065419,0.020181515297735382,0.027298621498568734,
   0.034213810770299537,0.040875750923643261,0.047235083490265582,
   0.053244713977759692,0.058860144245324798,0.064039797355015485
   0.068745323835736408,0.072941885005653087,0.076598410645870640,
   0.079687828912071670,0.082187266704339706,0.084078218979661945,
   0.085346685739338721,0.085983275670394821};
double Gamma::invgammp(double p, double a) { 
    // Returns x such that P(a,x) =  p for an argument p between 0 and 1.
    int j;
    double x,err,t,u,pp,lna1,afac,a1=a-1;
    const double EPS=1.e-8; // Accuracy is the square of EPS.
    gln=gammln(a);
    if (a <= 0.) throw("a must be pos in invgammap");
    if (p >= 1.) return MAX(100.,a + 100.*sqrt(a));
    if (p <= 0.) return 0.0;
    if (a > 1.) {   
        lna1=log(a1);
        afac = exp(a1*(lna1-1.)-gln);
        pp = (p < 0.5)? p : 1. - p;
        t = sqrt(-2.*log(pp));
        x = (2.30753+t*0.27061)/(1.+t*(0.99229+t*0.04481)) - t;
        if (p < 0.5) x = -x;
        x = MAX(1.e-3,a*pow(1.-1./(9.*a)-x/(3.*sqrt(a)),3));
   } else {  
        t = 1.0 - a*(0.253+a*0.12); and (6.2.9).
        if (p < t) x = pow(p/t,1./a);
        else x = 1.-log(1.-(p-t)/(1.-t));
   }
   for (j=0;j<12;j++) {
        if (x <= 0.0) return 0.0; // x too small to compute accurately.
        err = gammp(a,x) - p;
        if (a > 1.) t = afac*exp(-(x-a1)+a1*(log(x)-lna1));
        else t = exp(-x+a1*log(x)-gln);
        u = err/t;
        // Halley’s method.
        x -= (t = u/(1.-0.5*MIN(1.,u*((a-1.)/x - 1)))); 
        // Halve old value if x tries to go negative.
        if (x <= 0.) x = 0.5*(x + t); 
        if (fabs(t) < EPS*x ) break;
    }
    return x;
}
double slope = ((n * sumxy) - (sumx * sumy )) / denom;
double intercept = ((sumy * sumx2) - (sumx * sumxy)) / denom;
double term1 = ((n * sumxy) - (sumx * sumy));
double term2 = ((n * sumx2) - (sumx * sumx));
double term3 = ((n * sumy2) - (sumy * sumy));
double term23 = (term2 * term3);
double r2 = 1.0;
if (fabs(term23) > MIN_DOUBLE)  // Define MIN_DOUBLE somewhere as 1e-9 or similar
    r2 = (term1 * term1) / term23;
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>

#define BUFFSIZE 64
#define MAXSIZE 100

static double vector_x[MAXSIZE] = {0};
static double vector_y[MAXSIZE] = {0};
static double vector_predict[MAXSIZE] = {0};

static double max_x;
static double max_y;
static double mean_x;
static double mean_y;
static double teta_0_intercept;
static double teta_1_grad;
static double RMSE;
static double r_square;
static double prediction;

static char intercept[BUFFSIZE];
static char grad[BUFFSIZE];
static char xrange[BUFFSIZE];
static char yrange[BUFFSIZE];
static char lossname_RMSE[BUFFSIZE] = "Simple Linear Regression using RMSE'";

static char cmd_gnu_0[BUFFSIZE] = "set title '";
static char cmd_gnu_1[BUFFSIZE] = "intercept = ";
static char cmd_gnu_2[BUFFSIZE] = "grad = ";
static char cmd_gnu_3[BUFFSIZE] = "set xrange [0:";
static char cmd_gnu_4[BUFFSIZE] = "set yrange [0:";
static char cmd_gnu_5[BUFFSIZE] = "f(x) = (grad * x) + intercept";
static char cmd_gnu_6[BUFFSIZE] = "plot f(x), 'data.temp' with points pointtype 7";

static char const *commands_gnuplot[] = {
    cmd_gnu_0,
    cmd_gnu_1,
    cmd_gnu_2,
    cmd_gnu_3,
    cmd_gnu_4,
    cmd_gnu_5,
    cmd_gnu_6,
};

static size_t size;

static void user_input()
{
    printf("Enter x,y vector size, MAX = 100\n");
    scanf("%lu", &size);
    if (size > MAXSIZE) {
        printf("Wrong input size is too big\n");
        user_input();
    }
    printf("vector's size is %lu\n", size);

    size_t i;
    for (i = 0; i < size; i++) {
        printf("Enter vector_x[%ld] values\n", i);
        scanf("%lf", &vector_x[i]);
    }

    for (i = 0; i < size; i++) {
        printf("Enter vector_y[%ld] values\n", i);
        scanf("%lf", &vector_y[i]);
    }
}

static void display_vector()
{
    size_t i;
    for (i = 0; i < size; i++){
        printf("vector_x[%lu] = %lf\t", i, vector_x[i]);
        printf("vector_y[%lu] = %lf\n", i, vector_y[i]);
    }
}

static void concatenate(char p[], char q[]) {
   int c;
   int d;
   c = 0;

   while (p[c] != '\0') {
      c++;
   }
   d = 0;

   while (q[d] != '\0') {
      p[c] = q[d];
      d++;
      c++;
   }
   p[c] = '\0';
}

static void compute_mean_x_y()
{
    size_t i;
    double tmp_x = 0.0;
    double tmp_y = 0.0;
    for (i = 0; i < size; i++) {
        tmp_x += vector_x[i];
        tmp_y += vector_y[i];
    }

    mean_x = tmp_x / size;
    mean_y = tmp_y / size;

    printf("mean_x = %lf\n", mean_x);
    printf("mean_y = %lf\n", mean_y);
}

static void compute_teta_1_grad()
{
    double numerator = 0.0;
    double denominator = 0.0;
    double tmp1 = 0.0;
    double tmp2 = 0.0;
    size_t i;

    for (i = 0; i < size; i++) {
        numerator += (vector_x[i] - mean_x) * (vector_y[i] - mean_y);
    }

    for (i = 0; i < size; i++) {
        tmp1 = vector_x[i] - mean_x;
        tmp2 = tmp1 * tmp1;
        denominator += tmp2;
    }

    teta_1_grad = numerator / denominator;
    printf("teta_1_grad = %lf\n", teta_1_grad);
}

static void compute_teta_0_intercept()
{
    teta_0_intercept = mean_y - (teta_1_grad * mean_x);
    printf("teta_0_intercept = %lf\n", teta_0_intercept);
}

static void compute_prediction()
{
    size_t i;
    for (i = 0; i < size; i++) {
        vector_predict[i] = teta_0_intercept + (teta_1_grad * vector_x[i]);
        printf("y^[%ld] = %lf\n", i, vector_predict[i]);
    }
    printf("\n");
}

static void compute_RMSE()
{
    compute_prediction();
    double error = 0;
    size_t i;
    for (i = 0; i < size; i++) {
        error = (vector_predict[i] - vector_y[i]) * (vector_predict[i] - vector_y[i]);
        printf("error y^[%ld] =  %lf\n", i, error);
        RMSE += error;
    }
    /* mean */
    RMSE = RMSE / size;
    /* square root mean */
    RMSE = sqrt(RMSE);
    printf("\nRMSE = %lf\n", RMSE);
}

static void compute_loss_function()
{
    int input = 0;
    printf("Which loss function do you want to use?\n");
    printf(" 1 - RMSE\n");
    scanf("%d", &input);
    switch(input) {
        case 1:
            concatenate(cmd_gnu_0, lossname_RMSE);
            compute_RMSE();
            printf("\n");
            break;
        default:
            printf("Wrong input try again\n");
            compute_loss_function(size);
    }
}

static void compute_r_square(size_t size)
{
    double num_err = 0.0;
    double den_err = 0.0;
    size_t i;

    for (i = 0; i < size; i++) {
        num_err += (vector_y[i] - vector_predict[i]) * (vector_y[i] - vector_predict[i]);
        den_err += (vector_y[i] - mean_y) * (vector_y[i] - mean_y);
    }
    r_square = 1 - (num_err/den_err);
    printf("R_square = %lf\n", r_square);
}

static void compute_predict_for_x()
{
    double x = 0.0;
    printf("Please enter x value\n");
    scanf("%lf", &x);
    prediction = teta_0_intercept + (teta_1_grad * x);
    printf("y^ if x = %lf -> %lf\n",x, prediction);
}

static void compute_max_x_y()
{
    size_t i;
    double tmp1= 0.0;
    double tmp2= 0.0;

    for (i = 0; i < size; i++) {
        if (vector_x[i] > tmp1) {
            tmp1 = vector_x[i];
            max_x = vector_x[i];

        }
        if (vector_y[i] > tmp2) {
            tmp2 = vector_y[i];
            max_y = vector_y[i];
        }
    }
    printf("vector_x max value %lf\n", max_x);
    printf("vector_y max value %lf\n", max_y);
}

static void display_model_line()
{
    sprintf(intercept, "%0.7lf", teta_0_intercept);
    sprintf(grad, "%0.7lf", teta_1_grad);
    sprintf(xrange, "%0.7lf", max_x + 1);
    sprintf(yrange, "%0.7lf", max_y + 1);

    concatenate(cmd_gnu_1, intercept);
    concatenate(cmd_gnu_2, grad);
    concatenate(cmd_gnu_3, xrange);
    concatenate(cmd_gnu_3, "]");
    concatenate(cmd_gnu_4, yrange);
    concatenate(cmd_gnu_4, "]");

    printf("grad = %s\n", grad);
    printf("intercept = %s\n", intercept);
    printf("xrange = %s\n", xrange);
    printf("yrange = %s\n", yrange);

    printf("cmd_gnu_0: %s\n", cmd_gnu_0);
    printf("cmd_gnu_1: %s\n", cmd_gnu_1);
    printf("cmd_gnu_2: %s\n", cmd_gnu_2);
    printf("cmd_gnu_3: %s\n", cmd_gnu_3);
    printf("cmd_gnu_4: %s\n", cmd_gnu_4);
    printf("cmd_gnu_5: %s\n", cmd_gnu_5);
    printf("cmd_gnu_6: %s\n", cmd_gnu_6);

    /* print plot */
    FILE *gnuplot_pipe = (FILE*)popen("gnuplot -persistent", "w");
    FILE *temp = (FILE*)fopen("data.temp", "w");

    /* create data.temp */
    size_t i;
    for (i = 0; i < size; i++)
    {
        fprintf(temp, "%f %f \n", vector_x[i], vector_y[i]);
    }

    /* display gnuplot */
    for (i = 0; i < 7; i++)
    {
        fprintf(gnuplot_pipe, "%s \n", commands_gnuplot[i]);
    }
}

int main(void)
{
    printf("===========================================\n");
    printf("INPUT DATA\n");
    printf("===========================================\n");
    user_input();
    display_vector();
    printf("\n");

    printf("===========================================\n");
    printf("COMPUTE MEAN X:Y, TETA_1 TETA_0\n");
    printf("===========================================\n");
    compute_mean_x_y();
    compute_max_x_y();
    compute_teta_1_grad();
    compute_teta_0_intercept();
    printf("\n");

    printf("===========================================\n");
    printf("COMPUTE LOSS FUNCTION\n");
    printf("===========================================\n");
    compute_loss_function();

    printf("===========================================\n");
    printf("COMPUTE R_square\n");
    printf("===========================================\n");
    compute_r_square(size);
    printf("\n");

    printf("===========================================\n");
    printf("COMPUTE y^ according to x\n");
    printf("===========================================\n");
    compute_predict_for_x();
    printf("\n");

    printf("===========================================\n");
    printf("DISPLAY LINEAR REGRESSION\n");
    printf("===========================================\n");
    display_model_line();
    printf("\n");

    return 0;
}