Warning: file_get_contents(/data/phpspider/zhask/data//catemap/4/r/83.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
C++ 我在c+中的逻辑回归实现出了什么问题+;?_C++_R_Armadillo_Logistic Regression - Fatal编程技术网

C++ 我在c+中的逻辑回归实现出了什么问题+;?

C++ 我在c+中的逻辑回归实现出了什么问题+;?,c++,r,armadillo,logistic-regression,C++,R,Armadillo,Logistic Regression,我使用犰狳线性代数库实现了一个带有IRLS算法的简单逻辑回归函数: #include <iostream> #include <string> #include <boost/math/distributions/normal.hpp> #include <boost/math/distributions/students_t.hpp> #include <armadillo> #include <cmath> using

我使用犰狳线性代数库实现了一个带有IRLS算法的简单逻辑回归函数:

#include <iostream>
#include <string>
#include <boost/math/distributions/normal.hpp>
#include <boost/math/distributions/students_t.hpp>
#include <armadillo>
#include <cmath>

using namespace boost::math;
arma::mat getW(
    arma::mat& beta,
    arma::mat& X,
    std::string family,
    std::string link
)
{
    arma::mat w;
    if(family == "poisson") {
        if(link == "identity") {
            w = arma::diagmat(1/(X * beta));
        }
    }
    else if(family == "binomial") {
        if(link == "logit") {
            arma::colvec tmp = exp(X * beta);
            w = arma::diagmat(tmp/pow(1+tmp, 2));
        }
    }
    else {
        throw 1;
    }
    return w;
}

arma::mat getz(
    arma::mat& y,
    arma::mat& beta,
    arma::mat& X,
    std::string family,
    std::string link
)
{
    arma::mat z;
    if(family == "poisson") {
        if(link == "identity") {
            z = y;
        }
    }
    else if(family == "binomial") {
        if(link=="logit") {
            arma::mat tmp = exp(X * beta);
            z = X*beta + y % (pow(1+tmp, 2)/tmp) - 1 - tmp;
        }
    }
    else {
        throw 1;
    }
    return z;
}

inline arma::mat glmMat(
    arma::mat& y,
    arma::mat& x,
    std::string family,
    std::string link
)
{

    int n = x.n_rows;
    int k = x.n_cols;

    // add a col of all ones
    arma::mat allOne(n, 1, arma::fill::ones);
    x.insert_cols(0, allOne);
    ++k;

    arma::mat res(k, 4);

    if(family=="binomial" and link=="logit")
    {
        arma::mat coef(k, 1, arma::fill::zeros);
        arma::mat W = getW(coef, x, family, link);
        arma::mat z = getz(y, coef, x, family, link);

        try {
            arma::mat J = x.t() * W * x;
            arma::colvec coef1 = arma::solve(J, x.t()*W*z);
            double coefdiff = max(abs(coef - coef1));
            while(coefdiff >= 0.00001) {
                coef = coef1;
                W = getW(coef, x, family, link);
                z = getz(y, coef, x, family, link);
                J = x.t() * W * x;
                coef1 = arma::solve(J, x.t()*W*z);
                coefdiff = max(abs(coef - coef1));
            }


            arma::mat coefVarMatrix = J.i();
            arma::colvec coefVar = coefVarMatrix.diag();
            arma::colvec coefSe = pow(coefVar, .5);
            arma::colvec zscore = coef / coefSe;
            res.col(0) = coef;
            res.col(1) = coefSe;
            res.col(2) = zscore;

            // calculate p values
            auto d = normal_distribution<>();
            for(int i=0; i<k; i++) {
                double p = 2 * (1 - cdf(d, fabs(res(i, 2))));
                if(p < 0 or p > 1) {
                    std::cerr << "Pval is abnormal from glm, dumping data to /tmp/tmpx.csv and /tmp/tmpy.csv" << std::endl;
                    x.save("/tmp/tmpx.csv", arma::csv_ascii);
                    y.save("/tmp/tmpy.csv", arma::csv_ascii);
                    throw 1;
                }
                res(i, 3) = p;
            }
        }
        catch(...) {
            std::cout << "something wrong..." << std::endl;
        }
    }
    else {
        throw 1;
    }
    return res;
}

int main(int argc, char const* argv[])
{
    {
        int nr = 5000;
        int ncx = 50;
        arma::mat x(nr, ncx, arma::fill::randu);
        arma::mat y = arma::randi<arma::mat>(nr, 1, arma::distr_param(0, 1));
        arma::mat xcol;
        arma::mat res(ncx, 4);
        for(int i=0; i<ncx; i++) {
            xcol = x(arma::span::all, i);
            res.row(i) = (glmMat(y, xcol, "binomial", "logit")).row(1);
        }
        res.print("res..........");
    }
    return 0;
}
主要功能模拟5000x50数据集,并对每个数据集执行逻辑回归,整个过程在我的笔记本电脑上大约需要23秒

在R中执行大致相同的操作,大约需要2秒:

testglm = function() {
        x = matrix(rnorm(5000*50), 5000)
        y = matrix(sample(0:1, 5000, repl=T), 5000)
        res = apply(x, 2, function(coli) summary(glm(y~coli, family=binomial))$coef[2, ])
        # print(res)
        }

system.time(testglm())
   user  system elapsed 
  2.049   0.000   2.049 

我想知道我的实现出了什么问题?

尝试使用
-O3
。我不知道R,但是它可能使用多线程,所以你应该在C++代码上做,尝试矢量化。-O3根本没有帮助。
testglm = function() {
        x = matrix(rnorm(5000*50), 5000)
        y = matrix(sample(0:1, 5000, repl=T), 5000)
        res = apply(x, 2, function(coli) summary(glm(y~coli, family=binomial))$coef[2, ])
        # print(res)
        }

system.time(testglm())
   user  system elapsed 
  2.049   0.000   2.049