Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/358.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 用scikit学习线性模型约束系数和_Python_Machine Learning_Scikit Learn_Regression_Sklearn Pandas - Fatal编程技术网

Python 用scikit-learn线性模型约束系数之和

Python 用scikit学习线性模型约束系数和,python,machine-learning,scikit-learn,regression,sklearn-pandas,Python,Machine Learning,Scikit Learn,Regression,Sklearn Pandas,我正在做一个有1000个COEF的拉索夫。Statsmodels似乎无法处理如此多的COEF。所以我使用scikit学习。允许.fit_约束的Statsmodel(“coef1+coef2…=1”)。这将coefs之和限制为=1。我需要在Scikit中执行此操作。我还将截距保持在零 from sklearn.linear_model import LassoCV LassoCVmodel = LassoCV(fit_intercept=False) LassoCVmodel.fit(x,y)

我正在做一个有1000个系数的LassoCV。Statsmodels似乎无法处理这么多系数，所以我改用scikit-learn。Statsmodels允许使用.fit_constrained("coef1 + coef2...=1")，这会将系数之和约束为1。我需要在scikit-learn中实现同样的约束。我还将截距保持为零。

from sklearn.linear_model import LassoCV

# Cross-validated lasso with the intercept pinned to zero; fit() hands back
# the fitted estimator, so construction and fitting are chained.
LassoCVmodel = LassoCV(fit_intercept=False).fit(x, y)

任何帮助都将不胜感激

如评论中所述：文档和源代码都没有表明sklearn支持此功能

我只是尝试了使用现成的凸优化解算器。这只是一个简单的原型方法,可能不适合您(未完全定义)的任务(样本量?)

一些评论:

  • 实施/模型制定很容易
  • 这个问题比我想象的更难解决
    • 解算器ECOS有一般性问题
    • 解算器SCS达到良好精度(比sklearn差)
    • 但是:调整迭代以提高精度会破坏解算器
      • 这个问题对于SCS来说是不可行的
    • 基于SCS+bigM的公式(约束被张贴为目标内的惩罚条款)看起来可用;但可能需要调整
    • 只有开源解算器经过测试,商业解算器可能更好
进一步的尝试:

  • 为了解决巨大的问题(性能比鲁棒性和准确性更重要),一种(加速的)投影随机梯度方法看起来很有希望
代码 输出 编辑

只是为了好玩,我使用加速投影梯度的方法实现了一个缓慢的非优化原型解算器(代码中有注释!)

尽管这里的行为很慢(因为没有优化),但对于巨大的问题(因为这是一个一阶方法),这个方法应该扩展得更好。应该有很大的潜力

警告:对某些人来说可能被视为高级数值优化:-)

编辑2：我忘了在投影上添加非负约束（如果x可以取负值，那么sum(x)==1没有多大意义！）。这使得求解变得更加困难（数值问题），很明显，应该使用一个快速的专用投影（我现在太懒了；我认为存在O(n log n)的算法）。再次说明：这个APG解算器是一个原型，还没有为实际任务做好准备

代码
我很惊讶之前没有人在评论中提到这一点,但我认为你的问题陈述中存在概念上的误解

让我们从Lasso估计量的定义开始，例如Hastie、Tibshirani和Wainwright在《Statistical Learning with Sparsity: The Lasso and Generalizations》中给出的定义：

给定N个预测器响应对{(xi,yi)}的集合
不支持的套索查找。您可能需要从头开始编写一些东西,修改sklearn或使用其他库。我可以使用sklearn.model_selection.GridSearchCV和fit_params吗?不。这没有任何意义。好的,谢谢@Saschaha如果这项任务对您很重要,您可以在cvxpy中轻松地表述此问题,并且解决应该足够快(虽然你没有说明你得到了多少样品)。哇,这非常有用。谢谢你花时间。
""" data """
from time import perf_counter as pc
import numpy as np
from sklearn import datasets
# Load the diabetes regression data: A holds the feature matrix, y the targets.
diabetes = datasets.load_diabetes()
A, y = diabetes.data, diabetes.target
alpha = 0.1  # l1 penalty weight used by obj() and by the solver runs that follow

print('Problem-size: ', A.shape)

def obj(x):
    """Return the lasso objective at ``x`` (sklearn user-guide definition).

    Reads the module-level ``A``, ``y`` and ``alpha``. The value is the
    squared 2-norm of the residual ``A.dot(x) - y`` scaled by
    ``1 / (2 * number of rows of A)``, plus ``alpha`` times the 1-norm of ``x``.
    """
    n_rows = A.shape[0]
    residual = A.dot(x) - y
    # Same operation order as the textbook formula: scale factor first, then the square.
    data_term = (1. / (2*n_rows)) * np.square(np.linalg.norm(residual, 2))
    penalty_term = alpha * np.linalg.norm(x, 1)
    return data_term + penalty_term


""" sklearn """
# Baseline run: l1-penalised least squares via scikit-learn, no sum constraint.
print('\nsklearn classic l1')
from sklearn import linear_model
clf = linear_model.Lasso(alpha=alpha, fit_intercept=False)
t0 = pc()  # timer starts right before fit, so only the fit is measured
clf.fit(A, y)
print('used (secs): ', pc() - t0)
print(obj(clf.coef_))  # objective value of the fitted coefficients under obj()
print('sum x: ', np.sum(clf.coef_))  # coefficient sum, to compare with the constrained runs

""" cvxpy """
# Same l1-penalised objective as obj(), modelled in cvxpy and solved with SCS.
print('\ncvxpy + scs classic l1')
# NOTE(review): the star import may rebind builtins such as `sum`; the
# `sum(x)` calls further down presumably rely on that -- confirm against
# the installed cvxpy version.
from cvxpy import *
x = Variable(A.shape[1])  # one decision variable per column of A
# NOTE(review): `A*x` is used as a matrix-vector product here; newer cvxpy
# releases reportedly want `A @ x` for that -- confirm before upgrading.
objective = Minimize((1. / (2*A.shape[0])) * sum_squares(A*x - y) + alpha * norm(x, 1))
problem = Problem(objective, [])  # no constraints in this run
t0 = pc()  # timer covers the solve call only
problem.solve(solver=SCS, use_indirect=False, max_iters=10000, verbose=False)
print('used (secs): ', pc() - t0)
print(obj(x.value.flat))
print('sum x: ', np.sum(x.value.flat))

""" cvxpy -> sum x == 1 """
# First constrained variant: plain least squares (no l1 term in the solved
# objective) with the hard equality constraint sum(x) == 1.
print('\ncvxpy + scs sum == 1 / 1st approach')
objective = Minimize((1. / (2*A.shape[0])) * sum_squares(A*x - y))
constraints = [sum(x) == 1]
problem = Problem(objective, constraints)  # reuses the Variable x created for the previous run
t0 = pc()  # timer covers the solve call only
problem.solve(solver=SCS, use_indirect=False, max_iters=10000, verbose=False)
print('used (secs): ', pc() - t0)
# The printed value comes from obj(), which still adds alpha * ||x||_1 even
# though that term is not part of the objective solved here.
print(obj(x.value.flat))
print('sum x: ', np.sum(x.value.flat))

""" cvxpy approach 2 -> sum x == 1 """
# Second constrained variant: the same least-squares objective plus a linear
# penalty M * (sum(x) - 1); the equality constraint sum(x) == 1 is kept as well.
print('\ncvxpy + scs sum == 1 / 2nd approach')
M = 1e6  # penalty weight
objective = Minimize((1. / (2*A.shape[0])) * sum_squares(A*x - y) + M*(sum(x) - 1))
constraints = [sum(x) == 1]
problem = Problem(objective, constraints)
t0 = pc()  # timer covers the solve call only
problem.solve(solver=SCS, use_indirect=False, max_iters=10000, verbose=False)
print('used (secs): ', pc() - t0)
# As in the other runs, obj() is the l1-penalised objective, not the one solved here.
print(obj(x.value.flat))
print('sum x: ', np.sum(x.value.flat))
Problem-size:  (442, 10)

sklearn classic l1
used (secs):  0.001451024380348898
13201.3508496
sum x:  891.78869298

cvxpy + scs classic l1
used (secs):  0.011165673357417458
13203.6549995
sum x:  872.520510561

cvxpy + scs sum == 1 / 1st approach
used (secs):  0.15350853891775978
13400.1272148
sum x:  -8.43795102327

cvxpy + scs sum == 1 / 2nd approach
used (secs):  0.012579569383536493
13397.2932976
sum x:  1.01207061047
""" accelerated pg  -> sum x == 1 """
def solve_pg(A, b, momentum=0.9, maxiter=1000):
    """Minimize ``0.5 * ||A x - b||^2`` subject to ``sum(x) == 1``.

    remarks:
        algorithm: projected gradient with a backtracking step size
        projection: Euclidean projection onto the hyperplane sum(x) == 1,
            evaluated in closed form (no solver call); there is no
            non-negativity constraint, so this is NOT the probability simplex
        line-search: armijo-rule checked on the unprojected gradient step;
            the accepted step is projected afterwards
        stopping-criterion: naive -> objective changed by less than 1e-2
            in three consecutive iterations, or maxiter iterations done
        gradient-calculation: precomputes AtA
            -> not needed and not recommended for huge sparse data!

    Args:
        A: 2-D array with one row per sample and one column per coefficient.
        b: 1-D target array with one entry per row of ``A``.
        momentum: kept for backward compatibility. Earlier revisions derived
            an extrapolated point from it but never fed it back into the
            iteration, so it has no influence on the result.
        maxiter: upper bound on the number of outer iterations.

    Returns:
        1-D numpy array holding the final iterate; it satisfies
        ``sum(x) == 1`` up to floating-point rounding.
    """
    n_coefs = A.shape[1]
    x = np.zeros(n_coefs)

    AtA = A.T.dot(A)
    Atb = A.T.dot(b)

    def gradient(x):
        return AtA.dot(x) - Atb

    def obj(x):
        return 0.5 * np.linalg.norm(A.dot(x) - b)**2

    def project(v):
        # The closest point on {z : sum(z) == 1} is reached by shifting every
        # coordinate of v by the same amount.
        return v - (np.sum(v) - 1.0) / n_coefs

    beta = 0.5    # step-size shrink factor used while backtracking
    sigma = 1e-2  # sufficient-decrease parameter of the armijo test
    stop_count = 0
    it = 0
    while True:
        grad = gradient(x)

        # line search: halve alpha until the armijo condition holds
        alpha = 1
        old_obj = obj(x)
        while True:
            new_x = x - alpha * grad
            new_obj = obj(new_x)
            if old_obj - new_obj >= sigma * grad.dot(x - new_x):
                break
            alpha *= beta

        # projection of the accepted gradient step back onto sum(x) == 1
        x = project(new_x)

        # count consecutive iterations with (almost) no objective change
        if np.abs(old_obj - obj(x)) < 1e-2:
            stop_count += 1
        else:
            stop_count = 0

        if stop_count == 3:
            print('early-stopping @ it: ', it)
            return x

        it += 1

        # '>=' rather than '==' so that maxiter <= 0 still terminates
        if it >= maxiter:
            return x


# Run the hand-written projected-gradient solver on the same data and report
# the same three figures as the other runs: time, obj() value, coefficient sum.
print('\n acc pg')
t0 = pc()  # timer covers the solve_pg call only
x = solve_pg(A, y)  # rebinds the module-level x to a plain numpy array
print('used (secs): ', pc() - t0)
print(obj(x))
print('sum x: ', np.sum(x))
acc pg
early-stopping @ it:  367
used (secs):  0.7714511330487027
13396.8642379
sum x:  1.00000000002