sqlserver中的多元线性回归函数

sqlserver中的多元线性回归函数,sql,sql-server,regression,Sql,Sql Server,Regression,我在SQLServer中开发了简单的线性回归函数,从这里计算Alpha、Beta和一些额外的值,如上95%和下95%。 简单线性回归以X和y为参数 现在我需要执行多元线性回归SQL Server,它接受参数y和X1,X2,X3,…Xn 因此,输出如下: Coefficients Standard Error t Stat P-value Lower 95% Upper 95% +------------------------------

我参考这里的做法,在 SQL Server 中开发了简单线性回归函数,用来计算 Alpha、Beta 以及一些额外的值,例如 95% 置信区间的上限(Upper 95%)和下限(Lower 95%)。简单线性回归以 X 和 y 为参数。

现在我需要在 SQL Server 中执行多元线性回归,它接受参数 y 和 X1, X2, X3, … Xn。

因此,输出如下:

    Coefficients    Standard Error    t Stat         P-value        Lower 95%     Upper 95%
+-------------------------------------------------------------------------------------------+
    Intercept       -23.94650812     19.85250194     -1.20622117    0.351059563 -109.3649298    
    X Variable 1    0.201064291      0.119759437     1.678901439    0.235179    -0.314218977    
    X Variable 2    -0.014046021     0.037366638     -0.375897368   0.743119791 -0.174821687
    X Variable 3    0.502074905      0.295848189     1.697069389    0.231776287 -0.770857111
    X Variable 4    0.068238344      0.219256527     0.311226057    0.785072958 -0.875146351
任何人都可以向我推荐一个实现这一点的好方法。

例如,我会考虑使用 CLR 存储过程来利用现有的、支持线性回归的 .NET 库。使用 CLR 存储过程,您可以从表中读取数据,将其转换为该 .NET 库的矩阵类型,运行回归,然后将结果写回表中或直接作为行集返回。

不过出于兴趣,下面在 SQL 中使用基于 Householder 反射的 QR 分解解决了这个问题。警告:在数据量较大时运行会很慢。

-- Table type representing a 2D matrix in coordinate form: one row per element,
-- where i and j index the element and Aij holds its value.
-- The primary key on (i, j) guarantees at most one value per cell.

CREATE TYPE dbo.Matrix AS TABLE
(
    i   int,
    j   int,
    Aij float,
    PRIMARY KEY (i, j)
)
GO

-- Function to perform QR factorisation (A -> QR) using Householder reflections.
--
-- Parameters:
--   @matrix  The m x n input matrix A, one row per element (i = row, j = column).
--            NOTE(review): the algorithm assumes a dense matrix with contiguous
--            1-based indices (every cell 1..m x 1..n present) - confirm with callers.
--
-- Returns one row per element of both factors, tagged by the [matrix] column:
--   'Q'  the m x m orthogonal factor
--   'R'  the m x n factor; entries below the diagonal are zero only up to
--        floating point rounding (they are not explicitly cleared)
-- An empty input returns no rows.
--
-- Notes:
--   * A QR factorisation is unique only up to the signs of the columns of Q and
--     the rows of R. Each reflected column j ends up with a diagonal entry R(j, j)
--     whose sign is opposite to that of the pivot it replaced (the numerically
--     stable choice, which avoids cancellation when building the reflector).
--   * Q is materialised as a full m x m table, so run time and memory grow
--     quickly with the number of rows.
--   * The index generator below yields at most 1,000,000 rows, far more than
--     an m x m table variable can hold in practice.

CREATE FUNCTION dbo.QRDecomposition (
    @matrix dbo.Matrix READONLY
)
RETURNS @result TABLE (matrix char(1), i int, j int, Aij float)
AS
BEGIN

    DECLARE @m int, @n int, @j int, @norm float, @alpha float

    SELECT @m = MAX(i), @n = MAX(j)
    FROM @matrix

    SET @j = 1

    DECLARE @R dbo.Matrix   -- starts as A, is reduced to R
    DECLARE @Qj dbo.Matrix  -- Householder reflector of the current column (trailing sub-block only)
    DECLARE @Q dbo.Matrix   -- starts as I, accumulates Q

    -- Generate an @m by @m identity matrix to transform into Q.
    -- TOP ... ORDER BY lets the row generator stop after @m rows instead of
    -- enumerating (and then filtering) the whole 10^6 row cross join.
    ;WITH e1(n) AS
    (
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL
        SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1 UNION ALL SELECT 1
    ),
    e3(n) AS (SELECT 1 FROM e1 AS a CROSS JOIN e1 AS b CROSS JOIN e1 AS c),
    e6(n) AS (SELECT 1 FROM e3 AS a CROSS JOIN e3 AS b),
    numbers(n) AS
    (
        SELECT TOP (CASE WHEN @m > 0 THEN @m ELSE 0 END) seq.rn
        FROM (
            SELECT ROW_NUMBER() OVER (ORDER BY (SELECT NULL)) AS rn
            FROM e6
        ) seq
        ORDER BY seq.rn
    )
    INSERT INTO @Q (i, j, Aij)
    SELECT r.n, c.n, CASE WHEN r.n = c.n THEN 1 ELSE 0 END
    FROM numbers r
    CROSS JOIN numbers c

    -- Copy input matrix to be transformed to R
    INSERT @R (i, j, Aij)
    SELECT i, j, Aij
    FROM @matrix

    -- Loop performing one Householder reflection per column.
    -- For a square matrix the last column needs no reflection.
    WHILE @j < @n OR (@j = @n AND @m > @n) BEGIN

        -- Euclidean norm of column @j of R, from the diagonal downwards
        SELECT @norm = SQRT(SUM(Aij * Aij))
        FROM @R
        WHERE j = @j
            AND i >= @j

        -- A column that is already zero on and below the diagonal needs no
        -- reflection; skipping it also avoids dividing by u'u = 0.
        IF @norm > 0 BEGIN

            -- alpha = -sign(pivot) * norm, treating sign(0) as +1.
            -- (SIGN() returns 0 for a zero pivot, which would cancel the reflection.)
            SET @alpha = -@norm

            SELECT @alpha = @norm
            FROM @R
            WHERE j = @j
                AND i = @j
                AND Aij < 0

            -- u = x - alpha * e1, where x is column @j from the diagonal down.
            -- Because alpha and the pivot have opposite signs, u'u >= norm^2 > 0.
            -- @Qj = I - 2uu' / u'u, restricted to rows/columns >= @j.
            ;WITH u (i, Aij) AS (
                SELECT i, CASE WHEN i = @j THEN Aij - @alpha ELSE Aij END
                FROM @R
                WHERE j = @j
                    AND i >= @j
            )
            INSERT @Qj (i, j, Aij)
            SELECT u.i
                , ut.i
                , CASE WHEN u.i = ut.i THEN 1 ELSE 0 END - 2 * u.Aij * ut.Aij / uu.Aij
            FROM u
            CROSS JOIN u ut
            CROSS JOIN (SELECT SUM(Aij * Aij) AS Aij FROM u) uu

            -- Apply the inverse (transposed) reflection to Q: Q := Q * Qj'.
            -- Only columns >= @j of Q are produced by the join, the rest stay as they are.
            UPDATE Qcur
                SET Aij = Qnext.Aij
            FROM @Q Qcur
            INNER JOIN (
                SELECT Q.i, QjT.j, SUM(QjT.Aij * Q.Aij) AS Aij
                FROM @Q Q
                INNER JOIN (
                    SELECT i AS j, j AS i, Aij
                    FROM @Qj
                ) QjT ON QjT.i = Q.j
                GROUP BY Q.i, QjT.j
            ) Qnext ON Qnext.i = Qcur.i AND Qnext.j = Qcur.j

            -- Apply the reflection to R: R := Qj * R.
            -- Only rows >= @j of R are produced by the join, the rest stay as they are.
            UPDATE Rcur
                SET Aij = Rnext.Aij
            FROM @R Rcur
            INNER JOIN (
                SELECT Qj.i, R.j, SUM(Qj.Aij * R.Aij) AS Aij
                FROM @Qj Qj
                INNER JOIN @R R ON R.i = Qj.j
                GROUP BY Qj.i, R.j
            ) Rnext ON Rnext.i = Rcur.i AND Rnext.j = Rcur.j

            -- Intentionally unfiltered: the reflector is rebuilt from scratch for the next column
            DELETE FROM @Qj

        END

        SET @j = @j + 1

    END

    -- Output Q
    INSERT @result (matrix, i, j, Aij)
    SELECT 'Q', i, j, Aij
    FROM @Q

    -- Output R
    INSERT @result (matrix, i, j, Aij)
    SELECT 'R', i, j, Aij
    FROM @R

    RETURN

END 
GO

-- Function to perform least squares linear regression: finds b minimising ||Xb - y||.
--
-- Parameters:
--   @X  The m x n design matrix, one row per observation and one column per
--       regressor. Add a column of ones if an intercept is wanted.
--   @y  The observed values, one row per observation. Normally a single column
--       (j = 1); each additional column is fitted independently.
--
-- Returns:
--   The estimated coefficients: i is the column of @X the coefficient belongs to,
--   j is the column of @y it was fitted to, Aij is the estimate.
--
-- Method: X is factorised as QR by dbo.QRDecomposition, then Rb = Q'y is solved
-- by back substitution. A zero on the diagonal of R (linearly dependent columns
-- in @X) raises a divide-by-zero error.

CREATE FUNCTION dbo.MatrixLeastSquareRegression (
    @X dbo.Matrix READONLY
    , @y dbo.Matrix READONLY
)
RETURNS @b TABLE (i int, j int, Aij float)
AS
BEGIN

    DECLARE @QR TABLE (matrix char(1), i int, j int, Aij float)

    INSERT @QR(matrix, i, j, Aij)
    SELECT matrix, i, j, Aij
    FROM dbo.QRDecomposition(@X)

    DECLARE @n int, @i int

    -- Number of coefficients to solve for = number of columns of R
    SELECT @n = MAX(j)
    FROM @QR
    WHERE matrix = 'R'

    DECLARE @Qty dbo.Matrix

    -- @Qty = Q'y. Only the first @n rows are used by the back substitution,
    -- so the remaining rows are not computed.
    INSERT INTO @Qty(i, j, Aij)
    SELECT Q.j, y.j, SUM(Q.Aij * y.Aij)
    FROM @QR Q
    INNER JOIN @y y ON y.i = Q.i
    WHERE Q.matrix = 'Q'
        AND Q.j <= @n
    GROUP BY Q.j, y.j

    SET @i = @n

    -- Solve Rb = Q'y via back substitution, from the last coefficient to the first:
    -- b(i) = ( Q'y(i) - SUM over k > i of R(i, k) * b(k) ) / R(i, i)

    WHILE @i > 0 BEGIN

        INSERT @b (i, j, Aij)
        SELECT R.i, y.j, ( y.Aij - ISNULL(sumKnown.Aij, 0) ) / R.Aij
        FROM @QR R
        INNER JOIN @Qty y ON y.i = R.i
        LEFT JOIN (
            -- @b only holds coefficients with index > @i at this point, so the
            -- join picks out exactly the already solved terms of row @i.
            SELECT b.j, SUM(R.Aij * b.Aij) AS Aij
            FROM @QR R
            INNER JOIN @b b ON b.i = R.j
            WHERE R.matrix = 'R' 
                AND R.i = @i
            GROUP BY b.j
        ) sumKnown ON sumKnown.j = y.j
        WHERE R.matrix = 'R' 
            AND R.i = @i
            AND R.j = @i

        SET @i = @i - 1

    END

    RETURN

END 
GO
下面是一个测试脚本/用法示例:

-- Test script / usage example for dbo.MatrixLeastSquareRegression.
-- Generates random data from known coefficients, fits the model and returns
-- three result sets:
--   1. target coefficient (b) next to its estimate (best)
--   2. observed y next to fitted y (yest) for every observation
--   3. mean squared residual ([Variance])

DECLARE @TestData TABLE (i int IDENTITY(1, 1), X1 float, X2 float, X3 float, X4 float, y float)

DECLARE @c float
DECLARE @b1 float
DECLARE @b2 float
DECLARE @b3 float
DECLARE @b4 float

-- bs are the target coefficients

SET @c = RAND()
SET @b1 = 2 * RAND()
SET @b2 = 3 * RAND()
SET @b3 = 4 * RAND()
SET @b4 = 5 * RAND()

-- Generate some test data, calculate y from c + Xb plus some noise: y = c + Xb + e
-- Note: Using RAND() for e is not the normally distributed, zero-mean noise that linear
-- regression assumes. 0.2 * RAND() has a mean of about 0.1, which shifts the estimate of c.

DECLARE @k int = 1

-- Runs for @k = 1..49, i.e. generates 49 observations
WHILE @k < 50 BEGIN

    INSERT @TestData(X1, X2, X3, X4, y)
    SELECT x1, x2, x3, x4, @c + x1 * @b1 + x2 * @b2 + x3 * @b3 + x4 * @b4 + 0.2 * RAND()
    FROM (
        SELECT RAND() AS x1, RAND() AS x2, RAND() AS x3, RAND() AS x4
    ) X

    SET @k = @k + 1

END

-- Put our data into dbo.Matrix types

DECLARE @X dbo.Matrix

-- The branches differ in j, so they can never produce duplicates:
-- UNION ALL avoids the sort/dedup that UNION would add.
INSERT @X (i, j, Aij)
-- Extra column for constant
SELECT i, 1, 1
FROM @TestData
UNION ALL
SELECT i, 2, X1
FROM @TestData
UNION ALL
SELECT i, 3, X2
FROM @TestData
UNION ALL
SELECT i, 4, X3
FROM @TestData
UNION ALL
SELECT i, 5, X4
FROM @TestData

DECLARE @y dbo.Matrix

INSERT @y (i, j, Aij)
SELECT i, 1, y
FROM @TestData

-- Estimates for coefficient values
DECLARE @bhat dbo.Matrix

INSERT @bhat (i, j, Aij)
SELECT i, j, Aij
FROM dbo.MatrixLeastSquareRegression(@X, @y)

-- Target coefficient next to its estimate
SELECT CASE i
        WHEN 1 THEN @c
        WHEN 2 THEN @b1
        WHEN 3 THEN @b2
        WHEN 4 THEN @b3
        WHEN 5 THEN @b4
    END AS b
    , Aij AS best 
FROM @bhat
ORDER BY i

-- Fitted values yest = X * bhat, computed once and reused by both reports below
DECLARE @fit TABLE (i int PRIMARY KEY, y float, yest float)

INSERT @fit (i, y, yest)
SELECT y.i, y.Aij, Xb.Aij
FROM (
    SELECT x.i, SUM(x.Aij * bh.Aij) AS Aij
    FROM @X x
    INNER JOIN @bhat bh ON bh.i = x.j
    GROUP BY x.i
) Xb
INNER JOIN @y y ON y.i = Xb.i

-- Observed versus fitted value per observation
SELECT y, yest
FROM @fit
ORDER BY i

-- Mean squared residual
SELECT SUM(SQUARE(y - yest)) / COUNT(*) AS [Variance] 
FROM @fit

虽然我对编写能够执行各种高级统计计算的纯SQL函数表示赞赏,但SQL并不是解决这类问题的最佳语言

正如David Manning所建议的,CLR绝对是一个选项,与纯SQL相比,它很可能在处理这个特定问题时表现得更好

另一种方法是使用统计语言。我推荐 R。它具有用于从 SQL Server 读取数据和向其写入数据的内置包,以及用于执行各种回归的多种函数。最棒的是:它是免费的!您可以开始使用 R,并对 SQL Server 2012 中的数据执行统计分析。

为什么不使用 Analysis Services 中的数据挖掘算法呢?其中最自然的选择是决策树算法,它同样适用于线性回归。你只需要为它设计一个正确的挖掘模型。

提示:不需要OLAP多维数据集,您可以从关系表/视图设计它

AnalysisServices在功能上包括在SQL Server标准版和更高版本中
在处理完模型后,您可以使用类似SQL的语言来检索回归函数、方差和其他有用的东西

您是否可以通过一些示例数据进行详细说明?您传递的输入和输出是什么?谢谢。您的链接解释了如何计算每个属性或每个数据集的一些外部键。我认为,在这里,无论是从数学角度还是从集合角度,我们可能无法理解的是:X1, … Xn 相对于每个数据集的意义是什么?我同意 @JaazCole 的看法,请清楚地说明输入是如何以 y 和 X1, X2, X3, … Xn 的形式出现的。@KumarHarsh 我不清楚如何实现它,你能给我一些相关的建议吗?100% 同意。有些问题不适合用 SQL 解决,这绝对是其中之一。作为使用 CLR 的替代方法,我建议您只需使用您选择的语言编写一个小型应用程序。我会推荐一种统计语言,比如 R,它内置了大量的回归功能,而且很容易连接到 SQL 数据进行读写,最重要的是:它是免费的!您能否扩展您的解决方案以获得 R² 和 p 值?