@@ -11,23 +11,113 @@ def __init__(self, fit_intercept=True):
         Notes
         -----
-        Given data matrix *X* and target vector *y* , the maximum-likelihood estimate
-        for the regression coefficients, :math:`\ \beta`, is:
+        Given data matrix **X** and target vector **y**, the maximum-likelihood
+        estimate for the regression coefficients, :math:`\beta`, is:

         .. math::

-            \hat{\beta} =
-                \left(\mathbf{X}^\top \mathbf{X}\right)^{-1} \mathbf{X}^\top \mathbf{y}
+            \hat{\beta} = \Sigma^{-1} \mathbf{X}^\top \mathbf{y}
+
+        where :math:`\Sigma^{-1} = (\mathbf{X}^\top \mathbf{X})^{-1}`.

         Parameters
         ----------
         fit_intercept : bool
-            Whether to fit an additional intercept term in addition to the
-            model coefficients. Default is True.
+            Whether to fit an intercept term in addition to the model
+            coefficients. Default is True.
         """
         self.beta = None
+        self.sigma_inv = None
         self.fit_intercept = fit_intercept

+        self._is_fit = False
+
+    def update(self, X, y):
+        r"""
+        Incrementally update the least-squares coefficients for a set of new
+        examples.
+
+        Notes
+        -----
+        The recursive least-squares algorithm [1]_ [2]_ is used to efficiently
+        update the regression parameters as new examples become available. For
+        a single new example :math:`(\mathbf{x}_{t+1}, \mathbf{y}_{t+1})`, the
+        parameter updates are
+
+        .. math::
+
+            \beta_{t+1} = \left(
+                \mathbf{X}_{1:t}^\top \mathbf{X}_{1:t} +
+                    \mathbf{x}_{t+1}^\top \mathbf{x}_{t+1} \right)^{-1}
+                        \left( \mathbf{X}_{1:t}^\top \mathbf{Y}_{1:t} +
+                            \mathbf{x}_{t+1}^\top \mathbf{y}_{t+1} \right)
+
+        where :math:`\beta_{t+1}` are the updated regression coefficients and
+        :math:`\mathbf{X}_{1:t}` and :math:`\mathbf{Y}_{1:t}` are the examples
+        and targets observed from timestep 1 to *t*.
+
+        In the single-example case, the RLS algorithm uses the Sherman-Morrison
+        formula [3]_ to avoid re-inverting the covariance matrix on each new
+        update. In the multi-example case (i.e., where :math:`\mathbf{X}_{t+1}`
+        and :math:`\mathbf{y}_{t+1}` are matrices of `N` examples each), we use
+        the generalized Woodbury matrix identity [4]_ to update the inverse
+        covariance. This comes at a performance cost, but is still faster than
+        doing `N` separate single-example updates when `N` is large.
+
+        References
+        ----------
+        .. [1] Gauss, C. F. (1821). *Theoria combinationis observationum
+           erroribus minimis obnoxiae*, Werke, 4. Göttingen.
+        .. [2] https://en.wikipedia.org/wiki/Recursive_least_squares_filter
+        .. [3] https://en.wikipedia.org/wiki/Sherman%E2%80%93Morrison_formula
+        .. [4] https://en.wikipedia.org/wiki/Woodbury_matrix_identity
+
+        Parameters
+        ----------
+        X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
+            A dataset consisting of `N` examples, each of dimension `M`
+        y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, K)`
+            The targets for each of the `N` examples in `X`, where each target
+            has dimension `K`
+        """
+        if not self._is_fit:
+            raise RuntimeError("You must call the `fit` method before calling `update`")
+
+        X, y = np.atleast_2d(X), np.atleast_2d(y)
+
+        X1, Y1 = X.shape[0], y.shape[0]
+        self._update1D(X, y) if X1 == Y1 == 1 else self._update2D(X, y)
+
+    def _update1D(self, x, y):
+        """Sherman-Morrison update for a single example"""
+        beta, S_inv = self.beta, self.sigma_inv
+
+        # convert x to a design vector if we're fitting an intercept
+        if self.fit_intercept:
+            x = np.c_[1, x]
+
+        # update the inverse of the covariance matrix via Sherman-Morrison
+        S_inv -= (S_inv @ x.T @ x @ S_inv) / (1 + x @ S_inv @ x.T)
+
+        # update the model coefficients
+        beta += S_inv @ x.T @ (y - x @ beta)
+
+    def _update2D(self, X, y):
+        """Woodbury update for multiple examples"""
+        beta, S_inv = self.beta, self.sigma_inv
+
+        # convert X to a design matrix if we're fitting an intercept
+        if self.fit_intercept:
+            X = np.c_[np.ones(X.shape[0]), X]
+
+        I = np.eye(X.shape[0])
+
+        # update the inverse of the covariance matrix via the Woodbury identity
+        S_inv -= S_inv @ X.T @ np.linalg.pinv(I + X @ S_inv @ X.T) @ X @ S_inv
+
+        # update the model coefficients
+        beta += S_inv @ X.T @ (y - X @ beta)
+
     def fit(self, X, y):
         """
         Fit the regression coefficients via maximum likelihood.
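As a quick sanity check on the recursive update above: a single Sherman-Morrison step applied after a batch fit should reproduce the coefficients obtained by re-fitting on all of the data at once. A minimal sketch of that check in plain NumPy, mirroring the `_update1D` arithmetic (the variable names and toy data are illustrative, not part of the diff):

```python
import numpy as np

rng = np.random.default_rng(0)

# toy data: an initial batch plus one new example (no intercept, for brevity)
X, y = rng.normal(size=(20, 3)), rng.normal(size=(20, 1))
x_new, y_new = rng.normal(size=(1, 3)), rng.normal(size=(1, 1))

# batch fit on the initial data
sigma_inv = np.linalg.pinv(X.T @ X)
beta = sigma_inv @ X.T @ y

# Sherman-Morrison update of the inverse covariance, then of the coefficients
sigma_inv -= (sigma_inv @ x_new.T @ x_new @ sigma_inv) / (1 + x_new @ sigma_inv @ x_new.T)
beta += sigma_inv @ x_new.T @ (y_new - x_new @ beta)

# should match a full re-fit on all 21 examples
X_all, y_all = np.vstack([X, x_new]), np.vstack([y, y_new])
beta_full = np.linalg.pinv(X_all.T @ X_all) @ X_all.T @ y_all
assert np.allclose(beta, beta_full)
```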
@@ -44,8 +134,10 @@ def fit(self, X, y):
         if self.fit_intercept:
             X = np.c_[np.ones(X.shape[0]), X]

-        pseudo_inverse = np.linalg.inv(X.T @ X) @ X.T
-        self.beta = np.dot(pseudo_inverse, y)
+        self.sigma_inv = np.linalg.pinv(X.T @ X)
+        self.beta = np.atleast_2d(self.sigma_inv @ X.T @ y)
+
+        self._is_fit = True

     def predict(self, X):
         """
@@ -166,22 +258,22 @@ def __init__(self, penalty="l2", gamma=0, fit_intercept=True):
                 \left(
                     \sum_{i=0}^N y_i \log(\hat{y}_i) +
                         (1-y_i) \log(1-\hat{y}_i)
-                \right) - R(\mathbf{b}, \gamma)
+                \right) - R(\mathbf{b}, \gamma)
             \right]
-
+
         where
-
+
         .. math::
-
+
             R(\mathbf{b}, \gamma) = \left\{
                 \begin{array}{lr}
                     \frac{\gamma}{2} ||\mathbf{b}||_2^2 & :\texttt{ penalty = 'l2'}\\
                     \gamma ||\mathbf{b}||_1 & :\texttt{ penalty = 'l1'}
                 \end{array}
                 \right.
-
-        is a regularization penalty, :math:`\gamma` is a regularization weight,
-        `N` is the number of examples in **y**, and **b** is the vector of model
+
+        is a regularization penalty, :math:`\gamma` is a regularization weight,
+        `N` is the number of examples in **y**, and **b** is the vector of model
         coefficients.

         Parameters
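The two branches of :math:`R(\mathbf{b}, \gamma)` in the docstring above translate directly into code. A short sketch (the helper name and toy values are hypothetical, for illustration only):

```python
import numpy as np

def regularization(beta, gamma, penalty="l2"):
    """R(b, gamma): (gamma / 2) * ||b||_2^2 for 'l2', gamma * ||b||_1 for 'l1'."""
    if penalty == "l2":
        return (gamma / 2) * np.linalg.norm(beta, ord=2) ** 2
    return gamma * np.linalg.norm(beta, ord=1)

beta = np.array([0.5, -0.25, 1.0])
print(regularization(beta, 0.1, "l2"))  # (0.1 / 2) * 1.3125 = 0.065625
print(regularization(beta, 0.1, "l1"))  # 0.1 * 1.75 = 0.175
```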
@@ -251,10 +343,10 @@ def _NLL(self, X, y, y_pred):
             \right]
         """
         N, M = X.shape
-        beta, gamma = self.beta, self.gamma
+        beta, gamma = self.beta, self.gamma
         order = 2 if self.penalty == "l2" else 1
         norm_beta = np.linalg.norm(beta, ord=order)
-
+
         nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()
         penalty = (gamma / 2) * norm_beta ** 2 if order == 2 else gamma * norm_beta
         return (penalty + nll) / N
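The `_NLL` body above computes the data term with boolean indexing (`y_pred[y == 1]`, `y_pred[y == 0]`) rather than the explicit per-example sum from the docstring; the two forms are equivalent. A quick check with made-up toy values:

```python
import numpy as np

# toy labels, predicted probabilities, and coefficients
y = np.array([1, 0, 1, 0])
y_pred = np.array([0.9, 0.2, 0.8, 0.1])
beta, gamma = np.array([0.5, -0.25]), 0.1

# penalized NLL as computed in `_NLL` (l2 penalty)
nll = -np.log(y_pred[y == 1]).sum() - np.log(1 - y_pred[y == 0]).sum()
penalty = (gamma / 2) * np.linalg.norm(beta) ** 2
compact = (penalty + nll) / y.shape[0]

# the same quantity written as the per-example sum from the docstring
direct = -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)).sum()
direct = (direct + (gamma / 2) * (beta ** 2).sum()) / y.shape[0]

assert np.isclose(compact, direct)
```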