Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 8066086

Browse files
authored
MNT Consolidate _incremental_weighted_mean_and_var into _incremental_mean_and_var (#19422)
1 parent 1ea7905 commit 8066086
Copy full SHA for 8066086

File tree

3 files changed

+47
-137
lines changed
Filter options

3 files changed

+47
-137
lines changed

‎sklearn/preprocessing/_data.py

Copy file name to clipboard · Expand all lines: sklearn/preprocessing/_data.py
+3 −9 · Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@
2222
from ..utils import check_array
2323
from ..utils.deprecation import deprecated
2424
from ..utils.extmath import row_norms
25-
from ..utils.extmath import (_incremental_mean_and_var,
26-
_incremental_weighted_mean_and_var)
25+
from ..utils.extmath import _incremental_mean_and_var
2726
from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
2827
inplace_csr_row_normalize_l2)
2928
from ..utils.sparsefuncs import (inplace_column_scale,
@@ -838,16 +837,11 @@ def partial_fit(self, X, y=None, sample_weight=None):
838837
self.var_ = None
839838
self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
840839

841-
elif sample_weight is not None:
842-
self.mean_, self.var_, self.n_samples_seen_ = \
843-
_incremental_weighted_mean_and_var(X, sample_weight,
844-
self.mean_,
845-
self.var_,
846-
self.n_samples_seen_)
847840
else:
848841
self.mean_, self.var_, self.n_samples_seen_ = \
849842
_incremental_mean_and_var(X, self.mean_, self.var_,
850-
self.n_samples_seen_)
843+
self.n_samples_seen_,
844+
sample_weight=sample_weight)
851845

852846
# for backward-compatibility, reduce n_samples_seen_ to an integer
853847
# if the number of samples is the same for each feature (i.e. no

‎sklearn/utils/extmath.py

Copy file name to clipboard · Expand all lines: sklearn/utils/extmath.py
+29 −110 · Lines changed: 29 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -690,113 +690,16 @@ def _safe_accumulator_op(op, x, *args, **kwargs):
690690
return result
691691

692692

693-
def _incremental_weighted_mean_and_var(X, sample_weight,
694-
last_mean,
695-
last_variance,
696-
last_weight_sum):
697-
"""Calculate weighted mean and weighted variance incremental update.
698-
699-
.. versionadded:: 0.24
700-
701-
Parameters
702-
----------
703-
X : array-like of shape (n_samples, n_features)
704-
Data to use for mean and variance update.
705-
706-
sample_weight : array-like of shape (n_samples,) or None
707-
Sample weights. If None, then samples are equally weighted.
708-
709-
last_mean : array-like of shape (n_features,)
710-
Mean before the incremental update.
711-
712-
last_variance : array-like of shape (n_features,) or None
713-
Variance before the incremental update.
714-
If None, variance update is not computed (in case scaling is not
715-
required).
716-
717-
last_weight_sum : array-like of shape (n_features,)
718-
Sum of weights before the incremental update.
719-
720-
Returns
721-
-------
722-
updated_mean : array of shape (n_features,)
723-
724-
updated_variance : array of shape (n_features,) or None
725-
If None, only mean is computed.
726-
727-
updated_weight_sum : array of shape (n_features,)
728-
729-
Notes
730-
-----
731-
NaNs in `X` are ignored.
732-
733-
`last_mean` and `last_variance` are statistics computed at the last step
734-
by the function. Both must be initialized to 0.0.
735-
The mean is always required (`last_mean`) and returned (`updated_mean`),
736-
whereas the variance can be None (`last_variance` and `updated_variance`).
737-
738-
For further details on the algorithm to perform the computation in a
739-
numerically stable way, see [Finch2009]_, Sections 4 and 5.
740-
741-
References
742-
----------
743-
.. [Finch2009] `Tony Finch,
744-
"Incremental calculation of weighted mean and variance",
745-
University of Cambridge Computing Service, February 2009.
746-
<https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf>`_
747-
748-
"""
749-
# last = stats before the increment
750-
# new = the current increment
751-
# updated = the aggregated stats
752-
if sample_weight is None:
753-
return _incremental_mean_and_var(X, last_mean, last_variance,
754-
last_weight_sum)
755-
nan_mask = np.isnan(X)
756-
sample_weight_T = np.reshape(sample_weight, (1, -1))
757-
# new_weight_sum with shape (n_features,)
758-
new_weight_sum = np.dot(sample_weight_T,
759-
~nan_mask).ravel().astype(np.float64)
760-
total_weight_sum = _safe_accumulator_op(np.sum, sample_weight, axis=0)
761-
762-
X_0 = np.where(nan_mask, 0, X)
763-
new_mean = np.average(X_0,
764-
weights=sample_weight, axis=0).astype(np.float64)
765-
new_mean *= total_weight_sum / new_weight_sum
766-
updated_weight_sum = last_weight_sum + new_weight_sum
767-
updated_mean = (
768-
(last_weight_sum * last_mean + new_weight_sum * new_mean)
769-
/ updated_weight_sum)
770-
771-
if last_variance is None:
772-
updated_variance = None
773-
else:
774-
X_0 = np.where(nan_mask, 0, (X-new_mean)**2)
775-
new_variance =\
776-
_safe_accumulator_op(
777-
np.average, X_0, weights=sample_weight, axis=0)
778-
new_variance *= total_weight_sum / new_weight_sum
779-
new_term = (
780-
new_weight_sum *
781-
(new_variance +
782-
(new_mean - updated_mean) ** 2))
783-
last_term = (
784-
last_weight_sum *
785-
(last_variance +
786-
(last_mean - updated_mean) ** 2))
787-
updated_variance = (new_term + last_term) / updated_weight_sum
788-
789-
return updated_mean, updated_variance, updated_weight_sum
790-
791-
792-
def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
693+
def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count,
694+
sample_weight=None):
793695
"""Calculate mean update and a Youngs and Cramer variance update.
794696
795-
last_mean and last_variance are statistics computed at the last step by the
796-
function. Both must be initialized to 0.0. In case no scaling is required
797-
last_variance can be None. The mean is always required and returned because
798-
necessary for the calculation of the variance. last_n_samples_seen is the
799-
number of samples encountered until now.
697+
If sample_weight is given, the weighted mean and variance is computed.
698+
699+
Update a given mean and (possibly) variance according to new data given
700+
in X. last_mean is always required to compute the new mean.
701+
If last_variance is None, no variance is computed and None return for
702+
updated_variance.
800703
801704
From the paper "Algorithms for computing the sample variance: analysis and
802705
recommendations", by Chan, Golub, and LeVeque.
@@ -811,13 +714,19 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
811714
last_variance : array-like of shape (n_features,)
812715
813716
last_sample_count : array-like of shape (n_features,)
717+
The number of samples encountered until now if sample_weight is None.
718+
If sample_weight is not None, this is the sum of sample_weight
719+
encountered.
720+
721+
sample_weight : array-like of shape (n_samples,) or None
722+
Sample weights. If None, compute the unweighted mean/variance.
814723
815724
Returns
816725
-------
817726
updated_mean : ndarray of shape (n_features,)
818727
819728
updated_variance : ndarray of shape (n_features,)
820-
If None, only mean is computed.
729+
None if last_variance was None.
821730
822731
updated_sample_count : ndarray of shape (n_features,)
823732
@@ -839,18 +748,28 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
839748
# new = the current increment
840749
# updated = the aggregated stats
841750
last_sum = last_mean * last_sample_count
842-
new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
751+
if sample_weight is not None:
752+
new_sum = _safe_accumulator_op(np.nansum, X * sample_weight[:, None],
753+
axis=0)
754+
new_sample_count = np.sum(sample_weight[:, None] * (~np.isnan(X)),
755+
axis=0)
756+
else:
757+
new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
758+
new_sample_count = np.sum(~np.isnan(X), axis=0)
843759

844-
new_sample_count = np.sum(~np.isnan(X), axis=0)
845760
updated_sample_count = last_sample_count + new_sample_count
846761

847762
updated_mean = (last_sum + new_sum) / updated_sample_count
848763

849764
if last_variance is None:
850765
updated_variance = None
851766
else:
852-
new_unnormalized_variance = (
853-
_safe_accumulator_op(np.nanvar, X, axis=0) * new_sample_count)
767+
T = new_sum / new_sample_count
768+
if sample_weight is not None:
769+
new_unnormalized_variance = np.nansum(sample_weight[:, None] *
770+
(X - T)**2, axis=0)
771+
else:
772+
new_unnormalized_variance = np.nansum((X - T)**2, axis=0)
854773
last_unnormalized_variance = last_variance * last_sample_count
855774

856775
with np.errstate(divide='ignore', invalid='ignore'):

‎sklearn/utils/tests/test_extmath.py

Copy file name to clipboard · Expand all lines: sklearn/utils/tests/test_extmath.py
+15 −18 · Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from sklearn.utils.extmath import log_logistic
3131
from sklearn.utils.extmath import svd_flip
3232
from sklearn.utils.extmath import _incremental_mean_and_var
33-
from sklearn.utils.extmath import _incremental_weighted_mean_and_var
3433
from sklearn.utils.extmath import _deterministic_vector_sign_flip
3534
from sklearn.utils.extmath import softmax
3635
from sklearn.utils.extmath import stable_cumsum
@@ -464,8 +463,8 @@ def test_incremental_weighted_mean_and_variance_simple(rng, dtype):
464463
mult = 10
465464
X = rng.rand(1000, 20).astype(dtype)*mult
466465
sample_weight = rng.rand(X.shape[0]) * mult
467-
mean, var, _ = _incremental_weighted_mean_and_var(X, sample_weight,
468-
0, 0, 0)
466+
mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0,
467+
sample_weight=sample_weight)
469468

470469
expected_mean = np.average(X, weights=sample_weight, axis=0)
471470
expected_var = np.average(X**2, weights=sample_weight, axis=0) - \
@@ -488,11 +487,9 @@ def _assert(X, sample_weight, expected_mean, expected_var):
488487
last_mean, last_weight_sum, last_var = 0, 0, 0
489488
for batch in gen_batches(n, chunk_size):
490489
last_mean, last_var, last_weight_sum = \
491-
_incremental_weighted_mean_and_var(X[batch],
492-
sample_weight[batch],
493-
last_mean,
494-
last_var,
495-
last_weight_sum)
490+
_incremental_mean_and_var(
491+
X[batch], last_mean, last_var, last_weight_sum,
492+
sample_weight=sample_weight[batch])
496493
assert_allclose(last_mean, expected_mean)
497494
assert_allclose(last_var, expected_var, atol=1e-6)
498495

@@ -532,17 +529,17 @@ def test_incremental_weighted_mean_and_variance_ignore_nan(dtype):
532529
[300, 300, 300, np.nan]]).astype(dtype)
533530

534531
X_means, X_variances, X_count = \
535-
_incremental_weighted_mean_and_var(X,
536-
sample_weights_X,
537-
old_means,
538-
old_variances,
539-
old_weight_sum)
532+
_incremental_mean_and_var(X,
533+
old_means,
534+
old_variances,
535+
old_weight_sum,
536+
sample_weight=sample_weights_X)
540537
X_nan_means, X_nan_variances, X_nan_count = \
541-
_incremental_weighted_mean_and_var(X_nan,
542-
sample_weights_X_nan,
543-
old_means,
544-
old_variances,
545-
old_weight_sum)
538+
_incremental_mean_and_var(X_nan,
539+
old_means,
540+
old_variances,
541+
old_weight_sum,
542+
sample_weight=sample_weights_X_nan)
546543

547544
assert_allclose(X_nan_means, X_means)
548545
assert_allclose(X_nan_variances, X_variances)

0 commit comments

Comments
0 comments (0 reactions)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.