Consolidate _incremental_weighted_mean_and_var into _incremental_mean_and_var (Issue18573) #19422

Status: Merged (27 commits, Feb 10, 2021)

Commits (27)
bab1e0c  added the incremental variance for sample_weight (maikia, Oct 1, 2020)
6989bbd  Drop incorrect import (norbusan, Feb 9, 2021)
4c36f55  Update tests (norbusan, Feb 9, 2021)
fb090ff  Allow testing different code path (norbusan, Feb 9, 2021)
192534f  reactivate old function for speed comparison (norbusan, Feb 9, 2021)
44c3647  fix typo (norbusan, Feb 9, 2021)
66a28b4  Add TODO about speed comparison (norbusan, Feb 9, 2021)
dddc934  My English .... (norbusan, Feb 9, 2021)
b6ea134  Fix: linter error (nuka137, Feb 10, 2021)
255545c  Remove _incremental_weighted_mean_and_var (nuka137, Feb 10, 2021)
50375b2  Drop variant, it is too slow, better to keep separate code path (norbusan, Feb 10, 2021)
66790e5  ignore nan in sum (norbusan, Feb 10, 2021)
3b38dbd  Remove TODO comment, fixed. (norbusan, Feb 10, 2021)
9af1b6f  Reduce diff to main, don't add newlines (norbusan, Feb 10, 2021)
b02aa6f  Tweak (nuka137, Feb 10, 2021)
204d93c  Document new parameter (norbusan, Feb 10, 2021)
3ced1f3  Fix the documentation (nuka137, Feb 10, 2021)
12dcd38  Update documentation of the function (norbusan, Feb 10, 2021)
1b9033f  Fix: Address requests from the review (nuka137, Feb 10, 2021)
9af607d  Fix: lint error (nuka137, Feb 10, 2021)
63d4804  Apply the commit suggestion (nuka137, Feb 10, 2021)
97b3e0e  Apply the commit suggestion (nuka137, Feb 10, 2021)
3e2e7ca  Apply the commit suggestion (nuka137, Feb 10, 2021)
c32d147  Apply the commit suggestion (nuka137, Feb 10, 2021)
df475a8  Revert "Apply the commit suggestion" (nuka137, Feb 10, 2021)
0ddfba6  Revert "Apply the commit suggestion" (nuka137, Feb 10, 2021)
db0a1c4  Revert "Apply the commit suggestion" (nuka137, Feb 10, 2021)
sklearn/preprocessing/_data.py (12 changes: 3 additions, 9 deletions)

@@ -22,8 +22,7 @@
 from ..utils import check_array
 from ..utils.deprecation import deprecated
 from ..utils.extmath import row_norms
-from ..utils.extmath import (_incremental_mean_and_var,
-                             _incremental_weighted_mean_and_var)
+from ..utils.extmath import _incremental_mean_and_var
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale,

@@ -838,16 +837,11 @@ def partial_fit(self, X, y=None, sample_weight=None):
                 self.var_ = None
                 self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)

-            elif sample_weight is not None:
-                self.mean_, self.var_, self.n_samples_seen_ = \
-                    _incremental_weighted_mean_and_var(X, sample_weight,
-                                                       self.mean_,
-                                                       self.var_,
-                                                       self.n_samples_seen_)
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
                     _incremental_mean_and_var(X, self.mean_, self.var_,
-                                              self.n_samples_seen_)
+                                              self.n_samples_seen_,
+                                              sample_weight=sample_weight)

         # for backward-compatibility, reduce n_samples_seen_ to an integer
         # if the number of samples is the same for each feature (i.e. no
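With this change, `partial_fit` routes both the weighted and the unweighted case through the single helper. A minimal usage sketch of the public entry point (the data and weights below are made up for illustration):

    import numpy as np
    from sklearn.preprocessing import StandardScaler

    rng = np.random.RandomState(0)
    X = rng.rand(100, 3)
    w = rng.rand(100)

    # Accumulate weighted statistics incrementally over two batches.
    scaler = StandardScaler()
    scaler.partial_fit(X[:50], sample_weight=w[:50])
    scaler.partial_fit(X[50:], sample_weight=w[50:])

    # The incremental result should match the weighted statistics of the
    # full data computed in one shot.
    np.testing.assert_allclose(scaler.mean_, np.average(X, weights=w, axis=0))
    np.testing.assert_allclose(
        scaler.var_, np.average((X - scaler.mean_) ** 2, weights=w, axis=0),
        rtol=1e-6)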
sklearn/utils/extmath.py (139 changes: 29 additions, 110 deletions)

@@ -690,113 +690,16 @@ def _safe_accumulator_op(op, x, *args, **kwargs):
     return result


-def _incremental_weighted_mean_and_var(X, sample_weight,
-                                       last_mean,
-                                       last_variance,
-                                       last_weight_sum):
-    """Calculate weighted mean and weighted variance incremental update.
-
-    .. versionadded:: 0.24
-
-    Parameters
-    ----------
-    X : array-like of shape (n_samples, n_features)
-        Data to use for mean and variance update.
-
-    sample_weight : array-like of shape (n_samples,) or None
-        Sample weights. If None, then samples are equally weighted.
-
-    last_mean : array-like of shape (n_features,)
-        Mean before the incremental update.
-
-    last_variance : array-like of shape (n_features,) or None
-        Variance before the incremental update.
-        If None, variance update is not computed (in case scaling is not
-        required).
-
-    last_weight_sum : array-like of shape (n_features,)
-        Sum of weights before the incremental update.
-
-    Returns
-    -------
-    updated_mean : array of shape (n_features,)
-
-    updated_variance : array of shape (n_features,) or None
-        If None, only mean is computed.
-
-    updated_weight_sum : array of shape (n_features,)
-
-    Notes
-    -----
-    NaNs in `X` are ignored.
-
-    `last_mean` and `last_variance` are statistics computed at the last step
-    by the function. Both must be initialized to 0.0.
-    The mean is always required (`last_mean`) and returned (`updated_mean`),
-    whereas the variance can be None (`last_variance` and `updated_variance`).
-
-    For further details on the algorithm to perform the computation in a
-    numerically stable way, see [Finch2009]_, Sections 4 and 5.
-
-    References
-    ----------
-    .. [Finch2009] `Tony Finch,
-       "Incremental calculation of weighted mean and variance",
-       University of Cambridge Computing Service, February 2009.
-       <https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf>`_
-
-    """
-    # last = stats before the increment
-    # new = the current increment
-    # updated = the aggregated stats
-    if sample_weight is None:
-        return _incremental_mean_and_var(X, last_mean, last_variance,
-                                         last_weight_sum)
-    nan_mask = np.isnan(X)
-    sample_weight_T = np.reshape(sample_weight, (1, -1))
-    # new_weight_sum with shape (n_features,)
-    new_weight_sum = np.dot(sample_weight_T,
-                            ~nan_mask).ravel().astype(np.float64)
-    total_weight_sum = _safe_accumulator_op(np.sum, sample_weight, axis=0)
-
-    X_0 = np.where(nan_mask, 0, X)
-    new_mean = np.average(X_0,
-                          weights=sample_weight, axis=0).astype(np.float64)
-    new_mean *= total_weight_sum / new_weight_sum
-    updated_weight_sum = last_weight_sum + new_weight_sum
-    updated_mean = (
-        (last_weight_sum * last_mean + new_weight_sum * new_mean)
-        / updated_weight_sum)
-
-    if last_variance is None:
-        updated_variance = None
-    else:
-        X_0 = np.where(nan_mask, 0, (X-new_mean)**2)
-        new_variance =\
-            _safe_accumulator_op(
-                np.average, X_0, weights=sample_weight, axis=0)
-        new_variance *= total_weight_sum / new_weight_sum
-        new_term = (
-            new_weight_sum *
-            (new_variance +
-             (new_mean - updated_mean) ** 2))
-        last_term = (
-            last_weight_sum *
-            (last_variance +
-             (last_mean - updated_mean) ** 2))
-        updated_variance = (new_term + last_term) / updated_weight_sum
-
-    return updated_mean, updated_variance, updated_weight_sum
-
-
-def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
+def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count,
+                              sample_weight=None):
     """Calculate mean update and a Youngs and Cramer variance update.

-    last_mean and last_variance are statistics computed at the last step by the
-    function. Both must be initialized to 0.0. In case no scaling is required
-    last_variance can be None. The mean is always required and returned because
-    necessary for the calculation of the variance. last_n_samples_seen is the
-    number of samples encountered until now.
+    If sample_weight is given, the weighted mean and variance is computed.
+
+    Update a given mean and (possibly) variance according to new data given
+    in X. last_mean is always required to compute the new mean.
+    If last_variance is None, no variance is computed and None return for
+    updated_variance.

     From the paper "Algorithms for computing the sample variance: analysis and
     recommendations", by Chan, Golub, and LeVeque.
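For reference, the merge rule from [Finch2009] that the removed helper implemented can be written compactly. The following standalone NumPy sketch (the function names are ours, not scikit-learn API) merges the weighted statistics of two disjoint chunks and checks the result against a direct computation:

    import numpy as np

    def merge_weighted_stats(w_a, mean_a, var_a, w_b, mean_b, var_b):
        # Combined weight sum, weighted mean, and weighted variance of the
        # union of two disjoint chunks (Finch 2009, Sections 4 and 5).
        w = w_a + w_b
        mean = (w_a * mean_a + w_b * mean_b) / w
        # Each chunk contributes its internal variance plus the squared
        # offset of its own mean from the combined mean.
        var = (w_a * (var_a + (mean_a - mean) ** 2)
               + w_b * (var_b + (mean_b - mean) ** 2)) / w
        return w, mean, var

    def weighted_stats(x, w):
        mean = np.average(x, weights=w)
        var = np.average((x - mean) ** 2, weights=w)
        return w.sum(), mean, var

    rng = np.random.RandomState(0)
    x, w = rng.rand(1000), rng.rand(1000)
    merged = merge_weighted_stats(*weighted_stats(x[:400], w[:400]),
                                  *weighted_stats(x[400:], w[400:]))
    np.testing.assert_allclose(merged, weighted_stats(x, w))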
@@ -811,13 +714,19 @@
     last_variance : array-like of shape (n_features,)

     last_sample_count : array-like of shape (n_features,)
+        The number of samples encountered until now if sample_weight is None.
+        If sample_weight is not None, this is the sum of sample_weight
+        encountered.
+
+    sample_weight : array-like of shape (n_samples,) or None
+        Sample weights. If None, compute the unweighted mean/variance.

     Returns
     -------
     updated_mean : ndarray of shape (n_features,)

     updated_variance : ndarray of shape (n_features,)
-        If None, only mean is computed.
+        None if last_variance was None.

     updated_sample_count : ndarray of shape (n_features,)
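A small sketch of the consolidated signature in action, mirroring how the updated tests drive it (this calls the private helper directly, so it is illustrative only). Note that with weights the third return value is the running weight sum rather than a sample count:

    import numpy as np
    from sklearn.utils.extmath import _incremental_mean_and_var

    rng = np.random.RandomState(0)
    X = rng.rand(1000, 5)
    w = rng.rand(1000)

    # Start from zeroed statistics and fold in two batches.
    mean, var, count = 0, 0, 0
    for batch in (slice(0, 600), slice(600, None)):
        mean, var, count = _incremental_mean_and_var(
            X[batch], mean, var, count, sample_weight=w[batch])

    np.testing.assert_allclose(mean, np.average(X, weights=w, axis=0))
    np.testing.assert_allclose(
        var, np.average((X - mean) ** 2, weights=w, axis=0), atol=1e-6)
    # With no NaNs, every feature has seen the full weight sum.
    np.testing.assert_allclose(count, np.full(X.shape[1], w.sum()))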

@@ -839,18 +748,28 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count,
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
-    new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
+    if sample_weight is not None:
+        new_sum = _safe_accumulator_op(np.nansum, X * sample_weight[:, None],
+                                       axis=0)
+        new_sample_count = np.sum(sample_weight[:, None] * (~np.isnan(X)),
+                                  axis=0)
+    else:
+        new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
+        new_sample_count = np.sum(~np.isnan(X), axis=0)
Review thread on the new_sample_count line (marked resolved by norbusan):

Member (jeremiedbb): If we are really digging for performance, np.count_nonzero(~np.isnan(X), axis=0) can be much faster.

Member Author (norbusan): Slightly better:

    without the above change:
    %timeit scale_with_partial_fit(X, sample_weight=None)
    1.13 s ± 75.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

    with the above change:
    %timeit scale_with_partial_fit(X, sample_weight=None)
    1.08 s ± 4.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Member Author (norbusan): Should we go with that?

Member Author (norbusan): @jeremiedbb Could you add this as a suggested change so that the authorship is properly recorded. Thanks.

Member (jeremiedbb): Hum, it's weird: when I first compared the timings I got a huge speed-up, but when I try again I can't reproduce it. Let's leave it as is and merge :)

Member Author (norbusan): Fine with me, thanks for checking again.
-    new_sample_count = np.sum(~np.isnan(X), axis=0)
     updated_sample_count = last_sample_count + new_sample_count

     updated_mean = (last_sum + new_sum) / updated_sample_count

     if last_variance is None:
         updated_variance = None
     else:
-        new_unnormalized_variance = (
-            _safe_accumulator_op(np.nanvar, X, axis=0) * new_sample_count)
+        T = new_sum / new_sample_count
+        if sample_weight is not None:
+            new_unnormalized_variance = np.nansum(sample_weight[:, None] *
+                                                  (X - T)**2, axis=0)
+        else:
+            new_unnormalized_variance = np.nansum((X - T)**2, axis=0)
         last_unnormalized_variance = last_variance * last_sample_count

     with np.errstate(divide='ignore', invalid='ignore'):
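The resolved thread above compares np.sum over a boolean mask with np.count_nonzero for the per-column non-NaN count. A standalone micro-benchmark of just that sub-expression (the array size is arbitrary, and results will vary by machine and NumPy version) might look like:

    import timeit
    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.rand(1_000_000, 20)
    X[rng.rand(*X.shape) < 0.1] = np.nan  # sprinkle in some NaNs

    # Merged code: reduce the boolean mask with np.sum.
    t_sum = timeit.timeit(lambda: np.sum(~np.isnan(X), axis=0), number=10)
    # Reviewer's alternative: count True entries directly.
    t_cnz = timeit.timeit(
        lambda: np.count_nonzero(~np.isnan(X), axis=0), number=10)

    print(f"np.sum:           {t_sum:.3f} s")
    print(f"np.count_nonzero: {t_cnz:.3f} s")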
sklearn/utils/tests/test_extmath.py (33 changes: 15 additions, 18 deletions)

@@ -30,7 +30,6 @@
 from sklearn.utils.extmath import log_logistic
 from sklearn.utils.extmath import svd_flip
 from sklearn.utils.extmath import _incremental_mean_and_var
-from sklearn.utils.extmath import _incremental_weighted_mean_and_var
 from sklearn.utils.extmath import _deterministic_vector_sign_flip
 from sklearn.utils.extmath import softmax
 from sklearn.utils.extmath import stable_cumsum
@@ -464,8 +463,8 @@ def test_incremental_weighted_mean_and_variance_simple(rng, dtype):
     mult = 10
     X = rng.rand(1000, 20).astype(dtype)*mult
     sample_weight = rng.rand(X.shape[0]) * mult
-    mean, var, _ = _incremental_weighted_mean_and_var(X, sample_weight,
-                                                      0, 0, 0)
+    mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0,
+                                             sample_weight=sample_weight)

     expected_mean = np.average(X, weights=sample_weight, axis=0)
     expected_var = np.average(X**2, weights=sample_weight, axis=0) - \
@@ -488,11 +487,9 @@ def _assert(X, sample_weight, expected_mean, expected_var):
         last_mean, last_weight_sum, last_var = 0, 0, 0
         for batch in gen_batches(n, chunk_size):
             last_mean, last_var, last_weight_sum = \
-                _incremental_weighted_mean_and_var(X[batch],
-                                                   sample_weight[batch],
-                                                   last_mean,
-                                                   last_var,
-                                                   last_weight_sum)
+                _incremental_mean_and_var(
+                    X[batch], last_mean, last_var, last_weight_sum,
+                    sample_weight=sample_weight[batch])
         assert_allclose(last_mean, expected_mean)
         assert_allclose(last_var, expected_var, atol=1e-6)
@@ -532,17 +529,17 @@ def test_incremental_weighted_mean_and_variance_ignore_nan(dtype):
                       [300, 300, 300, np.nan]]).astype(dtype)

     X_means, X_variances, X_count = \
-        _incremental_weighted_mean_and_var(X,
-                                           sample_weights_X,
-                                           old_means,
-                                           old_variances,
-                                           old_weight_sum)
+        _incremental_mean_and_var(X,
+                                  old_means,
+                                  old_variances,
+                                  old_weight_sum,
+                                  sample_weight=sample_weights_X)
     X_nan_means, X_nan_variances, X_nan_count = \
-        _incremental_weighted_mean_and_var(X_nan,
-                                           sample_weights_X_nan,
-                                           old_means,
-                                           old_variances,
-                                           old_weight_sum)
+        _incremental_mean_and_var(X_nan,
+                                  old_means,
+                                  old_variances,
+                                  old_weight_sum,
+                                  sample_weight=sample_weights_X_nan)

     assert_allclose(X_nan_means, X_means)
     assert_allclose(X_nan_variances, X_variances)
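The tests above compute the expected weighted variance through the identity Var_w(X) = E_w[X^2] - (E_w[X])^2. A quick standalone check of that identity against the direct definition (illustrative only):

    import numpy as np

    rng = np.random.RandomState(42)
    x = rng.rand(500) * 10
    w = rng.rand(500)

    mean_w = np.average(x, weights=w)
    # Identity used by the tests:
    var_identity = np.average(x ** 2, weights=w) - mean_w ** 2
    # Direct definition: weighted mean of squared deviations.
    var_direct = np.average((x - mean_w) ** 2, weights=w)

    np.testing.assert_allclose(var_identity, var_direct)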