Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 8066086

Browse files
authored
MNT Consolidate _incremental_weighted_mean_and_var into _incremental_mean_and_var (#19422)
1 parent 1ea7905 commit 8066086
Copy full SHA for 8066086

File tree

3 files changed

+47
-137
lines changed
Filter options

3 files changed

+47
-137
lines changed

‎sklearn/preprocessing/_data.py

Copy file name to clipboard · Expand all lines: sklearn/preprocessing/_data.py
+3 −9 · Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,7 @@
2222
from ..utils import check_array
2323
from ..utils.deprecation import deprecated
2424
from ..utils.extmath import row_norms
25-
from ..utils.extmath import (_incremental_mean_and_var,
26-
_incremental_weighted_mean_and_var)
25+
from ..utils.extmath import _incremental_mean_and_var
2726
from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
2827
inplace_csr_row_normalize_l2)
2928
from ..utils.sparsefuncs import (inplace_column_scale,
@@ -838,16 +837,11 @@ def partial_fit(self, X, y=None, sample_weight=None):
838837
self.var_ = None
839838
self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
840839

841-
elif sample_weight is not None:
842-
self.mean_, self.var_, self.n_samples_seen_ = \
843-
_incremental_weighted_mean_and_var(X, sample_weight,
844-
self.mean_,
845-
self.var_,
846-
self.n_samples_seen_)
847840
else:
848841
self.mean_, self.var_, self.n_samples_seen_ = \
849842
_incremental_mean_and_var(X, self.mean_, self.var_,
850-
self.n_samples_seen_)
843+
self.n_samples_seen_,
844+
sample_weight=sample_weight)
851845

852846
# for backward-compatibility, reduce n_samples_seen_ to an integer
853847
# if the number of samples is the same for each feature (i.e. no

‎sklearn/utils/extmath.py

Copy file name to clipboard · Expand all lines: sklearn/utils/extmath.py
+29 −110 · Lines changed: 29 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -690,113 +690,16 @@ def _safe_accumulator_op(op, x, *args, **kwargs):
690690
return result
691691

692692

693-
def _incremental_weighted_mean_and_var(X, sample_weight,
694-
last_mean,
695-
last_variance,
696-
last_weight_sum):
697-
"""Calculate weighted mean and weighted variance incremental update.
698-
699-
.. versionadded:: 0.24
700-
701-
Parameters
702-
----------
703-
X : array-like of shape (n_samples, n_features)
704-
Data to use for mean and variance update.
705-
706-
sample_weight : array-like of shape (n_samples,) or None
707-
Sample weights. If None, then samples are equally weighted.
708-
709-
last_mean : array-like of shape (n_features,)
710-
Mean before the incremental update.
711-
712-
last_variance : array-like of shape (n_features,) or None
713-
Variance before the incremental update.
714-
If None, variance update is not computed (in case scaling is not
715-
required).
716-
717-
last_weight_sum : array-like of shape (n_features,)
718-
Sum of weights before the incremental update.
719-
720-
Returns
721-
-------
722-
updated_mean : array of shape (n_features,)
723-
724-
updated_variance : array of shape (n_features,) or None
725-
If None, only mean is computed.
726-
727-
updated_weight_sum : array of shape (n_features,)
728-
729-
Notes
730-
-----
731-
NaNs in `X` are ignored.
732-
733-
`last_mean` and `last_variance` are statistics computed at the last step
734-
by the function. Both must be initialized to 0.0.
735-
The mean is always required (`last_mean`) and returned (`updated_mean`),
736-
whereas the variance can be None (`last_variance` and `updated_variance`).
737-
738-
For further details on the algorithm to perform the computation in a
739-
numerically stable way, see [Finch2009]_, Sections 4 and 5.
740-
741-
References
742-
----------
743-
.. [Finch2009] `Tony Finch,
744-
"Incremental calculation of weighted mean and variance",
745-
University of Cambridge Computing Service, February 2009.
746-
<https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf>`_
747-
748-
"""
749-
# last = stats before the increment
750-
# new = the current increment
751-
# updated = the aggregated stats
752-
if sample_weight is None:
753-
return _incremental_mean_and_var(X, last_mean, last_variance,
754-
last_weight_sum)
755-
nan_mask = np.isnan(X)
756-
sample_weight_T = np.reshape(sample_weight, (1, -1))
757-
# new_weight_sum with shape (n_features,)
758-
new_weight_sum = np.dot(sample_weight_T,
759-
~nan_mask).ravel().astype(np.float64)
760-
total_weight_sum = _safe_accumulator_op(np.sum, sample_weight, axis=0)
761-
762-
X_0 = np.where(nan_mask, 0, X)
763-
new_mean = np.average(X_0,
764-
weights=sample_weight, axis=0).astype(np.float64)
765-
new_mean *= total_weight_sum / new_weight_sum
766-
updated_weight_sum = last_weight_sum + new_weight_sum
767-
updated_mean = (
768-
(last_weight_sum * last_mean + new_weight_sum * new_mean)
769-
/ updated_weight_sum)
770-
771-
if last_variance is None:
772-
updated_variance = None
773-
else:
774-
X_0 = np.where(nan_mask, 0, (X-new_mean)**2)
775-
new_variance =\
776-
_safe_accumulator_op(
777-
np.average, X_0, weights=sample_weight, axis=0)
778-
new_variance *= total_weight_sum / new_weight_sum
779-
new_term = (
780-
new_weight_sum *
781-
(new_variance +
782-
(new_mean - updated_mean) ** 2))
783-
last_term = (
784-
last_weight_sum *
785-
(last_variance +
786-
(last_mean - updated_mean) ** 2))
787-
updated_variance = (new_term + last_term) / updated_weight_sum
788-
789-
return updated_mean, updated_variance, updated_weight_sum
790-
791-
792-
def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
693+
def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count,
694+
sample_weight=None):
793695
"""Calculate mean update and a Youngs and Cramer variance update.
794696
795-
last_mean and last_variance are statistics computed at the last step by the
796-
function. Both must be initialized to 0.0. In case no scaling is required
797-
last_variance can be None. The mean is always required and returned because
798-
necessary for the calculation of the variance. last_n_samples_seen is the
799-
number of samples encountered until now.
697+
If sample_weight is given, the weighted mean and variance is computed.
698+
699+
Update a given mean and (possibly) variance according to new data given
700+
in X. last_mean is always required to compute the new mean.
701+
If last_variance is None, no variance is computed and None return for
702+
updated_variance.
800703
801704
From the paper "Algorithms for computing the sample variance: analysis and
802705
recommendations", by Chan, Golub, and LeVeque.
@@ -811,13 +714,19 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
811714
last_variance : array-like of shape (n_features,)
812715
813716
last_sample_count : array-like of shape (n_features,)
717+
The number of samples encountered until now if sample_weight is None.
718+
If sample_weight is not None, this is the sum of sample_weight
719+
encountered.
720+
721+
sample_weight : array-like of shape (n_samples,) or None
722+
Sample weights. If None, compute the unweighted mean/variance.
814723
815724
Returns
816725
-------
817726
updated_mean : ndarray of shape (n_features,)
818727
819728
updated_variance : ndarray of shape (n_features,)
820-
If None, only mean is computed.
729+
None if last_variance was None.
821730
822731
updated_sample_count : ndarray of shape (n_features,)
823732
@@ -839,18 +748,28 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
839748
# new = the current increment
840749
# updated = the aggregated stats
841750
last_sum = last_mean * last_sample_count
842-
new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
751+
if sample_weight is not None:
752+
new_sum = _safe_accumulator_op(np.nansum, X * sample_weight[:, None],
753+
axis=0)
754+
new_sample_count = np.sum(sample_weight[:, None] * (~np.isnan(X)),
755+
axis=0)
756+
else:
757+
new_sum = _safe_accumulator_op(np.nansum, X, axis=0)
758+
new_sample_count = np.sum(~np.isnan(X), axis=0)
843759

844-
new_sample_count = np.sum(~np.isnan(X), axis=0)
845760
updated_sample_count = last_sample_count + new_sample_count
846761

847762
updated_mean = (last_sum + new_sum) / updated_sample_count
848763

849764
if last_variance is None:
850765
updated_variance = None
851766
else:
852-
new_unnormalized_variance = (
853-
_safe_accumulator_op(np.nanvar, X, axis=0) * new_sample_count)
767+
T = new_sum / new_sample_count
768+
if sample_weight is not None:
769+
new_unnormalized_variance = np.nansum(sample_weight[:, None] *
770+
(X - T)**2, axis=0)
771+
else:
772+
new_unnormalized_variance = np.nansum((X - T)**2, axis=0)
854773
last_unnormalized_variance = last_variance * last_sample_count
855774

856775
with np.errstate(divide='ignore', invalid='ignore'):

‎sklearn/utils/tests/test_extmath.py

Copy file name to clipboard · Expand all lines: sklearn/utils/tests/test_extmath.py
+15 −18 · Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from sklearn.utils.extmath import log_logistic
3131
from sklearn.utils.extmath import svd_flip
3232
from sklearn.utils.extmath import _incremental_mean_and_var
33-
from sklearn.utils.extmath import _incremental_weighted_mean_and_var
3433
from sklearn.utils.extmath import _deterministic_vector_sign_flip
3534
from sklearn.utils.extmath import softmax
3635
from sklearn.utils.extmath import stable_cumsum
@@ -464,8 +463,8 @@ def test_incremental_weighted_mean_and_variance_simple(rng, dtype):
464463
mult = 10
465464
X = rng.rand(1000, 20).astype(dtype)*mult
466465
sample_weight = rng.rand(X.shape[0]) * mult
467-
mean, var, _ = _incremental_weighted_mean_and_var(X, sample_weight,
468-
0, 0, 0)
466+
mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0,
467+
sample_weight=sample_weight)
469468

470469
expected_mean = np.average(X, weights=sample_weight, axis=0)
471470
expected_var = np.average(X**2, weights=sample_weight, axis=0) - \
@@ -488,11 +487,9 @@ def _assert(X, sample_weight, expected_mean, expected_var):
488487
last_mean, last_weight_sum, last_var = 0, 0, 0
489488
for batch in gen_batches(n, chunk_size):
490489
last_mean, last_var, last_weight_sum = \
491-
_incremental_weighted_mean_and_var(X[batch],
492-
sample_weight[batch],
493-
last_mean,
494-
last_var,
495-
last_weight_sum)
490+
_incremental_mean_and_var(
491+
X[batch], last_mean, last_var, last_weight_sum,
492+
sample_weight=sample_weight[batch])
496493
assert_allclose(last_mean, expected_mean)
497494
assert_allclose(last_var, expected_var, atol=1e-6)
498495

@@ -532,17 +529,17 @@ def test_incremental_weighted_mean_and_variance_ignore_nan(dtype):
532529
[300, 300, 300, np.nan]]).astype(dtype)
533530

534531
X_means, X_variances, X_count = \
535-
_incremental_weighted_mean_and_var(X,
536-
sample_weights_X,
537-
old_means,
538-
old_variances,
539-
old_weight_sum)
532+
_incremental_mean_and_var(X,
533+
old_means,
534+
old_variances,
535+
old_weight_sum,
536+
sample_weight=sample_weights_X)
540537
X_nan_means, X_nan_variances, X_nan_count = \
541-
_incremental_weighted_mean_and_var(X_nan,
542-
sample_weights_X_nan,
543-
old_means,
544-
old_variances,
545-
old_weight_sum)
538+
_incremental_mean_and_var(X_nan,
539+
old_means,
540+
old_variances,
541+
old_weight_sum,
542+
sample_weight=sample_weights_X_nan)
546543

547544
assert_allclose(X_nan_means, X_means)
548545
assert_allclose(X_nan_variances, X_variances)

0 commit comments

Comments
0 comments (0 reactions)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.