@@ -690,113 +690,16 @@ def _safe_accumulator_op(op, x, *args, **kwargs):
690
690
return result
691
691
692
692
693
- def _incremental_weighted_mean_and_var (X , sample_weight ,
694
- last_mean ,
695
- last_variance ,
696
- last_weight_sum ):
697
- """Calculate weighted mean and weighted variance incremental update.
698
-
699
- .. versionadded:: 0.24
700
-
701
- Parameters
702
- ----------
703
- X : array-like of shape (n_samples, n_features)
704
- Data to use for mean and variance update.
705
-
706
- sample_weight : array-like of shape (n_samples,) or None
707
- Sample weights. If None, then samples are equally weighted.
708
-
709
- last_mean : array-like of shape (n_features,)
710
- Mean before the incremental update.
711
-
712
- last_variance : array-like of shape (n_features,) or None
713
- Variance before the incremental update.
714
- If None, variance update is not computed (in case scaling is not
715
- required).
716
-
717
- last_weight_sum : array-like of shape (n_features,)
718
- Sum of weights before the incremental update.
719
-
720
- Returns
721
- -------
722
- updated_mean : array of shape (n_features,)
723
-
724
- updated_variance : array of shape (n_features,) or None
725
- If None, only mean is computed.
726
-
727
- updated_weight_sum : array of shape (n_features,)
728
-
729
- Notes
730
- -----
731
- NaNs in `X` are ignored.
732
-
733
- `last_mean` and `last_variance` are statistics computed at the last step
734
- by the function. Both must be initialized to 0.0.
735
- The mean is always required (`last_mean`) and returned (`updated_mean`),
736
- whereas the variance can be None (`last_variance` and `updated_variance`).
737
-
738
- For further details on the algorithm to perform the computation in a
739
- numerically stable way, see [Finch2009]_, Sections 4 and 5.
740
-
741
- References
742
- ----------
743
- .. [Finch2009] `Tony Finch,
744
- "Incremental calculation of weighted mean and variance",
745
- University of Cambridge Computing Service, February 2009.
746
- <https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf>`_
747
-
748
- """
749
- # last = stats before the increment
750
- # new = the current increment
751
- # updated = the aggregated stats
752
- if sample_weight is None :
753
- return _incremental_mean_and_var (X , last_mean , last_variance ,
754
- last_weight_sum )
755
- nan_mask = np .isnan (X )
756
- sample_weight_T = np .reshape (sample_weight , (1 , - 1 ))
757
- # new_weight_sum with shape (n_features,)
758
- new_weight_sum = np .dot (sample_weight_T ,
759
- ~ nan_mask ).ravel ().astype (np .float64 )
760
- total_weight_sum = _safe_accumulator_op (np .sum , sample_weight , axis = 0 )
761
-
762
- X_0 = np .where (nan_mask , 0 , X )
763
- new_mean = np .average (X_0 ,
764
- weights = sample_weight , axis = 0 ).astype (np .float64 )
765
- new_mean *= total_weight_sum / new_weight_sum
766
- updated_weight_sum = last_weight_sum + new_weight_sum
767
- updated_mean = (
768
- (last_weight_sum * last_mean + new_weight_sum * new_mean )
769
- / updated_weight_sum )
770
-
771
- if last_variance is None :
772
- updated_variance = None
773
- else :
774
- X_0 = np .where (nan_mask , 0 , (X - new_mean )** 2 )
775
- new_variance = \
776
- _safe_accumulator_op (
777
- np .average , X_0 , weights = sample_weight , axis = 0 )
778
- new_variance *= total_weight_sum / new_weight_sum
779
- new_term = (
780
- new_weight_sum *
781
- (new_variance +
782
- (new_mean - updated_mean ) ** 2 ))
783
- last_term = (
784
- last_weight_sum *
785
- (last_variance +
786
- (last_mean - updated_mean ) ** 2 ))
787
- updated_variance = (new_term + last_term ) / updated_weight_sum
788
-
789
- return updated_mean , updated_variance , updated_weight_sum
790
-
791
-
792
- def _incremental_mean_and_var (X , last_mean , last_variance , last_sample_count ):
693
+ def _incremental_mean_and_var (X , last_mean , last_variance , last_sample_count ,
694
+ sample_weight = None ):
793
695
"""Calculate mean update and a Youngs and Cramer variance update.
794
696
795
- last_mean and last_variance are statistics computed at the last step by the
796
- function. Both must be initialized to 0.0. In case no scaling is required
797
- last_variance can be None. The mean is always required and returned because
798
- necessary for the calculation of the variance. last_n_samples_seen is the
799
- number of samples encountered until now.
697
+ If sample_weight is given, the weighted mean and variance is computed.
698
+
699
+ Update a given mean and (possibly) variance according to new data given
700
+ in X. last_mean is always required to compute the new mean.
701
+ If last_variance is None, no variance is computed and None return for
702
+ updated_variance.
800
703
801
704
From the paper "Algorithms for computing the sample variance: analysis and
802
705
recommendations", by Chan, Golub, and LeVeque.
@@ -811,13 +714,19 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
811
714
last_variance : array-like of shape (n_features,)
812
715
813
716
last_sample_count : array-like of shape (n_features,)
717
+ The number of samples encountered until now if sample_weight is None.
718
+ If sample_weight is not None, this is the sum of sample_weight
719
+ encountered.
720
+
721
+ sample_weight : array-like of shape (n_samples,) or None
722
+ Sample weights. If None, compute the unweighted mean/variance.
814
723
815
724
Returns
816
725
-------
817
726
updated_mean : ndarray of shape (n_features,)
818
727
819
728
updated_variance : ndarray of shape (n_features,)
820
- If None, only mean is computed .
729
+ None if last_variance was None .
821
730
822
731
updated_sample_count : ndarray of shape (n_features,)
823
732
@@ -839,18 +748,28 @@ def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
839
748
# new = the current increment
840
749
# updated = the aggregated stats
841
750
last_sum = last_mean * last_sample_count
842
- new_sum = _safe_accumulator_op (np .nansum , X , axis = 0 )
751
+ if sample_weight is not None :
752
+ new_sum = _safe_accumulator_op (np .nansum , X * sample_weight [:, None ],
753
+ axis = 0 )
754
+ new_sample_count = np .sum (sample_weight [:, None ] * (~ np .isnan (X )),
755
+ axis = 0 )
756
+ else :
757
+ new_sum = _safe_accumulator_op (np .nansum , X , axis = 0 )
758
+ new_sample_count = np .sum (~ np .isnan (X ), axis = 0 )
843
759
844
- new_sample_count = np .sum (~ np .isnan (X ), axis = 0 )
845
760
updated_sample_count = last_sample_count + new_sample_count
846
761
847
762
updated_mean = (last_sum + new_sum ) / updated_sample_count
848
763
849
764
if last_variance is None :
850
765
updated_variance = None
851
766
else :
852
- new_unnormalized_variance = (
853
- _safe_accumulator_op (np .nanvar , X , axis = 0 ) * new_sample_count )
767
+ T = new_sum / new_sample_count
768
+ if sample_weight is not None :
769
+ new_unnormalized_variance = np .nansum (sample_weight [:, None ] *
770
+ (X - T )** 2 , axis = 0 )
771
+ else :
772
+ new_unnormalized_variance = np .nansum ((X - T )** 2 , axis = 0 )
854
773
last_unnormalized_variance = last_variance * last_sample_count
855
774
856
775
with np .errstate (divide = 'ignore' , invalid = 'ignore' ):
0 commit comments