scikit-learn · Seladus · Dec 21, 2021 · Dec 21, 2021 · Dec 21, 2021 · Dec 21, 2021
diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
@@ -371,6 +371,11 @@ Changelog
  then `get_output_feature_names` is not defined.
  :pr:`21569` by :user:`Aurélien Geron <ageron>`.

+- |Enhancement| Added support for `sample_weight` in :class:`preprocessing.KBinsDiscretizer`.
+  This allows specifying the parameter weights for each sample to be used while 
+  fitting. The option is only available when `strategy` is set to `quantile`.
+  :pr:`22048` by :user:`Seladus <seladus>`.
+
 - |Fix| :class:`preprocessing.LabelBinarizer` now validates input parameters in
  `fit` instead of `__init__`.
  :pr:`21434` by :user:`Krum Arnaudov <krumeto>`.

diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
@@ -17,7 +17,9 @@
 from ..utils.validation import check_is_fitted
 from ..utils.validation import check_random_state
 from ..utils.validation import _check_feature_names_in
+from ..utils.validation import _check_sample_weight
 from ..utils.validation import check_scalar
+from ..utils.stats import _weighted_percentile
 from ..utils import _safe_indexing


@@ -171,7 +173,7 @@ def __init__(
        self.subsample = subsample
        self.random_state = random_state

-    def fit(self, X, y=None):
+    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

@@ -184,6 +186,9 @@ def fit(self, X, y=None):
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

+        sample_weight : ndarray of shape (n_samples,)
+            Contains weight values to be associated with each sample.
+
        Returns
        -------
        self : object
@@ -205,6 +210,13 @@ def fit(self, X, y=None):

        n_samples, n_features = X.shape

+        valid_strategy = ("uniform", "quantile", "kmeans")
+        if self.strategy not in valid_strategy:
+            raise ValueError(
+                f"Valid options for 'strategy' are {valid_strategy}. "
+                f"Got strategy={self.strategy!r} instead."
+            )
+
        if self.strategy == "quantile" and self.subsample is not None:
            if self.subsample == "warn":
                if n_samples > 2e5:
@@ -225,6 +237,7 @@ def fit(self, X, y=None):
                        n_samples, size=self.subsample, replace=False
                    )
                    X = _safe_indexing(X, subsample_idx)
+
        elif self.strategy != "quantile" and isinstance(
            self.subsample, numbers.Integral
        ):
@@ -233,23 +246,28 @@ def fit(self, X, y=None):
                '`subsample` must be used with `strategy="quantile"`.'
            )

+        elif self.strategy != "quantile" and sample_weight is not None:
+            raise ValueError(
+                "`sample_weight` was provided but it can be only used with"
+                f"strategy='quantile'. Got strategy={self.strategy!r} instead."
+            )
+
        valid_encode = ("onehot", "onehot-dense", "ordinal")
        if self.encode not in valid_encode:
            raise ValueError(
                "Valid options for 'encode' are {}. Got encode={!r} instead.".format(
                    valid_encode, self.encode
                )
            )
-        valid_strategy = ("uniform", "quantile", "kmeans")
-        if self.strategy not in valid_strategy:
-            raise ValueError(
-                "Valid options for 'strategy' are {}. "
-                "Got strategy={!r} instead.".format(valid_strategy, self.strategy)
-            )

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

+        if sample_weight is not None:
+            sample_weight = _check_sample_weight(
+                sample_weight, X, dtype=X.dtype, copy=True
+            )
+
        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
@@ -268,8 +286,16 @@ def fit(self, X, y=None):

            elif self.strategy == "quantile":
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
-                bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
-
+                if sample_weight is None:
+                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
+                else:
+                    bin_edges[jj] = np.asarray(
+                        [
+                            _weighted_percentile(column, sample_weight, q)
+                            for q in quantiles
+                        ],
+                        dtype=np.float64,
+                    )
            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops


diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py
@@ -16,16 +16,31 @@


 @pytest.mark.parametrize(
-    "strategy, expected",
+    "strategy, expected, sample_weight",
    [
-        ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
-        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
-        ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
+        ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None),
+        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None),
+        ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None),
+        (
+            "quantile",
+            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
+            [1, 1, 2, 1],
+        ),
+        (
+            "quantile",
+            [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
+            [1, 1, 1, 1],
+        ),
+        (
+            "quantile",
+            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
+            [0, 1, 1, 1],
+        ),
    ],
 )
-def test_fit_transform(strategy, expected):
+def test_fit_transform(strategy, expected, sample_weight):
    est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
-    est.fit(X)
+    est.fit(X, sample_weight=sample_weight)
    assert_array_equal(expected, est.transform(X))


@@ -53,6 +68,20 @@ def test_invalid_n_bins():
        est.fit_transform(X)


+def test_invalid_sample_weight():
+    # sample_weight parameter is used with wrong strategy (other than quantile)
+    strategy = ["uniform", "kmeans"]
+    sample_weight = [1, 1, 1, 1]
+    for s in strategy:
+        est = KBinsDiscretizer(n_bins=3, strategy=s)
+        err_msg = (
+            "`sample_weight` was provided but it can be only used with"
+            f"strategy='quantile'. Got strategy={s!r} instead."
+        )
+        with pytest.raises(ValueError, match=err_msg):
+            est.fit_transform(X, sample_weight=sample_weight)
+
+
 def test_invalid_n_bins_array():
    # Bad shape
    n_bins = np.full((2, 4), 2.0)
@@ -92,17 +121,40 @@ def test_invalid_n_bins_array():


 @pytest.mark.parametrize(
-    "strategy, expected",
+    "strategy, expected, sample_weight",
    [
-        ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
-        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
-        ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
+        ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None),
+        ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None),
+        ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None),
+        (
+            "quantile",
+            [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
+            [1, 1, 3, 1],
+        ),
+        (
+            "quantile",
+            [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
+            [0, 1, 3, 1],
+        ),
+        # (
+        #     "quantile",
+        #     [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
+        #     [1, 1, 1, 1],
+        # ),
+        #
+        # TODO: This test case above aims to test if the case where an array of
+        #       ones passed in sample_weight parameter is equal to the case when
+        #       sample_weight is None.
+        #       Unfortunately, the behavior of `_weighted_percentile` when
+        #       `sample_weight = [1, 1, 1, 1]` are currently not equivalent.
+        #       This problem has been adressed in issue :
+        #       https://github.com/scikit-learn/scikit-learn/issues/17370
    ],
 )
-def test_fit_transform_n_bins_array(strategy, expected):
+def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
    est = KBinsDiscretizer(
        n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
-    ).fit(X)
+    ).fit(X, sample_weight=sample_weight)
    assert_array_equal(expected, est.transform(X))

    # test the shape of bin_edges_
-Original file line number
+Diff line change
@@ -16,16 +16,31 @@
     @pytest.mark.parametrize(
-        "strategy, expected",
+        "strategy, expected, sample_weight",
             Copy link

  
      
    
  

  
      

  
  Member


      

  

  
    
      

      
            glemaitre
  

      

      

      


        Dec 22, 2021


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      The test will need to be updated with the change that I asked earlier.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



  
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
    Contributor


      

  Author


  

  
    
      

      
            Seladus
  

      

      

      


        Dec 23, 2021


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      I updated the test cases that I previously pushed. Unfortunately, I'm doubting on their relevance. Would they be acceptable as is, or should I find more specific cases ?
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



  
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
  Member


      

  

  
    
      

      
            glemaitre
  

      

      

      


        Jan 4, 2022


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      I would like to have 2 specific checks that check the real behaviour of passing weights:

Check that passing None and an array of 1 will be equivalent.
Check that passing some zero weight would be equivalent to ignoring the sample and check the bins in these conditions.

    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



  
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
    Contributor


      

  Author


  

  
    
      

      
            Seladus
  

      

      

      


        Jan 5, 2022


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Unfortunately, I fail to understand why, the case where sample_weight=None for n_bins=[2, 3, 3, 3] and the case where sample_weight=[1, 1, 1, 1] for n_bins=[2, 3, 3, 3] are not equivalent (in test method test_fit_transform_n_bins_array).
The behaviors of np.percentileand _weighted_percentile with sample_weight = [1, 1, 1, 1] are supposed to be equivalent, aren't they ?
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    



  
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
  Member


      

  

  
    
      

      
            ogrisel
  

      

      

      


        Jan 5, 2022


      
  •

  
    
      
        
          edited
          
        
        
      
    
    
      
  
        
      Loading


  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  
    
  


    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      Indeed, I just checked:
>>> import numpy as np
>>> from sklearn.utils.stats import _weighted_percentile
>>> data = np.random.randn(100)
>>> np.percentile(data, [25, 50, 75])
array([-0.76701739,  0.0604021 ,  0.79485777])
>>> [_weighted_percentile(data, np.ones_like(data), p) for p in [25, 50, 75]]
[-0.7855144180141034, 0.05197426163774039, 0.7586492302901591]
It seems that this problem has been known for a while but not yet fixed:

_weighted_percentile does not lead to the same result than np.median #17370
Make _weighted_percentile more robust #6189

Old attempts to fix this problem or related problems:

[WIP] BUG make _weighted_percentile(data, ones, 50) consistent with numpy.median(data) #17377
ENH improve _weighted_percentile to provide several interpolation #17768

So for now, we can just comment out this case with a TODO comment with a link to #17370 for explain why this test is commented out.
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
  


          
          
        
      
    

    





  
        
  
    
        
    
  


      
          
  
      
            Copy link

  
      
    
  

  
      

  
    Contributor


      

  Author


  

  
    
      

      
            Seladus
  

      

      

      


        Jan 6, 2022


      
    

  


        
      
  
  
  
    

    There was a problem hiding this comment.


  

 
  
    

    Choose a reason for hiding this comment

    
      The reason will be displayed to describe this comment to others. Learn more.
    

    
      
      


  


  
    
      I commented out the faulty test case in 30edabc (l. 139-151)
    
  
  


    

        
      
  
  
    
  
  
  
    
    Sorry, something went wrong.
  

  
    
  
    
      

              Uh oh!

              
There was an error while loading. Please reload this page.


  
  


          
      
  
    
    
      
        
            
    All reactions
         [
-            ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]]),
-            ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]]),
-            ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]]),
+            ("uniform", [[0, 0, 0, 0], [1, 1, 1, 0], [2, 2, 2, 1], [2, 2, 2, 2]], None),
+            ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2]], None),
+            ("quantile", [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]], None),
+            (
+                "quantile",
+                [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
+                [1, 1, 2, 1],
+            ),
+            (
+                "quantile",
+                [[0, 0, 0, 0], [1, 1, 1, 1], [2, 2, 2, 2], [2, 2, 2, 2]],
+                [1, 1, 1, 1],
+            ),
+            (
+                "quantile",
+                [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
+                [0, 1, 1, 1],
+            ),
         ],
     )
-    def test_fit_transform(strategy, expected):
+    def test_fit_transform(strategy, expected, sample_weight):
         est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
-        est.fit(X)
+        est.fit(X, sample_weight=sample_weight)
         assert_array_equal(expected, est.transform(X))
@@ -53,6 +68,20 @@ def test_invalid_n_bins():
             est.fit_transform(X)
+    def test_invalid_sample_weight():
+        # sample_weight parameter is used with wrong strategy (other than quantile)
+        strategy = ["uniform", "kmeans"]
+        sample_weight = [1, 1, 1, 1]
+        for s in strategy:
+            est = KBinsDiscretizer(n_bins=3, strategy=s)
+            err_msg = (
+                "`sample_weight` was provided but it can be only used with"
+                f"strategy='quantile'. Got strategy={s!r} instead."
+            )
+            with pytest.raises(ValueError, match=err_msg):
+                est.fit_transform(X, sample_weight=sample_weight)
     def test_invalid_n_bins_array():
         # Bad shape
         n_bins = np.full((2, 4), 2.0)
@@ -92,17 +121,40 @@ def test_invalid_n_bins_array():
     @pytest.mark.parametrize(
-        "strategy, expected",
+        "strategy, expected, sample_weight",
         [
-            ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]]),
-            ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]]),
-            ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]]),
+            ("uniform", [[0, 0, 0, 0], [0, 1, 1, 0], [1, 2, 2, 1], [1, 2, 2, 2]], None),
+            ("kmeans", [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 2, 2, 2]], None),
+            ("quantile", [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]], None),
+            (
+                "quantile",
+                [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
+                [1, 1, 3, 1],
+            ),
+            (
+                "quantile",
+                [[0, 0, 0, 0], [0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]],
+                [0, 1, 3, 1],
+            ),
+            # (
+            #     "quantile",
+            #     [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
+            #     [1, 1, 1, 1],
+            # ),
+            #
+            # TODO: This test case above aims to test if the case where an array of
+            #       ones passed in sample_weight parameter is equal to the case when
+            #       sample_weight is None.
+            #       Unfortunately, the behavior of `_weighted_percentile` when
+            #       `sample_weight = [1, 1, 1, 1]` are currently not equivalent.
+            #       This problem has been adressed in issue :
+            #       https://github.com/scikit-learn/scikit-learn/issues/17370
         ],
     )
-    def test_fit_transform_n_bins_array(strategy, expected):
+    def test_fit_transform_n_bins_array(strategy, expected, sample_weight):
         est = KBinsDiscretizer(
             n_bins=[2, 3, 3, 3], encode="ordinal", strategy=strategy
-        ).fit(X)
+        ).fit(X, sample_weight=sample_weight)
         assert_array_equal(expected, est.transform(X))
         # test the shape of bin_edges_
-          Expand Down