10
10
assert_array_almost_equal ,
11
11
assert_array_equal ,
12
12
assert_allclose_dense_sparse ,
13
+ assert_allclose ,
13
14
)
14
15
15
16
X = [[- 2 , 1.5 , - 4 , - 1 ], [- 1 , 2.5 , - 3 , - 0.5 ], [0 , 3.5 , - 2 , 0.5 ], [1 , 4.5 , - 1 , 2 ]]
16
17
17
18
18
19
@pytest .mark .parametrize (
19
- "strategy, expected" ,
20
+ "strategy, expected, sample_weight " ,
20
21
[
21
- ("uniform" , [[0 , 0 , 0 , 0 ], [1 , 1 , 1 , 0 ], [2 , 2 , 2 , 1 ], [2 , 2 , 2 , 2 ]]),
22
- ("kmeans" , [[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [2 , 2 , 2 , 2 ]]),
23
- ("quantile" , [[0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [2 , 2 , 2 , 2 ], [2 , 2 , 2 , 2 ]]),
22
+ ("uniform" , [[0 , 0 , 0 , 0 ], [1 , 1 , 1 , 0 ], [2 , 2 , 2 , 1 ], [2 , 2 , 2 , 2 ]], None ),
23
+ ("kmeans" , [[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [2 , 2 , 2 , 2 ]], None ),
24
+ ("quantile" , [[0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [2 , 2 , 2 , 2 ], [2 , 2 , 2 , 2 ]], None ),
25
+ (
26
+ "quantile" ,
27
+ [[0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [2 , 2 , 2 , 2 ], [2 , 2 , 2 , 2 ]],
28
+ [1 , 1 , 2 , 1 ],
29
+ ),
30
+ (
31
+ "quantile" ,
32
+ [[0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [2 , 2 , 2 , 2 ], [2 , 2 , 2 , 2 ]],
33
+ [1 , 1 , 1 , 1 ],
34
+ ),
35
+ (
36
+ "quantile" ,
37
+ [[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [1 , 1 , 1 , 1 ]],
38
+ [0 , 1 , 1 , 1 ],
39
+ ),
24
40
],
25
41
)
26
- def test_fit_transform (strategy , expected ):
42
+ def test_fit_transform (strategy , expected , sample_weight ):
27
43
est = KBinsDiscretizer (n_bins = 3 , encode = "ordinal" , strategy = strategy )
28
- est .fit (X )
44
+ est .fit (X , sample_weight = sample_weight )
29
45
assert_array_equal (expected , est .transform (X ))
30
46
31
47
@@ -35,6 +51,18 @@ def test_valid_n_bins():
35
51
assert KBinsDiscretizer (n_bins = 2 ).fit (X ).n_bins_ .dtype == np .dtype (int )
36
52
37
53
54
+ @pytest .mark .parametrize ("strategy" , ["uniform" , "kmeans" ])
55
+ def test_kbinsdiscretizer_wrong_strategy_with_weights (strategy ):
56
+ """Check that we raise an error when the wrong strategy is used."""
57
+ sample_weight = np .ones (shape = (len (X )))
58
+ est = KBinsDiscretizer (n_bins = 3 , strategy = strategy )
59
+ err_msg = (
60
+ "`sample_weight` was provided but it can only be used with strategy='quantile'."
61
+ )
62
+ with pytest .raises (ValueError , match = err_msg ):
63
+ est .fit (X , sample_weight = sample_weight )
64
+
65
+
38
66
def test_invalid_n_bins_array ():
39
67
# Bad shape
40
68
n_bins = np .full ((2 , 4 ), 2.0 )
@@ -74,17 +102,40 @@ def test_invalid_n_bins_array():
74
102
75
103
76
104
@pytest .mark .parametrize (
77
- "strategy, expected" ,
105
+ "strategy, expected, sample_weight " ,
78
106
[
79
- ("uniform" , [[0 , 0 , 0 , 0 ], [0 , 1 , 1 , 0 ], [1 , 2 , 2 , 1 ], [1 , 2 , 2 , 2 ]]),
80
- ("kmeans" , [[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [1 , 2 , 2 , 2 ]]),
81
- ("quantile" , [[0 , 0 , 0 , 0 ], [0 , 1 , 1 , 1 ], [1 , 2 , 2 , 2 ], [1 , 2 , 2 , 2 ]]),
107
+ ("uniform" , [[0 , 0 , 0 , 0 ], [0 , 1 , 1 , 0 ], [1 , 2 , 2 , 1 ], [1 , 2 , 2 , 2 ]], None ),
108
+ ("kmeans" , [[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [1 , 2 , 2 , 2 ]], None ),
109
+ ("quantile" , [[0 , 0 , 0 , 0 ], [0 , 1 , 1 , 1 ], [1 , 2 , 2 , 2 ], [1 , 2 , 2 , 2 ]], None ),
110
+ (
111
+ "quantile" ,
112
+ [[0 , 0 , 0 , 0 ], [0 , 1 , 1 , 1 ], [1 , 2 , 2 , 2 ], [1 , 2 , 2 , 2 ]],
113
+ [1 , 1 , 3 , 1 ],
114
+ ),
115
+ (
116
+ "quantile" ,
117
+ [[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ], [1 , 1 , 1 , 1 ], [1 , 1 , 1 , 1 ]],
118
+ [0 , 1 , 3 , 1 ],
119
+ ),
120
+ # (
121
+ # "quantile",
122
+ # [[0, 0, 0, 0], [0, 1, 1, 1], [1, 2, 2, 2], [1, 2, 2, 2]],
123
+ # [1, 1, 1, 1],
124
+ # ),
125
+ #
126
+ # TODO: The test case above aims to check that passing an array of ones
127
+ # as the `sample_weight` parameter is equivalent to passing
128
+ # `sample_weight=None`.
129
+ # Unfortunately, the behavior of `_weighted_percentile` with
130
+ # `sample_weight = [1, 1, 1, 1]` is currently not equivalent to the unweighted case.
131
+ # This problem has been addressed in issue:
132
+ # https://github.com/scikit-learn/scikit-learn/issues/17370
82
133
],
83
134
)
84
- def test_fit_transform_n_bins_array (strategy , expected ):
135
+ def test_fit_transform_n_bins_array (strategy , expected , sample_weight ):
85
136
est = KBinsDiscretizer (
86
137
n_bins = [2 , 3 , 3 , 3 ], encode = "ordinal" , strategy = strategy
87
- ).fit (X )
138
+ ).fit (X , sample_weight = sample_weight )
88
139
assert_array_equal (expected , est .transform (X ))
89
140
90
141
# test the shape of bin_edges_
@@ -94,6 +145,27 @@ def test_fit_transform_n_bins_array(strategy, expected):
94
145
assert bin_edges .shape == (n_bins + 1 ,)
95
146
96
147
148
+ @pytest .mark .filterwarnings ("ignore: Bins whose width are too small" )
149
+ def test_kbinsdiscretizer_effect_sample_weight ():
150
+ """Check the impact of `sample_weight` on computed quantiles."""
151
+ X = np .array ([[- 2 ], [- 1 ], [1 ], [3 ], [500 ], [1000 ]])
152
+ # add a large number of bins such that each sample with a non-null weight
153
+ # will be used as bin edge
154
+ est = KBinsDiscretizer (n_bins = 10 , encode = "ordinal" , strategy = "quantile" )
155
+ est .fit (X , sample_weight = [1 , 1 , 1 , 1 , 0 , 0 ])
156
+ assert_allclose (est .bin_edges_ [0 ], [- 2 , - 1 , 1 , 3 ])
157
+ assert_allclose (est .transform (X ), [[0.0 ], [1.0 ], [2.0 ], [2.0 ], [2.0 ], [2.0 ]])
158
+
159
+
160
+ def test_kbinsdiscretizer_no_mutating_sample_weight ():
161
+ """Make sure that `sample_weight` is not changed in place."""
162
+ est = KBinsDiscretizer (n_bins = 3 , encode = "ordinal" , strategy = "quantile" )
163
+ sample_weight = np .array ([1 , 3 , 1 , 2 ], dtype = np .float64 )
164
+ sample_weight_copy = np .copy (sample_weight )
165
+ est .fit (X , sample_weight = sample_weight )
166
+ assert_allclose (sample_weight , sample_weight_copy )
167
+
168
+
97
169
@pytest .mark .parametrize ("strategy" , ["uniform" , "kmeans" , "quantile" ])
98
170
def test_same_min_max (strategy ):
99
171
warnings .simplefilter ("always" )
0 commit comments