     assert_almost_equal,
     assert_array_almost_equal,
     assert_array_equal,
+    assert_array_less,
     ignore_warnings,
 )
 from sklearn.utils.fixes import COO_CONTAINERS, CSC_CONTAINERS, CSR_CONTAINERS
@@ -1304,55 +1305,78 @@ def test_enet_sample_weight_consistency(
 @pytest.mark.parametrize("fit_intercept", [True, False])
 @pytest.mark.parametrize("sparse_container", [None] + CSC_CONTAINERS)
-def test_enet_cv_sample_weight_correctness(fit_intercept, sparse_container):
-    """Test that ElasticNetCV with sample weights gives correct results."""
-    rng = np.random.RandomState(42)
-    n_splits, n_samples, n_features = 3, 10, 5
-    X = rng.rand(n_splits * n_samples, n_features)
+def test_enet_cv_sample_weight_correctness(
+    fit_intercept, sparse_container, global_random_seed
+):
+    """Test that ElasticNetCV with sample weights gives correct results.
+
+    We fit the same model twice, once with weighted training data, once with repeated
+    data points in the training data and check that both models converge to the
+    same solution.
+
+    Since this model uses an internal cross-validation scheme to tune the alpha
+    regularization parameter, we make sure that the repetitions only occur within
+    a specific CV group. Data points belonging to other CV groups stay
+    unit-weighted / "unrepeated".
+    """
+    rng = np.random.RandomState(global_random_seed)
+    n_splits, n_samples_per_cv, n_features = 3, 10, 5
+    X_with_weights = rng.rand(n_splits * n_samples_per_cv, n_features)
     beta = rng.rand(n_features)
     beta[0:2] = 0
-    y = X @ beta + rng.rand(n_splits * n_samples)
-    sw = np.ones_like(y)
+    y_with_weights = X_with_weights @ beta + rng.rand(n_splits * n_samples_per_cv)
+
     if sparse_container is not None:
-        X = sparse_container(X)
+        X_with_weights = sparse_container(X_with_weights)
     params = dict(tol=1e-6)

-    # Set alphas, otherwise the two cv models might use different ones.
-    if fit_intercept:
-        alphas = np.linspace(0.001, 0.01, num=91)
-    else:
-        alphas = np.linspace(0.01, 0.1, num=91)
-
-    # We weight the first fold 2 times more.
-    sw[:n_samples] = 2
-    groups_sw = np.r_[
-        np.full(n_samples, 0), np.full(n_samples, 1), np.full(n_samples, 2)
-    ]
-    splits_sw = list(LeaveOneGroupOut().split(X, groups=groups_sw))
-    reg_sw = ElasticNetCV(
-        alphas=alphas, cv=splits_sw, fit_intercept=fit_intercept, **params
+    # Assign random integer weights only to the first cross-validation group.
+    # The samples in the other cross-validation groups are left with unit
+    # weights.
+
+    sw = np.ones_like(y_with_weights)
+    sw[:n_samples_per_cv] = rng.randint(0, 5, size=n_samples_per_cv)
+    groups_with_weights = np.concatenate(
+        [
+            np.full(n_samples_per_cv, 0),
+            np.full(n_samples_per_cv, 1),
+            np.full(n_samples_per_cv, 2),
+        ]
+    )
+    splits_with_weights = list(
+        LeaveOneGroupOut().split(X_with_weights, groups=groups_with_weights)
+    )
+    reg_with_weights = ElasticNetCV(
+        cv=splits_with_weights, fit_intercept=fit_intercept, **params
     )
-    reg_sw.fit(X, y, sample_weight=sw)

-    # We repeat the first fold 2 times and provide splits ourselves
+    reg_with_weights.fit(X_with_weights, y_with_weights, sample_weight=sw)
+
     if sparse_container is not None:
-        X = X.toarray()
-        X = np.r_[X[:n_samples], X]
+        X_with_weights = X_with_weights.toarray()
+    X_with_repetitions = np.repeat(X_with_weights, sw.astype(int), axis=0)
     if sparse_container is not None:
-        X = sparse_container(X)
-    y = np.r_[y[:n_samples], y]
-    groups = np.r_[
-        np.full(2 * n_samples, 0), np.full(n_samples, 1), np.full(n_samples, 2)
-    ]
-    splits = list(LeaveOneGroupOut().split(X, groups=groups))
-    reg = ElasticNetCV(alphas=alphas, cv=splits, fit_intercept=fit_intercept, **params)
-    reg.fit(X, y)
+        X_with_repetitions = sparse_container(X_with_repetitions)
+
+    y_with_repetitions = np.repeat(y_with_weights, sw.astype(int), axis=0)
+    groups_with_repetitions = np.repeat(groups_with_weights, sw.astype(int), axis=0)
+
+    splits_with_repetitions = list(
+        LeaveOneGroupOut().split(X_with_repetitions, groups=groups_with_repetitions)
+    )
+    reg_with_repetitions = ElasticNetCV(
+        cv=splits_with_repetitions, fit_intercept=fit_intercept, **params
+    )
+    reg_with_repetitions.fit(X_with_repetitions, y_with_repetitions)

-    # ensure that we chose meaningful alphas, i.e. not boundaries
-    assert alphas[0] < reg.alpha_ < alphas[-1]
-    assert reg_sw.alpha_ == reg.alpha_
-    assert_allclose(reg_sw.coef_, reg.coef_)
-    assert reg_sw.intercept_ == pytest.approx(reg.intercept_)
+    # Check that the alpha selection process is the same:
+    assert_allclose(reg_with_weights.mse_path_, reg_with_repetitions.mse_path_)
+    assert_allclose(reg_with_weights.alphas_, reg_with_repetitions.alphas_)
+    assert reg_with_weights.alpha_ == pytest.approx(reg_with_repetitions.alpha_)
+
+    # Check that the final model coefficients are the same:
+    assert_allclose(reg_with_weights.coef_, reg_with_repetitions.coef_, atol=1e-10)
+    assert reg_with_weights.intercept_ == pytest.approx(reg_with_repetitions.intercept_)


 @pytest.mark.parametrize("sample_weight", [False, True])
@@ -1444,9 +1468,29 @@ def test_enet_cv_sample_weight_consistency(
     assert_allclose(reg.intercept_, intercept)


+@pytest.mark.parametrize("X_is_sparse", [False, True])
+@pytest.mark.parametrize("fit_intercept", [False, True])
+@pytest.mark.parametrize("sample_weight", [np.array([10, 1, 10, 1]), None])
+def test_enet_alpha_max_sample_weight(X_is_sparse, fit_intercept, sample_weight):
+    X = np.array([[3.0, 1.0], [2.0, 5.0], [5.0, 3.0], [1.0, 4.0]])
+    beta = np.array([1, 1])
+    y = X @ beta
+    if X_is_sparse:
+        X = sparse.csc_matrix(X)
+    # Test alpha_max makes coefs zero.
+    reg = ElasticNetCV(n_alphas=1, cv=2, eps=1, fit_intercept=fit_intercept)
+    reg.fit(X, y, sample_weight=sample_weight)
+    assert_allclose(reg.coef_, 0, atol=1e-5)
+    alpha_max = reg.alpha_
+    # Test smaller alpha makes coefs nonzero.
+    reg = ElasticNet(alpha=0.99 * alpha_max, fit_intercept=fit_intercept)
+    reg.fit(X, y, sample_weight=sample_weight)
+    assert_array_less(1e-3, np.max(np.abs(reg.coef_)))
+
+
 @pytest.mark.parametrize("estimator", [ElasticNetCV, LassoCV])
 def test_linear_models_cv_fit_with_loky(estimator):
-    # LinearModelsCV.fit performs inplace operations on fancy-indexed memmapped
+    # LinearModelsCV.fit performs operations on fancy-indexed memmapped
     # data when using the loky backend, causing an error due to unexpected
     # behavior of fancy indexing of read-only memmaps (cf. numpy#14132).
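For context on what the new `test_enet_alpha_max_sample_weight` checks: there is a smallest penalty that drives all coefficients to zero, and passing `n_alphas=1` together with `eps=1` collapses the CV alpha grid to that single value, so `reg.alpha_` is exactly the estimator's computed `alpha_max`. For the unweighted, no-intercept Lasso objective `1/(2n) * ||y - Xw||^2 + alpha * ||w||_1`, the threshold has the well-known closed form `alpha_max = max_j |x_j^T y| / n` (from the KKT condition at `w = 0`); the elastic-net grid scales the analogous quantity by `1 / l1_ratio`, and sample weights enter through the weighted correlations. A minimal standalone sketch of the unweighted Lasso case (illustrative, not part of the patch):

```python
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
y = X @ np.array([1.0, 2.0, 0.5]) + 0.01 * rng.rand(20)

# KKT condition at w = 0 requires |x_j^T y| / n <= alpha for all j,
# so the boundary penalty is:
alpha_max = np.max(np.abs(X.T @ y)) / X.shape[0]

at_max = Lasso(alpha=alpha_max, fit_intercept=False).fit(X, y)
below = Lasso(alpha=0.99 * alpha_max, fit_intercept=False).fit(X, y)

assert np.allclose(at_max.coef_, 0.0, atol=1e-10)  # fully sparse at alpha_max
assert np.max(np.abs(below.coef_)) > 0.0  # a coefficient activates just below it
```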