scikit-learn
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
Lines changed: 60 additions & 32 deletions
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
Lines changed: 0 additions & 1 deletion
diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py
Lines changed: 31 additions & 44 deletions
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -27,10 +27,10 @@
 from .preprocessing import binarize
 from .preprocessing import LabelBinarizer
 from .preprocessing import label_binarize
-from .utils import check_X_y, check_array, deprecated
+from .utils import deprecated
 from .utils.extmath import safe_sparse_dot
 from .utils.multiclass import _check_partial_fit_first_call
-from .utils.validation import check_is_fitted, check_non_negative, column_or_1d
+from .utils.validation import check_is_fitted, check_non_negative
 from .utils.validation import _check_sample_weight
 from .utils.validation import _deprecate_positional_args
 
@@ -55,7 +55,10 @@ def _joint_log_likelihood(self, X):
 
     @abstractmethod
     def _check_X(self, X):
-        """To be overridden in subclasses with the actual checks."""
+        """To be overridden in subclasses with the actual checks.
+
+        Only used in predict* methods.
+        """
 
     def predict(self, X):
         """
@@ -214,12 +217,12 @@ def fit(self, X, y, sample_weight=None):
         self : object
         """
         X, y = self._validate_data(X, y)
-        y = column_or_1d(y, warn=True)
         return self._partial_fit(X, y, np.unique(y), _refit=True,
                                  sample_weight=sample_weight)
 
     def _check_X(self, X):
-        return check_array(X)
+        """Validate X, used only in predict* methods."""
+        return self._validate_data(X, reset=False)
 
     @staticmethod
     def _update_mean_variance(n_past, mu, var, X, sample_weight=None):
@@ -367,7 +370,11 @@ def _partial_fit(self, X, y, classes=None, _refit=False,
         -------
         self : object
         """
-        X, y = check_X_y(X, y)
+        if _refit:
+            self.classes_ = None
+
+        first_call = _check_partial_fit_first_call(self, classes)
+        X, y = self._validate_data(X, y, reset=first_call)
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
 
@@ -377,10 +384,7 @@ def _partial_fit(self, X, y, classes=None, _refit=False,
         # deviation of the largest dimension.
         self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()
 
-        if _refit:
-            self.classes_ = None
-
-        if _check_partial_fit_first_call(self, classes):
+        if first_call:
             # This is the first call to partial_fit:
             # initialize various cumulative counters
             n_features = X.shape[1]
@@ -488,10 +492,12 @@ class _BaseDiscreteNB(_BaseNB):
     """
 
     def _check_X(self, X):
-        return check_array(X, accept_sparse='csr')
+        """Validate X, used only in predict* methods."""
+        return self._validate_data(X, accept_sparse='csr', reset=False)
 
-    def _check_X_y(self, X, y):
-        return self._validate_data(X, y, accept_sparse='csr')
+    def _check_X_y(self, X, y, reset=True):
+        """Validate X and y in fit methods."""
+        return self._validate_data(X, y, accept_sparse='csr', reset=reset)
 
     def _update_class_log_prior(self, class_prior=None):
         n_classes = len(self.classes_)
@@ -518,7 +524,7 @@ def _check_alpha(self):
             raise ValueError('Smoothing parameter alpha = %.1e. '
                              'alpha should be > 0.' % np.min(self.alpha))
         if isinstance(self.alpha, np.ndarray):
-            if not self.alpha.shape[0] == self.n_features_:
+            if not self.alpha.shape[0] == self.n_features_in_:
                 raise ValueError("alpha should be a scalar or a numpy array "
                                  "with shape [n_features]")
         if np.min(self.alpha) < _ALPHA_MIN:
@@ -563,18 +569,15 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
         -------
         self : object
         """
-        X, y = self._check_X_y(X, y)
+        first_call = not hasattr(self, "classes_")
+        X, y = self._check_X_y(X, y, reset=first_call)
         _, n_features = X.shape
 
         if _check_partial_fit_first_call(self, classes):
             # This is the first call to partial_fit:
             # initialize various cumulative counters
             n_classes = len(classes)
             self._init_counters(n_classes, n_features)
-            self.n_features_ = n_features
-        elif n_features != self.n_features_:
-            msg = "Number of features %d does not match previous data %d."
-            raise ValueError(msg % (n_features, self.n_features_))
 
         Y = label_binarize(y, classes=self.classes_)
         if Y.shape[1] == 1:
@@ -631,7 +634,6 @@ def fit(self, X, y, sample_weight=None):
         """
         X, y = self._check_X_y(X, y)
         _, n_features = X.shape
-        self.n_features_ = n_features
 
         labelbin = LabelBinarizer()
         Y = labelbin.fit_transform(y)
@@ -687,6 +689,16 @@ def intercept_(self):
     def _more_tags(self):
         return {'poor_score': True}
 
+    # TODO: Remove in 1.2
+    # mypy error: Decorated property not supported
+    @deprecated(  # type: ignore
+        "Attribute n_features_ was deprecated in version 1.0 and will be "
+        "removed in 1.2. Use 'n_features_in_' instead."
+    )
+    @property
+    def n_features_(self):
+        return self.n_features_in_
+
 
 class MultinomialNB(_BaseDiscreteNB):
     """
@@ -753,6 +765,10 @@ class MultinomialNB(_BaseDiscreteNB):
     n_features_ : int
         Number of features of each sample.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     Examples
     --------
     >>> import numpy as np
@@ -879,6 +895,10 @@ class ComplementNB(_BaseDiscreteNB):
     n_features_ : int
         Number of features of each sample.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     Examples
     --------
     >>> import numpy as np
@@ -996,6 +1016,10 @@ class BernoulliNB(_BaseDiscreteNB):
     n_features_ : int
         Number of features of each sample.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     Examples
     --------
     >>> import numpy as np
@@ -1032,13 +1056,14 @@ def __init__(self, *, alpha=1.0, binarize=.0, fit_prior=True,
         self.class_prior = class_prior
 
     def _check_X(self, X):
+        """Validate X, used only in predict* methods."""
         X = super()._check_X(X)
         if self.binarize is not None:
             X = binarize(X, threshold=self.binarize)
         return X
 
-    def _check_X_y(self, X, y):
-        X, y = super()._check_X_y(X, y)
+    def _check_X_y(self, X, y, reset=True):
+        X, y = super()._check_X_y(X, y, reset=reset)
         if self.binarize is not None:
             X = binarize(X, threshold=self.binarize)
         return X, y
@@ -1133,6 +1158,10 @@ class CategoricalNB(_BaseDiscreteNB):
     n_features_ : int
         Number of features of each sample.
 
+        .. deprecated:: 1.0
+            Attribute `n_features_` was deprecated in version 1.0 and will be
+            removed in 1.2. Use `n_features_in_` instead.
+
     n_categories_ : ndarray of shape (n_features,), dtype=np.int64
         Number of categories for each feature. This value is
         inferred from the data or set by the minimum number of categories.
@@ -1235,14 +1264,15 @@ def _more_tags(self):
         return {'requires_positive_X': True}
 
     def _check_X(self, X):
-        X = check_array(X, dtype='int', accept_sparse=False,
-                        force_all_finite=True)
+        """Validate X, used only in predict* methods."""
+        X = self._validate_data(X, dtype='int', accept_sparse=False,
+                                force_all_finite=True, reset=False)
         check_non_negative(X, "CategoricalNB (input X)")
         return X
 
-    def _check_X_y(self, X, y):
+    def _check_X_y(self, X, y, reset=True):
         X, y = self._validate_data(X, y, dtype='int', accept_sparse=False,
-                                   force_all_finite=True)
+                                   force_all_finite=True, reset=reset)
         check_non_negative(X, "CategoricalNB (input X)")
         return X, y
 
@@ -1297,7 +1327,7 @@ def _update_cat_count(X_feature, Y, cat_count, n_classes):
         self.class_count_ += Y.sum(axis=0)
         self.n_categories_ = self._validate_n_categories(
             X, self.min_categories)
-        for i in range(self.n_features_):
+        for i in range(self.n_features_in_):
             X_feature = X[:, i]
             self.category_count_[i] = _update_cat_count_dims(
                 self.category_count_[i], self.n_categories_[i] - 1)
@@ -1307,7 +1337,7 @@ def _update_cat_count(X_feature, Y, cat_count, n_classes):
 
     def _update_feature_log_prob(self, alpha):
         feature_log_prob = []
-        for i in range(self.n_features_):
+        for i in range(self.n_features_in_):
             smoothed_cat_count = self.category_count_[i] + alpha
             smoothed_class_count = smoothed_cat_count.sum(axis=1)
             feature_log_prob.append(
@@ -1316,11 +1346,9 @@ def _update_feature_log_prob(self, alpha):
         self.feature_log_prob_ = feature_log_prob
 
     def _joint_log_likelihood(self, X):
-        if not X.shape[1] == self.n_features_:
-            raise ValueError("Expected input with %d features, got %d instead"
-                             % (self.n_features_, X.shape[1]))
+        self._check_n_features(X, reset=False)
         jll = np.zeros((X.shape[0], self.class_count_.shape[0]))
-        for i in range(self.n_features_):
+        for i in range(self.n_features_in_):
             indices = X[:, i]
             jll += self.feature_log_prob_[i][:, indices].T
         total_ll = jll + self.class_log_prior_
 
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -273,7 +273,6 @@ def test_search_cv(estimator, check, request):
     'model_selection',
     'multiclass',
     'multioutput',
-    'naive_bayes',
     'pipeline',
     'random_projection',
 }
 
--- a/sklearn/tests/test_naive_bayes.py
+++ b/sklearn/tests/test_naive_bayes.py
@@ -57,7 +57,11 @@ def test_gnb():
     # Test whether label mismatch between target y and classes raises
     # an Error
     # FIXME Remove this test once the more general partial_fit tests are merged
-    assert_raises(ValueError, GaussianNB().partial_fit, X, y, classes=[0, 1])
+    with pytest.raises(
+        ValueError,
+        match="The target label.* in y do not exist in the initial classes"
+    ):
+        GaussianNB().partial_fit(X, y, classes=[0, 1])
 
 
 # TODO remove in 1.2 once sigma_ attribute is removed (GH #18842)
@@ -74,7 +78,7 @@ def test_gnb_prior():
     clf = GaussianNB().fit(X, y)
     assert_array_almost_equal(np.array([3, 3]) / 6.0,
                               clf.class_prior_, 8)
-    clf.fit(X1, y1)
+    clf = GaussianNB().fit(X1, y1)
     # Check that the class priors sum to 1
     assert_array_almost_equal(clf.class_prior_.sum(), 1)
 
@@ -171,16 +175,6 @@ def test_gnb_check_update_with_no_data():
     assert tvar == var
 
 
-def test_gnb_pfit_wrong_nb_features():
-    """Test whether an error is raised when the number of feature changes
-    between two partial fit"""
-    clf = GaussianNB()
-    # Fit for the first time the GNB
-    clf.fit(X, y)
-    # Partial fit a second time with an incoherent X
-    assert_raises(ValueError, clf.partial_fit, np.hstack((X, X)), y)
-
-
 def test_gnb_partial_fit():
     clf = GaussianNB().fit(X, y)
     clf_pf = GaussianNB().partial_fit(X, y, np.unique(y))
@@ -272,37 +266,22 @@ def test_discretenb_partial_fit(DiscreteNaiveBayes):
 
 
 @pytest.mark.parametrize('NaiveBayes', ALL_NAIVE_BAYES_CLASSES)
-def test_naive_bayes_input_check_fit(NaiveBayes):
-    # Test input checks for the fit method
-
-    # check shape consistency for number of samples at fit time
-    assert_raises(ValueError, NaiveBayes().fit, X2, y2[:-1])
-
-    # check shape consistency for number of input features at predict time
-    clf = NaiveBayes().fit(X2, y2)
-    assert_raises(ValueError, clf.predict, X2[:, :-1])
-
-
-@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES)
-def test_discretenb_input_check_partial_fit(DiscreteNaiveBayes):
-    # check shape consistency
-    assert_raises(ValueError, DiscreteNaiveBayes().partial_fit, X2, y2[:-1],
-                  classes=np.unique(y2))
-
+def test_NB_partial_fit_no_first_classes(NaiveBayes):
     # classes is required for first call to partial fit
-    assert_raises(ValueError, DiscreteNaiveBayes().partial_fit, X2, y2)
+    with pytest.raises(
+        ValueError,
+        match="classes must be passed on the first call to partial_fit."
+    ):
+        NaiveBayes().partial_fit(X2, y2)
 
     # check consistency of consecutive classes values
-    clf = DiscreteNaiveBayes()
+    clf = NaiveBayes()
     clf.partial_fit(X2, y2, classes=np.unique(y2))
-    assert_raises(ValueError, clf.partial_fit, X2, y2,
-                  classes=np.arange(42))
-
-    # check consistency of input shape for partial_fit
-    assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2)
-
-    # check consistency of input shape for predict
-    assert_raises(ValueError, clf.predict, X2[:, :-1])
+    with pytest.raises(
+        ValueError,
+        match="is not the same as on last call to partial_fit"
+    ):
+        clf.partial_fit(X2, y2, classes=np.arange(42))
 
 
 # TODO: Remove in version 1.1
@@ -725,11 +704,6 @@ def test_categoricalnb():
     assert_raise_message(ValueError, error_msg, clf.predict, X)
     assert_raise_message(ValueError, error_msg, clf.fit, X, y)
 
-    # Check error is raised for incorrect X
-    X = np.array([[1, 4, 1], [2, 5, 6]])
-    msg = "Expected input with 2 features, got 3 instead"
-    assert_raise_message(ValueError, msg, clf.predict, X)
-
     # Test alpha
     X3_test = np.array([[2, 5]])
     # alpha=1 increases the count of all categories by one so the final
@@ -941,3 +915,16 @@ def test_check_accuracy_on_digits():
 
     scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10)
     assert scores.mean() > 0.86
+
+
+# FIXME: remove in 1.2
+@pytest.mark.parametrize("Estimator", DISCRETE_NAIVE_BAYES_CLASSES)
+def test_n_features_deprecation(Estimator):
+    # Check that we raise the proper deprecation warning if accessing
+    # `n_features_`.
+    X = np.array([[1, 2], [3, 4]])
+    y = np.array([1, 0])
+    est = Estimator().fit(X, y)
+
+    with pytest.warns(FutureWarning, match="n_features_ was deprecated"):
+        est.n_features_
Original file line number	Diff line number	Diff line change
`@@ -273,7 +273,6 @@ def test_search_cv(estimator, check, request):`
`273`	`273`	`'model_selection',`
`274`	`274`	`'multiclass',`
`275`	`275`	`'multioutput',`
`276`		`- 'naive_bayes',`
`277`	`276`	`'pipeline',`
`278`	`277`	`'random_projection',`
`279`	`278`	`}`