diff --git a/doc/release/upcoming_changes/28426.change.rst b/doc/release/upcoming_changes/28426.change.rst new file mode 100644 index 000000000000..d1c48640eed0 --- /dev/null +++ b/doc/release/upcoming_changes/28426.change.rst @@ -0,0 +1,6 @@ +Changes to automatic bin selection in numpy.histogram +----------------------------------------------------- +The automatic bin selection algorithm in ``numpy.histogram`` has been modified +to avoid out-of-memory errors for samples with low variation. +For full control over the selected bins the user can set +the ``bins`` or ``range`` parameters of ``numpy.histogram``. diff --git a/numpy/lib/_histograms_impl.py b/numpy/lib/_histograms_impl.py index 643e5f35c85f..a19eb7f0e70a 100644 --- a/numpy/lib/_histograms_impl.py +++ b/numpy/lib/_histograms_impl.py @@ -228,9 +228,10 @@ def _hist_bin_fd(x, range): def _hist_bin_auto(x, range): """ - Histogram bin estimator that uses the minimum width of the - Freedman-Diaconis and Sturges estimators if the FD bin width is non-zero. - If the bin width from the FD estimator is 0, the Sturges estimator is used. + Histogram bin estimator that uses the minimum width of a relaxed + Freedman-Diaconis estimator and the Sturges estimator, so that the FD bin width + does not result in a large number of bins. The relaxed Freedman-Diaconis estimator + limits the bin width to at least half that of the sqrt estimator to avoid small bins. The FD estimator is usually the most robust method, but its width estimate tends to be too large for small `x` and bad for data with limited @@ -238,18 +239,13 @@ def _hist_bin_auto(x, range): and is the default in the R language. This method gives good off-the-shelf behaviour. - If there is limited variance the IQR can be 0, which results in the - FD bin width being 0 too. This is not a valid bin width, so - ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal. 
- If the IQR is 0, it's unlikely any variance-based estimators will be of - use, so we revert to the Sturges estimator, which only uses the size of the - dataset in its calculation. Parameters ---------- x : array_like Input data that is to be histogrammed, trimmed to range. May not be empty. + range : Tuple with range for the histogram Returns ------- @@ -261,12 +257,10 @@ def _hist_bin_auto(x, range): """ fd_bw = _hist_bin_fd(x, range) sturges_bw = _hist_bin_sturges(x, range) - del range # unused - if fd_bw: - return min(fd_bw, sturges_bw) - else: - # limited variance, so we return a len dependent bw estimator - return sturges_bw + sqrt_bw = _hist_bin_sqrt(x, range) + # heuristic to limit the maximal number of bins + fd_bw_corrected = max(fd_bw, sqrt_bw / 2) + return min(fd_bw_corrected, sturges_bw) # Private dict initialized at module load time diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py index 8d19d013a43c..42801e746283 100644 --- a/numpy/lib/tests/test_histograms.py +++ b/numpy/lib/tests/test_histograms.py @@ -416,6 +416,13 @@ def test_gh_23110(self): expected_hist = np.array([1, 0]) assert_array_equal(hist, expected_hist) + def test_gh_28400(self): + e = 1 + 1e-12 + Z = [0, 1, 1, 1, 1, 1, e, e, e, e, e, e, 2] + counts, edges = np.histogram(Z, bins="auto") + assert len(counts) < 10 + assert edges[0] == Z[0] + assert edges[-1] == Z[-1] class TestHistogramOptimBinNums: """ @@ -502,15 +509,16 @@ def test_novariance(self): def test_limited_variance(self): """ - Check when IQR is 0, but variance exists, we return the sturges value - and not the fd value. + Check when IQR is 0, but variance exists, we return a reasonable value. """ lim_var_data = np.ones(1000) lim_var_data[:3] = 0 lim_var_data[-4:] = 100 edges_auto = histogram_bin_edges(lim_var_data, 'auto') - assert_equal(edges_auto, np.linspace(0, 100, 12)) + assert_equal(edges_auto[0], 0) + assert_equal(edges_auto[-1], 100.) 
+ assert len(edges_auto) < 100 edges_fd = histogram_bin_edges(lim_var_data, 'fd') assert_equal(edges_fd, np.array([0, 100]))