From 28162b1e3e2b6a1e941cadd8bb6a23be701e8aa3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Jul 2023 22:14:22 +0200 Subject: [PATCH 01/40] ENH add _weights_are_valid --- numpy/lib/function_base.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 22371a0384b5..f4df88604e15 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -390,6 +390,32 @@ def iterable(y): return True +def _weights_are_valid(weights, a, axis): + """Validate weights array. + + We assume, weights is not None. + """ + wgt = np.asanyarray(weights) + + # Sanity checks + if a.shape != wgt.shape: + if axis is None: + raise TypeError( + "Axis must be specified when shapes of a and weights " + "differ.") + if wgt.ndim != 1: + raise TypeError( + "1D weights expected when shapes of a and weights differ.") + if wgt.shape[0] != a.shape[axis]: + raise ValueError( + "Length of weights not compatible with specified axis.") + + # setup wgt to broadcast along axis + wgt = np.broadcast_to(wgt, (a.ndim-1)*(1,) + wgt.shape) + wgt = wgt.swapaxes(-1, axis) + return wgt + + def _average_dispatcher(a, axis=None, weights=None, returned=None, *, keepdims=None): return (a, weights) From 4b079c8d5d1ed7357507b33dd2b37d02f745f828 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Jul 2023 22:14:55 +0200 Subject: [PATCH 02/40] ENH use _weights_are_valid in np.average --- numpy/lib/function_base.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index f4df88604e15..e12e92f0a467 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -547,30 +547,13 @@ def average(a, axis=None, weights=None, returned=False, *, avg_as_array = np.asanyarray(avg) scl = avg_as_array.dtype.type(a.size/avg_as_array.size) else: - wgt = np.asanyarray(weights) + wgt = _weights_are_valid(weights=weights, a=a, axis=axis) if issubclass(a.dtype.type, (np.integer, np.bool_)): result_dtype = np.result_type(a.dtype, wgt.dtype, 'f8') else: result_dtype = np.result_type(a.dtype, wgt.dtype) - # Sanity checks - if a.shape != wgt.shape: - if axis is None: - raise TypeError( - "Axis must be specified when shapes of a and weights " - "differ.") - if wgt.ndim != 1: - raise TypeError( - "1D weights expected when shapes of a and weights differ.") - if wgt.shape[0] != a.shape[axis]: - raise ValueError( - "Length of weights not compatible with specified axis.") - - # setup wgt to broadcast along axis - wgt = np.broadcast_to(wgt, (a.ndim-1)*(1,) + wgt.shape) - wgt = wgt.swapaxes(-1, axis) - scl = wgt.sum(axis=axis, dtype=result_dtype, **keepdims_kw) if np.any(scl == 0.0): raise ZeroDivisionError( From 7f10b12d46bb27188b351402f2d2180cc3ad10f3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Jul 2023 22:16:29 +0200 Subject: [PATCH 03/40] TST add test_quantile_constant_weights --- numpy/lib/tests/test_function_base.py | 38 ++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index b0944ec85a5d..f7d18908ba22 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3661,16 +3661,17 @@ def test_quantile_scalar_nan(self): assert np.isscalar(actual) assert_equal(np.quantile(a, 0.5), np.nan) + @pytest.mark.parametrize("weights", [False, True]) @pytest.mark.parametrize("method", quantile_methods) @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9]) - def test_quantile_identification_equation(self, method, alpha): + def test_quantile_identification_equation(self, weights, method, alpha): # Test that the identification equation holds for the empirical # CDF: # E[V(x, Y)] = 0 <=> x is quantile # with Y the random variable for which we have observed values and # V(x, y) the canonical identification function for the quantile (at # level alpha), see - # https://doi.org/10.48550/arXiv.0912.0902 + # https://doi.org/10.48550/arXiv.0912.0902 rng = np.random.default_rng(4321) # We choose n and alpha such that we cover 3 cases: # - n * alpha is an integer @@ -3678,17 +3679,22 @@ def test_quantile_identification_equation(self, method, alpha): # - n * alpha is a float that gest rounded up n = 102 # n * alpha = 20.4, 51. , 91.8 y = rng.random(n) - x = np.quantile(y, alpha, method=method) + w = rng.integers(low=0, high=10, size=n) if weights else None + if weights and method not in ["inverted_cdf"]: + pytest.skip("Weights not supported by method.") + else: + x = np.quantile(y, alpha, method=method, weights=w) + if method in ("higher",): # These methods do not fulfill the identification equation. assert np.abs(np.mean(self.V(x, y, alpha))) > 0.1 / n - elif int(n * alpha) == n * alpha: + elif int(n * alpha) == n * alpha and not weights: # We can expect exact results, up to machine precision. - assert_allclose(np.mean(self.V(x, y, alpha)), 0, atol=1e-14) + assert_allclose(np.average(self.V(x, y, alpha), weights=w), 0, atol=1e-14) else: # V = (x >= y) - alpha cannot sum to zero exactly but within # "sample precision". - assert_allclose(np.mean(self.V(x, y, alpha)), 0, + assert_allclose(np.average(self.V(x, y, alpha), weights=w), 0, atol=1 / n / np.amin([alpha, 1 - alpha])) @pytest.mark.parametrize("method", quantile_methods) @@ -3749,6 +3755,26 @@ def test_quantile_add_and_multiply_constant(self, method, alpha): # "median_unbiased", "normal_unbiased", "midpoint" assert_allclose(q, np.quantile(y, alpha, method=method)) + @pytest.mark.parametrize("method", ["inverted_cdf"]) + @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9]) + def test_quantile_constant_weights(self, method, alpha): + rng = np.random.default_rng(4321) + # We choose n and alpha such that we have cases for + # - n * alpha is an integer + # - n * alpha is a float that gets rounded down + # - n * alpha is a float that gest rounded up + n = 102 # n * alpha = 20.4, 51. , 91.8 + y = rng.random(n) + q = np.quantile(y, alpha, method=method) + + w = np.ones_like(y) + qw = np.quantile(y, alpha, method=method, weights=w) + assert_allclose(qw, q) + + w = 8.125 * np.ones_like(y) + qw = np.quantile(y, alpha, method=method, weights=w) + assert_allclose(qw, q) + class TestLerp: @hypothesis.given(t0=st.floats(allow_nan=False, allow_infinity=False, From 0fce6b6ee2dc49841a004cbcdda2435c1482558f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 24 Jul 2023 22:20:08 +0200 Subject: [PATCH 04/40] ENH add weighted quantile for inverted_cdf --- numpy/lib/function_base.py | 172 ++++++++++++++++++++++++++----------- 1 file changed, 121 insertions(+), 51 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index e12e92f0a467..d3bf75d90b6a 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4265,8 +4265,9 @@ def percentile(a, def _quantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, - method=None, keepdims=None, *, interpolation=None): - return (a, q, out) + method=None, keepdims=None, *, weights=None, + interpolation=None): + return (a, q, out, weights) @array_function_dispatch(_quantile_dispatcher) @@ -4278,6 +4279,7 @@ def quantile(a, method="linear", keepdims=False, *, + weights=None, interpolation=None): """ Compute the q-th quantile of the data along the specified axis. @@ -4336,6 +4338,16 @@ def quantile(a, the result as dimensions with size one. With this option, the result will broadcast correctly against the original array `a`. + weights : array_like, optional + An array of weights associated with the values in `a`. Each value in + `a` contributes to the quantile according to its associated weight. + The weights array can either be 1-D (in which case its length must be + the size of `a` along the given axis) or of the same shape as `a`. + If `weights=None`, then all data in `a` are assumed to have a + weight equal to one. + Only `method="inverted_cdf"` supports weights. + See the notes for more details. + interpolation : str, optional Deprecated name for the method keyword argument. @@ -4362,7 +4374,21 @@ def quantile(a, Notes ----- - Given a vector ``V`` of length ``n``, the q-th quantile of ``V`` is + In general, the quantile at probability level math:`q` of a cumulative + probability distribution :math:`P` is defined as any number :math:`x` + that fulfills the *coverage conditions* + + .. math:: P(Y < x) \\leq q \\quad\\text{and}\\quad P(Y \\leq x) \geq q + + with random variable :math:`Y\sim P`. + For empirical quantiles as considered here, :math:`P` is the empirical + distribution function of the given data vector ``a`` of length ``n``, + i.e. :math:`P(Y \\leq t) = \\frac{1}{n} \\sum_i 1_{a_i \\leq t}`. + Then, the different methods correspond to different choices of :math:`x` + that fulfil the above inequalities. + + To be more concrete, given a vector ``V`` of length ``n``, the empirical + q-th quantile of ``V`` is the value ``q`` of the way from the minimum to the maximum in a sorted copy of ``V``. The values and distances of the two nearest neighbors as well as the `method` parameter will determine the @@ -4474,6 +4500,13 @@ def quantile(a, NumPy method kept for backwards compatibility. Uses ``(i + j) / 2``. + **Weighted quantiles:** + For weighted quantiles, the above coverage conditions still hold. The + empirical cumulative distribution is simply replaced by its weighted + version, i.e. + :math:`P(Y \\leq t) = \\frac{1}{\\sum_i w_i} \\sum_i w_i 1_{x_i \\leq t}`. + Only ``method="inverted_cdf"`` supports weights. + Examples -------- >>> a = np.array([[10, 7, 4], [3, 2, 1]]) @@ -4520,8 +4553,14 @@ def quantile(a, q = np.asanyarray(q) if not _quantile_is_valid(q): raise ValueError("Quantiles must be in the range [0, 1]") + + if weights is not None: + if method != "inverted_cdf": + raise ValueError("Only method 'inverted_cdf' supports weights.") + weights = _weights_are_valid(weights=weights, a=a, axis=axis) + return _quantile_unchecked( - a, q, axis, out, overwrite_input, method, keepdims) + a, q, axis, out, overwrite_input, method, keepdims, weights) def _quantile_unchecked(a, @@ -4530,11 +4569,13 @@ def _quantile_unchecked(a, out=None, overwrite_input=False, method="linear", - keepdims=False): + keepdims=False, + weights=None): """Assumes that q is in [0, 1], and is an ndarray""" return _ureduce(a, func=_quantile_ureduce_func, q=q, + weights=weights, keepdims=keepdims, axis=axis, out=out, @@ -4676,6 +4717,7 @@ def _inverted_cdf(n, quantiles): def _quantile_ureduce_func( a: np.array, q: np.array, + weights: np.array, axis: int = None, out=None, overwrite_input: bool = False, @@ -4690,19 +4732,24 @@ def _quantile_ureduce_func( if axis is None: axis = 0 arr = a.ravel() + wgt = None if weights is None else weights.ravel() else: arr = a + wgt = weights else: if axis is None: axis = 0 arr = a.flatten() + wgt = None if weights is None else weights.flatten() else: arr = a.copy() + wgt = weights result = _quantile(arr, quantiles=q, axis=axis, method=method, - out=out) + out=out, + weights=wgt) return result @@ -4747,6 +4794,7 @@ def _quantile( axis: int = -1, method="linear", out=None, + weights=None, ): """ Private function that doesn't support extended axis or keepdims. @@ -4772,53 +4820,75 @@ def _quantile( # Index where to find the value in the sorted array. # Virtual because it is a floating point value, not an valid index. # The nearest neighbours are used for interpolation - try: - method = _QuantileMethods[method] - except KeyError: - raise ValueError( - f"{method!r} is not a valid method. Use one of: " - f"{_QuantileMethods.keys()}") from None - virtual_indexes = method["get_virtual_index"](values_count, quantiles) - virtual_indexes = np.asanyarray(virtual_indexes) - if np.issubdtype(virtual_indexes.dtype, np.integer): - # No interpolation needed, take the points along axis - if np.issubdtype(arr.dtype, np.inexact): - # may contain nan, which would sort to the end - arr.partition(concatenate((virtual_indexes.ravel(), [-1])), axis=0) - slices_having_nans = np.isnan(arr[-1]) + if weights is None: + try: + method = _QuantileMethods[method] + except KeyError: + raise ValueError( + f"{method!r} is not a valid method. Use one of: " + f"{_QuantileMethods.keys()}") from None + virtual_indexes = method["get_virtual_index"](values_count, quantiles) + virtual_indexes = np.asanyarray(virtual_indexes) + if np.issubdtype(virtual_indexes.dtype, np.integer): + # No interpolation needed, take the points along axis + if np.issubdtype(arr.dtype, np.inexact): + # may contain nan, which would sort to the end + arr.partition(concatenate((virtual_indexes.ravel(), [-1])), axis=0) + slices_having_nans = np.isnan(arr[-1]) + else: + # cannot contain nan + arr.partition(virtual_indexes.ravel(), axis=0) + slices_having_nans = np.array(False, dtype=bool) + result = take(arr, virtual_indexes, axis=0, out=out) else: - # cannot contain nan - arr.partition(virtual_indexes.ravel(), axis=0) - slices_having_nans = np.array(False, dtype=bool) - result = take(arr, virtual_indexes, axis=0, out=out) + previous_indexes, next_indexes = _get_indexes(arr, + virtual_indexes, + values_count) + # --- Sorting + arr.partition( + np.unique(np.concatenate(([0, -1], + previous_indexes.ravel(), + next_indexes.ravel(), + ))), + axis=DATA_AXIS) + if np.issubdtype(arr.dtype, np.inexact): + slices_having_nans = np.isnan( + take(arr, indices=-1, axis=DATA_AXIS) + ) + else: + slices_having_nans = None + # --- Get values from indexes + previous = np.take(arr, previous_indexes, axis=DATA_AXIS) + next = np.take(arr, next_indexes, axis=DATA_AXIS) + # --- Linear interpolation + gamma = _get_gamma(virtual_indexes, previous_indexes, method) + result_shape = virtual_indexes.shape + (1,) * (arr.ndim - 1) + gamma = gamma.reshape(result_shape) + result = _lerp(previous, + next, + gamma, + out=out) else: - previous_indexes, next_indexes = _get_indexes(arr, - virtual_indexes, - values_count) - # --- Sorting - arr.partition( - np.unique(np.concatenate(([0, -1], - previous_indexes.ravel(), - next_indexes.ravel(), - ))), - axis=DATA_AXIS) - if np.issubdtype(arr.dtype, np.inexact): - slices_having_nans = np.isnan( - take(arr, indices=-1, axis=DATA_AXIS) - ) - else: - slices_having_nans = None - # --- Get values from indexes - previous = np.take(arr, previous_indexes, axis=DATA_AXIS) - next = np.take(arr, next_indexes, axis=DATA_AXIS) - # --- Linear interpolation - gamma = _get_gamma(virtual_indexes, previous_indexes, method) - result_shape = virtual_indexes.shape + (1,) * (arr.ndim - 1) - gamma = gamma.reshape(result_shape) - result = _lerp(previous, - next, - gamma, - out=out) + # Weighted case, we need to sort anyway. + weights = np.asanyarray(weights) + if axis != DATA_AXIS: + weights = np.moveaxis(weights, axis, destination=DATA_AXIS) + index_array = np.argsort(arr, axis=axis, kind="stable") + # TODO: Which value to set to slices_having_nans. + slices_having_nans = None + # TODO: Deal with nan values, e.g. np.isnan(arr(index_array[-1])) by + # be true. + # TODO: Special case quantiles == 0 and quantiles == 0 + arr = np.take_along_axis(arr, index_array, axis=axis) + weights = np.take_along_axis(weights, index_array, axis=axis) + # Normalize weights to sum to one such that it corresponds to the + # empirical distribution function. + weights = weights / weights.sum(axis=axis) + # Only inverted_cdf is supported. Search index i such that + # sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i) + i = np.searchsorted(weights.cumsum(axis=axis), quantiles, side="left") + result = take(arr, i, axis=0, out=out) + if np.any(slices_having_nans): if result.ndim == 0 and out is None: # can't write to a scalar From 3c979bb4d0d1d832aa681a0f8868524791720e6f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 25 Jul 2023 07:54:31 +0200 Subject: [PATCH 05/40] CLN satisfy linter --- numpy/lib/function_base.py | 18 ++++++++++-------- numpy/lib/tests/test_function_base.py | 4 +++- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 51be5f90c974..37cb57e3b64f 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4371,9 +4371,9 @@ def quantile(a, probability distribution :math:`P` is defined as any number :math:`x` that fulfills the *coverage conditions* - .. math:: P(Y < x) \\leq q \\quad\\text{and}\\quad P(Y \\leq x) \geq q + .. math:: P(Y < x) \\leq q \\quad\\text{and}\\quad P(Y \\leq x) \\geq q - with random variable :math:`Y\sim P`. + with random variable :math:`Y\\sim P`. For empirical quantiles as considered here, :math:`P` is the empirical distribution function of the given data vector ``a`` of length ``n``, i.e. :math:`P(Y \\leq t) = \\frac{1}{n} \\sum_i 1_{a_i \\leq t}`. @@ -4830,7 +4830,9 @@ def _quantile( # No interpolation needed, take the points along axis if supports_nans: # may contain nan, which would sort to the end - arr.partition(concatenate((virtual_indexes.ravel(), [-1])), axis=0) + arr.partition( + concatenate((virtual_indexes.ravel(), [-1])), axis=0, + ) slices_having_nans = np.isnan(arr[-1, ...]) else: # cannot contain nan @@ -4839,14 +4841,14 @@ def _quantile( result = take(arr, virtual_indexes, axis=0, out=out) else: previous_indexes, next_indexes = _get_indexes(arr, - virtual_indexes, - values_count) + virtual_indexes, + values_count) # --- Sorting arr.partition( np.unique(np.concatenate(([0, -1], - previous_indexes.ravel(), - next_indexes.ravel(), - ))), + previous_indexes.ravel(), + next_indexes.ravel(), + ))), axis=0) if supports_nans: slices_having_nans = np.isnan(arr[-1, ...]) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 72a354bb5402..a3983466b845 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3706,7 +3706,9 @@ def test_quantile_identification_equation(self, weights, method, alpha): assert np.abs(np.mean(self.V(x, y, alpha))) > 0.1 / n elif int(n * alpha) == n * alpha and not weights: # We can expect exact results, up to machine precision. - assert_allclose(np.average(self.V(x, y, alpha), weights=w), 0, atol=1e-14) + assert_allclose( + np.average(self.V(x, y, alpha), weights=w), 0, atol=1e-14, + ) else: # V = (x >= y) - alpha cannot sum to zero exactly but within # "sample precision". From bad403222d2d6984d754da02063ee44bda613bf8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 25 Jul 2023 08:16:37 +0200 Subject: [PATCH 06/40] ENH add weights to nanquantile --- numpy/lib/nanfunctions.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py index ef0f6cf65b03..8b4874b3e5d5 100644 --- a/numpy/lib/nanfunctions.py +++ b/numpy/lib/nanfunctions.py @@ -1400,6 +1400,7 @@ def nanquantile( method="linear", keepdims=np._NoValue, *, + weights=None, interpolation=None, ): """ @@ -1468,6 +1469,15 @@ def nanquantile( a sub-class and `mean` does not have the kwarg `keepdims` this will raise a RuntimeError. + weights : array_like, optional + An array of weights associated with the values in `a`. Each value in + `a` contributes to the quantile according to its associated weight. + The weights array can either be 1-D (in which case its length must be + the size of `a` along the given axis) or of the same shape as `a`. + If `weights=None`, then all data in `a` are assumed to have a + weight equal to one. + Only `method="inverted_cdf"` supports weights. + interpolation : str, optional Deprecated name for the method keyword argument. @@ -1543,7 +1553,7 @@ def nanquantile( if not function_base._quantile_is_valid(q): raise ValueError("Quantiles must be in the range [0, 1]") return _nanquantile_unchecked( - a, q, axis, out, overwrite_input, method, keepdims) + a, q, axis, out, overwrite_input, method, keepdims, weights) def _nanquantile_unchecked( @@ -1554,6 +1564,7 @@ def _nanquantile_unchecked( overwrite_input=False, method="linear", keepdims=np._NoValue, + weights=None, ): """Assumes that q is in [0, 1], and is an ndarray""" # apply_along_axis in _nanpercentile doesn't handle empty arrays well, @@ -1567,11 +1578,12 @@ def _nanquantile_unchecked( axis=axis, out=out, overwrite_input=overwrite_input, - method=method) + method=method, + weights=weights) def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, - method="linear"): + method="linear", weights=None): """ Private function that doesn't support extended axis or keepdims. These methods are extended to this function using _ureduce @@ -1579,10 +1591,10 @@ def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, """ if axis is None or a.ndim == 1: part = a.ravel() - result = _nanquantile_1d(part, q, overwrite_input, method) + result = _nanquantile_1d(part, q, overwrite_input, method, weights) else: result = np.apply_along_axis(_nanquantile_1d, axis, a, q, - overwrite_input, method) + overwrite_input, method, weights) # apply_along_axis fills in collapsed axis with results. # Move that axis to the beginning to match percentile's # convention. @@ -1594,7 +1606,7 @@ def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, return result -def _nanquantile_1d(arr1d, q, overwrite_input=False, method="linear"): +def _nanquantile_1d(arr1d, q, overwrite_input=False, method="linear", weights=None): """ Private function for rank 1 arrays. Compute quantile ignoring NaNs. See nanpercentile for parameter usage @@ -1606,7 +1618,7 @@ def _nanquantile_1d(arr1d, q, overwrite_input=False, method="linear"): return np.full(q.shape, np.nan, dtype=arr1d.dtype)[()] return function_base._quantile_unchecked( - arr1d, q, overwrite_input=overwrite_input, method=method) + arr1d, q, overwrite_input=overwrite_input, method=method, weights=weights) def _nanvar_dispatcher(a, axis=None, dtype=None, out=None, ddof=None, From d84654ef7d428facceb8ff3e8c704fbf181ec86f Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 25 Jul 2023 08:19:28 +0200 Subject: [PATCH 07/40] CLN make linter happy --- numpy/lib/nanfunctions.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py index 8b4874b3e5d5..72ef659110ea 100644 --- a/numpy/lib/nanfunctions.py +++ b/numpy/lib/nanfunctions.py @@ -1606,7 +1606,9 @@ def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, return result -def _nanquantile_1d(arr1d, q, overwrite_input=False, method="linear", weights=None): +def _nanquantile_1d( + arr1d, q, overwrite_input=False, method="linear", weights=None, +): """ Private function for rank 1 arrays. Compute quantile ignoring NaNs. See nanpercentile for parameter usage @@ -1618,7 +1620,12 @@ def _nanquantile_1d(arr1d, q, overwrite_input=False, method="linear", weights=No return np.full(q.shape, np.nan, dtype=arr1d.dtype)[()] return function_base._quantile_unchecked( - arr1d, q, overwrite_input=overwrite_input, method=method, weights=weights) + arr1d, + q, + overwrite_input=overwrite_input, + method=method, + weights=weights, + ) def _nanvar_dispatcher(a, axis=None, dtype=None, out=None, ddof=None, From 24f758f835bef816bc342a38296523a1a1ee56eb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 25 Jul 2023 18:53:54 +0200 Subject: [PATCH 08/40] TST add test_quantile_with_integer_weights --- numpy/lib/function_base.py | 6 +++- numpy/lib/tests/test_function_base.py | 47 ++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 37cb57e3b64f..5bb0bbf64ddd 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4867,6 +4867,7 @@ def _quantile( out=out) else: # Weighted case, we need to sort anyway. + # This implements method="inverted_cdf". weights = np.asanyarray(weights) if axis != 0: weights = np.moveaxis(weights, axis, destination=0) @@ -4875,7 +4876,7 @@ def _quantile( slices_having_nans = None # TODO: Deal with nan values, e.g. np.isnan(arr(index_array[-1])) by # be true. - # TODO: Special case quantiles == 0 and quantiles == 0 + # TODO: Special case quantiles == 0 and quantiles == 1? arr = np.take_along_axis(arr, index_array, axis=axis) weights = np.take_along_axis(weights, index_array, axis=axis) # Normalize weights to sum to one such that it corresponds to the @@ -4884,6 +4885,9 @@ def _quantile( # Only inverted_cdf is supported. Search index i such that # sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i) i = np.searchsorted(weights.cumsum(axis=axis), quantiles, side="left") + # We might have reached the maximum with i = len(arr), e.g. for + # quantile = 1. + i = np.minimum(i, values_count - 1) result = take(arr, i, axis=0, out=out) if np.any(slices_having_nans): diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index a3983466b845..39b54fba71b2 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3561,6 +3561,9 @@ def test_nat_basic(self, dtype, pos): 'midpoint'] +methods_supporting_weights = ["inverted_cdf"] + + class TestQuantile: # most of this is already tested by TestPercentile @@ -3688,6 +3691,8 @@ def test_quantile_identification_equation(self, weights, method, alpha): # V(x, y) the canonical identification function for the quantile (at # level alpha), see # https://doi.org/10.48550/arXiv.0912.0902 + if weights and method not in methods_supporting_weights: + pytest.skip("Weights not supported by method.") rng = np.random.default_rng(4321) # We choose n and alpha such that we cover 3 cases: # - n * alpha is an integer @@ -3696,10 +3701,7 @@ def test_quantile_identification_equation(self, weights, method, alpha): n = 102 # n * alpha = 20.4, 51. , 91.8 y = rng.random(n) w = rng.integers(low=0, high=10, size=n) if weights else None - if weights and method not in ["inverted_cdf"]: - pytest.skip("Weights not supported by method.") - else: - x = np.quantile(y, alpha, method=method, weights=w) + x = np.quantile(y, alpha, method=method, weights=w) if method in ("higher",): # These methods do not fulfill the identification equation. @@ -3715,9 +3717,10 @@ def test_quantile_identification_equation(self, weights, method, alpha): assert_allclose(np.average(self.V(x, y, alpha), weights=w), 0, atol=1 / n / np.amin([alpha, 1 - alpha])) + @pytest.mark.parametrize("weights", [False, True]) @pytest.mark.parametrize("method", quantile_methods) @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9]) - def test_quantile_add_and_multiply_constant(self, method, alpha): + def test_quantile_add_and_multiply_constant(self, weights, method, alpha): # Test that # 1. quantile(c + x) = c + quantile(x) # 2. quantile(c * x) = c * quantile(x) @@ -3725,6 +3728,8 @@ def test_quantile_add_and_multiply_constant(self, method, alpha): # On empirical quantiles, this equation does not hold exactly. # Koenker (2005) "Quantile Regression" Chapter 2.2.3 calls these # properties equivariance. + if weights and method not in methods_supporting_weights: + pytest.skip("Weights not supported by method.") rng = np.random.default_rng(4321) # We choose n and alpha such that we have cases for # - n * alpha is an integer @@ -3732,14 +3737,20 @@ def test_quantile_add_and_multiply_constant(self, method, alpha): # - n * alpha is a float that gest rounded up n = 102 # n * alpha = 20.4, 51. , 91.8 y = rng.random(n) - q = np.quantile(y, alpha, method=method) + w = rng.integers(low=0, high=10, size=n) if weights else None + q = np.quantile(y, alpha, method=method, weights=w) c = 13.5 # 1 - assert_allclose(np.quantile(c + y, alpha, method=method), c + q) + assert_allclose(np.quantile(c + y, alpha, method=method, weights=w), + c + q) # 2 - assert_allclose(np.quantile(c * y, alpha, method=method), c * q) + assert_allclose(np.quantile(c * y, alpha, method=method, weights=w), + c * q) # 3 + if weights: + # From here on, we would need more methods to support weights. + return q = -np.quantile(-y, 1 - alpha, method=method) if method == "inverted_cdf": if ( @@ -3773,7 +3784,7 @@ def test_quantile_add_and_multiply_constant(self, method, alpha): # "median_unbiased", "normal_unbiased", "midpoint" assert_allclose(q, np.quantile(y, alpha, method=method)) - @pytest.mark.parametrize("method", ["inverted_cdf"]) + @pytest.mark.parametrize("method", methods_supporting_weights) @pytest.mark.parametrize("alpha", [0.2, 0.5, 0.9]) def test_quantile_constant_weights(self, method, alpha): rng = np.random.default_rng(4321) @@ -3794,6 +3805,24 @@ def test_quantile_constant_weights(self, method, alpha): assert_allclose(qw, q) + @pytest.mark.parametrize("method", methods_supporting_weights) + @pytest.mark.parametrize("alpha", [0, 0.2, 0.5, 0.9, 1]) + def test_quantile_with_integer_weights(self, method, alpha): + # Integer weights can be interpreted as repeated observations. + rng = np.random.default_rng(4321) + # We choose n and alpha such that we have cases for + # - n * alpha is an integer + # - n * alpha is a float that gets rounded down + # - n * alpha is a float that gest rounded up + n = 102 # n * alpha = 20.4, 51. , 91.8 + y = rng.random(n) + w = rng.integers(low=0, high=10, size=n) + + qw = np.quantile(y, alpha, method=method, weights=w) + q = np.quantile(np.repeat(y, w), alpha, method=method) + assert_allclose(qw, q) + + class TestLerp: @hypothesis.given(t0=st.floats(allow_nan=False, allow_infinity=False, min_value=0, max_value=1), From cb48fd1f648122d4ec49aebd6987b3cd9dee4b91 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Tue, 25 Jul 2023 18:55:46 +0200 Subject: [PATCH 09/40] CLN remove blank line --- numpy/lib/tests/test_function_base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 39b54fba71b2..2a1481c14787 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3804,7 +3804,6 @@ def test_quantile_constant_weights(self, method, alpha): qw = np.quantile(y, alpha, method=method, weights=w) assert_allclose(qw, q) - @pytest.mark.parametrize("method", methods_supporting_weights) @pytest.mark.parametrize("alpha", [0, 0.2, 0.5, 0.9, 1]) def test_quantile_with_integer_weights(self, method, alpha): From eee5455cb7ebe159d68c89d719a92ed2735e6a6a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 4 Aug 2023 11:18:06 +0200 Subject: [PATCH 10/40] ENH replace take_along_axis with normal indexing --- numpy/lib/function_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 5bb0bbf64ddd..6e2e82ed370a 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4877,8 +4877,8 @@ def _quantile( # TODO: Deal with nan values, e.g. np.isnan(arr(index_array[-1])) by # be true. # TODO: Special case quantiles == 0 and quantiles == 1? - arr = np.take_along_axis(arr, index_array, axis=axis) - weights = np.take_along_axis(weights, index_array, axis=axis) + arr = arr[index_array, ...] + weights = weights[index_array, ...] # Normalize weights to sum to one such that it corresponds to the # empirical distribution function. weights = weights / weights.sum(axis=axis) From cff8df330c1334bcd5715badfc7bdc979c1b7f24 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 4 Aug 2023 11:24:51 +0200 Subject: [PATCH 11/40] ENH use cumsum directly to calc CDF --- numpy/lib/function_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index 6e2e82ed370a..85ce5d16f01d 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4879,12 +4879,12 @@ def _quantile( # TODO: Special case quantiles == 0 and quantiles == 1? arr = arr[index_array, ...] weights = weights[index_array, ...] - # Normalize weights to sum to one such that it corresponds to the - # empirical distribution function. - weights = weights / weights.sum(axis=axis) + # We use the weights to calculate the empirical distribution function. + cdf = weights.cumsum(axis=axis, dtype=float) + cdf /= cdf[-1, ...] # normalization # Only inverted_cdf is supported. Search index i such that # sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i) - i = np.searchsorted(weights.cumsum(axis=axis), quantiles, side="left") + i = np.searchsorted(cdf, quantiles, side="left") # We might have reached the maximum with i = len(arr), e.g. for # quantile = 1. i = np.minimum(i, values_count - 1) From fa2e54a63ab98701698f006db42308a1eb452369 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 4 Aug 2023 11:27:36 +0200 Subject: [PATCH 12/40] FIX try np.intp to fix emscripten CI --- numpy/lib/tests/test_function_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 2a1481c14787..2c25f1de8db4 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3815,7 +3815,7 @@ def test_quantile_with_integer_weights(self, method, alpha): # - n * alpha is a float that gest rounded up n = 102 # n * alpha = 20.4, 51. , 91.8 y = rng.random(n) - w = rng.integers(low=0, high=10, size=n) + w = rng.integers(low=0, high=10, size=n, dtype=np.intp) qw = np.quantile(y, alpha, method=method, weights=w) q = np.quantile(np.repeat(y, w), alpha, method=method) From 93407f7987188dc7d2fe79536d6a1237d95378b3 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 4 Aug 2023 11:49:00 +0200 Subject: [PATCH 13/40] FIX try np.int32 to fix CI error for np.repeat --- numpy/lib/tests/test_function_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 2c25f1de8db4..6f1a87fe2682 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3815,7 +3815,7 @@ def test_quantile_with_integer_weights(self, method, alpha): # - n * alpha is a float that gest rounded up n = 102 # n * alpha = 20.4, 51. , 91.8 y = rng.random(n) - w = rng.integers(low=0, high=10, size=n, dtype=np.intp) + w = rng.integers(low=0, high=10, size=n, dtype=np.int32) qw = np.quantile(y, alpha, method=method, weights=w) q = np.quantile(np.repeat(y, w), alpha, method=method) From 7c3bec89b72d249bf40a9d5ecc2a4146156e5f95 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 4 Aug 2023 18:20:39 +0200 Subject: [PATCH 14/40] TST add test for raising errors --- numpy/lib/function_base.py | 2 ++ numpy/lib/tests/test_function_base.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py index a8a2747a6c21..9b30552188c5 100644 --- a/numpy/lib/function_base.py +++ b/numpy/lib/function_base.py @@ -4563,6 +4563,8 @@ def quantile(a, if method != "inverted_cdf": raise ValueError("Only method 'inverted_cdf' supports weights.") weights = _weights_are_valid(weights=weights, a=a, axis=axis) + if np.any(weights < 0): + raise ValueError("Weights must be non-negative.") return _quantile_unchecked( a, q, axis, out, overwrite_input, method, keepdims, weights) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 6f1a87fe2682..302a6542b388 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3821,6 +3821,23 @@ def test_quantile_with_integer_weights(self, method, alpha): q = np.quantile(np.repeat(y, w), alpha, method=method) assert_allclose(qw, q) + def test_quantile_weights_raises_negative_weights(self): + y = [1, 2] + w = [-0.5, 1] + with pytest.raises(ValueError, match="Weights must be non-negative"): + np.quantile(y, 0.5, weights=w, method="inverted_cdf") + + @pytest.mark.parametrize( + "method", + list(set(quantile_methods) - set(methods_supporting_weights)), + ) + def test_quantile_weights_raises_unsupported_methods(self, method): + y = [1, 2] + w = [0.5, 1] + msg = "Only method 'inverted_cdf' supports weights" + with pytest.raises(ValueError, match=msg): + np.quantile(y, 0.5, weights=w, method=method) + class TestLerp: @hypothesis.given(t0=st.floats(allow_nan=False, allow_infinity=False, From 574ea98f0065e1691b29b34024425623e26492e5 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 4 Sep 2023 18:54:43 +0200 Subject: [PATCH 15/40] TST add test_quantile_with_weights_and_axis --- numpy/lib/_function_base_impl.py | 41 ++++++++++++++++++------ numpy/lib/tests/test_function_base.py | 46 +++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 458a2c7c1180..77f6f7513f69 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4833,24 +4833,47 @@ def _quantile( weights = np.asanyarray(weights) if axis != 0: weights = np.moveaxis(weights, axis, destination=0) - index_array = np.argsort(arr, axis=axis, kind="stable") + index_array = np.argsort(arr, axis=0, kind="stable") # TODO: Which value to set to slices_having_nans. slices_having_nans = None # TODO: Deal with nan values, e.g. np.isnan(arr(index_array[-1])) by # be true. # TODO: Special case quantiles == 0 and quantiles == 1? - arr = arr[index_array, ...] - weights = weights[index_array, ...] + + # arr = arr[index_array, ...] # but this adds trailing dimensions of + # 1. + arr = np.take_along_axis(arr, index_array, axis=0) + if weights.shape == arr.shape: + weights = np.take_along_axis(weights, index_array, axis=0) + else: + # weights is 1d + weights = weights.reshape(-1)[index_array, ...] + # weights = np.take_along_axis(weights, index_array, axis=axis) # We use the weights to calculate the empirical distribution function. - cdf = weights.cumsum(axis=axis, dtype=float) + cdf = weights.cumsum(axis=0, dtype=float) cdf /= cdf[-1, ...] # normalization # Only inverted_cdf is supported. Search index i such that # sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i) - i = np.searchsorted(cdf, quantiles, side="left") - # We might have reached the maximum with i = len(arr), e.g. for - # quantile = 1. - i = np.minimum(i, values_count - 1) - result = take(arr, i, axis=0, out=out) + + def find_cdf_1d(arr, cdf): + i = np.searchsorted(cdf, quantiles, side="left") + # We might have reached the maximum with i = len(arr), e.g. for + # quantiles == 1. + i = np.minimum(i, values_count - 1) + result = take(arr, i, axis=0, out=out) + return result + + r_shape = arr.shape[1:] + if quantiles.ndim > 0: + r_shape += quantiles.shape + result = np.empty_like(arr, shape=r_shape) + # See apply_along_axis, which we do for axis=0. Note that Ni = (,) + # always, so we remove it here. + Nk = arr.shape[1:] + for kk in np.ndindex(Nk): + result[kk + np.s_[...,]] = find_cdf_1d( + arr[np.s_[:,] + kk], cdf[np.s_[:,] + kk] + ) if np.any(slices_having_nans): if result.ndim == 0 and out is None: diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 14c723df3a3b..74c34d35ec20 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3807,6 +3807,52 @@ def test_quantile_with_integer_weights(self, method, alpha): q = np.quantile(np.repeat(y, w), alpha, method=method) assert_allclose(qw, q) + @pytest.mark.parametrize("method", methods_supporting_weights) + def test_quantile_with_weights_and_axis(self, method): + rng = np.random.default_rng(4321) + + # 1d weight and single alpha + y = rng.random((2, 10, 3)) + w = np.abs(rng.random(10)) + alpha = 0.5 + q = np.quantile(y, alpha, weights=w, method=method, axis=1) + q_res = np.zeros(shape=(2, 3)) + for i in range(2): + for j in range(3): + q_res[i, j] = np.quantile( + y[i, :, j], alpha, method=method, weights=w + ) + assert_allclose(q, q_res) + + # 1d weight and 1d alpha + alpha = [0.25, 0.5, 0.75, 1] # shape (4,) + q = np.quantile(y, alpha, weights=w, method=method, axis=1) + q_res = np.zeros(shape=(2, 3, 4)) + for i in range(2): + for j in range(3): + q_res[i, j, :] = np.quantile( + y[i, :, j], alpha, method=method, weights=w + ) + assert_allclose(q, q_res) + + # 1d weight and 2d alpha + alpha = [[0.25, 0.5], [0.75, 1]] # shape (2, 2) + q = np.quantile(y, alpha, weights=w, method=method, axis=1) + q_res = q_res.reshape((2, 3, 2, 2)) + assert_allclose(q, q_res) + + # shape of weights equals shape of y + w = np.abs(rng.random((2, 10, 3))) + alpha = 0.5 + q = np.quantile(y, alpha, weights=w, method=method, axis=1) + q_res = np.zeros(shape=(2, 3)) + for i in range(2): + for j in range(3): + q_res[i, j] = np.quantile( + y[i, :, j], alpha, method=method, weights=w[i, :, j] + ) + assert_allclose(q, q_res) + def test_quantile_weights_raises_negative_weights(self): y = [1, 2] w = [-0.5, 1] From e705958d0c3865f0504f52f469666f815c94c4e9 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 4 Sep 2023 19:00:14 +0200 Subject: [PATCH 16/40] CLN make linter happy again --- numpy/lib/_function_base_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 77f6f7513f69..040384f7c228 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4871,8 +4871,8 @@ def find_cdf_1d(arr, cdf): # always, so we remove it here. Nk = arr.shape[1:] for kk in np.ndindex(Nk): - result[kk + np.s_[...,]] = find_cdf_1d( - arr[np.s_[:,] + kk], cdf[np.s_[:,] + kk] + result[kk + np.s_[..., ]] = find_cdf_1d( + arr[np.s_[:, ] + kk], cdf[np.s_[:, ] + kk] ) if np.any(slices_having_nans): From c4ca281c0051e8328755ad419579b1ef8a25f79b Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 4 Sep 2023 19:18:17 +0200 Subject: [PATCH 17/40] ENH add weights to percentile --- numpy/lib/_function_base_impl.py | 50 ++++++++++++++++++++++++++++++-- numpy/lib/_nanfunctions_impl.py | 33 ++++++++++++++++++++- 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 040384f7c228..9ac322583f14 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -3931,6 +3931,7 @@ def percentile(a, method="linear", keepdims=False, *, + weights=None, interpolation=None): """ Compute the q-th percentile of the data along the specified axis. @@ -3994,6 +3995,18 @@ def percentile(a, .. versionadded:: 1.9.0 + weights : array_like, optional + An array of weights associated with the values in `a`. Each value in + `a` contributes to the percentile according to its associated weight. + The weights array can either be 1-D (in which case its length must be + the size of `a` along the given axis) or of the same shape as `a`. + If `weights=None`, then all data in `a` are assumed to have a + weight equal to one. + Only `method="inverted_cdf"` supports weights. + See the notes for more details. + + .. versionadded:: 2.0.0 + interpolation : str, optional Deprecated name for the method keyword argument. @@ -4020,7 +4033,22 @@ def percentile(a, Notes ----- - Given a vector ``V`` of length ``n``, the q-th percentile of ``V`` is + In general, the percentile at percentage level math:`q` of a cumulative + probability distribution :math:`P` is defined as any number :math:`x` + that fulfills the *coverage conditions* + + .. math:: P(Y < x) \\leq q/100 \\quad\\text{and} + \\quad P(Y \\leq x) \\geq q/100 + + with random variable :math:`Y\\sim P`. + For empirical percentiles as considered here, :math:`P` is the empirical + distribution function of the given data vector ``a`` of length ``n``, + i.e. :math:`P(Y \\leq t) = \\frac{1}{n} \\sum_i 1_{a_i \\leq t}`. + Then, the different methods correspond to different choices of :math:`x` + that fulfil the above inequalities. + + To be more concrete, given a vector ``V`` of length ``n``, the empirical + q-th percentile of ``V`` is the value ``q/100`` of the way from the minimum to the maximum in a sorted copy of ``V``. The values and distances of the two nearest neighbors as well as the `method` parameter will determine the @@ -4132,6 +4160,14 @@ def percentile(a, NumPy method kept for backwards compatibility. Uses ``(i + j) / 2``. + **Weighted percentiles:** + For weighted percentiles, the above coverage conditions still hold. The + empirical cumulative distribution is simply replaced by its weighted + version, i.e. + :math:`P(Y \\leq t) = \\frac{1}{\\sum_i w_i} \\sum_i w_i 1_{x_i \\leq t}`. + Only ``method="inverted_cdf"`` supports weights. + + Examples -------- >>> a = np.array([[10, 7, 4], [3, 2, 1]]) @@ -4213,8 +4249,16 @@ def percentile(a, q = asanyarray(q) # undo any decay that the ufunc performed (see gh-13105) if not _quantile_is_valid(q): raise ValueError("Percentiles must be in the range [0, 100]") + + if weights is not None: + if method != "inverted_cdf": + raise ValueError("Only method 'inverted_cdf' supports weights.") + weights = _weights_are_valid(weights=weights, a=a, axis=axis) + if np.any(weights < 0): + raise ValueError("Weights must be non-negative.") + return _quantile_unchecked( - a, q, axis, out, overwrite_input, method, keepdims) + a, q, axis, out, overwrite_input, method, keepdims, weights) def _quantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @@ -4301,6 +4345,8 @@ def quantile(a, Only `method="inverted_cdf"` supports weights. See the notes for more details. + .. versionadded:: 2.0.0 + interpolation : str, optional Deprecated name for the method keyword argument. diff --git a/numpy/lib/_nanfunctions_impl.py b/numpy/lib/_nanfunctions_impl.py index 99d7dd63a759..2e8991f38f2a 100644 --- a/numpy/lib/_nanfunctions_impl.py +++ b/numpy/lib/_nanfunctions_impl.py @@ -24,6 +24,7 @@ import warnings import numpy as np from numpy.lib import _function_base_impl as fnb +from numpy.lib._function_base_impl import _weights_are_valid from numpy.core import overrides @@ -1234,6 +1235,7 @@ def nanpercentile( method="linear", keepdims=np._NoValue, *, + weights=None, interpolation=None, ): """ @@ -1304,6 +1306,17 @@ def nanpercentile( a sub-class and `mean` does not have the kwarg `keepdims` this will raise a RuntimeError. + weights : array_like, optional + An array of weights associated with the values in `a`. Each value in + `a` contributes to the percentile according to its associated weight. + The weights array can either be 1-D (in which case its length must be + the size of `a` along the given axis) or of the same shape as `a`. + If `weights=None`, then all data in `a` are assumed to have a + weight equal to one. + Only `method="inverted_cdf"` supports weights. + + .. versionadded:: 2.0.0 + interpolation : str, optional Deprecated name for the method keyword argument. @@ -1380,8 +1393,16 @@ def nanpercentile( q = np.asanyarray(q) if not fnb._quantile_is_valid(q): raise ValueError("Percentiles must be in the range [0, 100]") + + if weights is not None: + if method != "inverted_cdf": + raise ValueError("Only method 'inverted_cdf' supports weights.") + weights = _weights_are_valid(weights=weights, a=a, axis=axis) + if np.any(weights < 0): + raise ValueError("Weights must be non-negative.") + return _nanquantile_unchecked( - a, q, axis, out, overwrite_input, method, keepdims) + a, q, axis, out, overwrite_input, method, keepdims, weights) def _nanquantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, @@ -1477,6 +1498,8 @@ def nanquantile( weight equal to one. Only `method="inverted_cdf"` supports weights. + .. versionadded:: 2.0.0 + interpolation : str, optional Deprecated name for the method keyword argument. @@ -1551,6 +1574,14 @@ def nanquantile( q = np.asanyarray(q) if not fnb._quantile_is_valid(q): raise ValueError("Quantiles must be in the range [0, 1]") + + if weights is not None: + if method != "inverted_cdf": + raise ValueError("Only method 'inverted_cdf' supports weights.") + weights = _weights_are_valid(weights=weights, a=a, axis=axis) + if np.any(weights < 0): + raise ValueError("Weights must be non-negative.") + return _nanquantile_unchecked( a, q, axis, out, overwrite_input, method, keepdims, weights) From bf9a767676fe5400b649644fdf4380070f11a1f1 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 4 Sep 2023 20:40:43 +0200 Subject: [PATCH 18/40] TYP add weights to _function_base_impl.pyi --- numpy/lib/_function_base_impl.pyi | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/numpy/lib/_function_base_impl.pyi b/numpy/lib/_function_base_impl.pyi index 1ba5b0762f94..7ff102a3d209 100644 --- a/numpy/lib/_function_base_impl.pyi +++ b/numpy/lib/_function_base_impl.pyi @@ -505,6 +505,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> floating[Any]: ... @overload def percentile( @@ -515,6 +516,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> complexfloating[Any, Any]: ... @overload def percentile( @@ -525,6 +527,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> timedelta64: ... @overload def percentile( @@ -535,6 +538,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> datetime64: ... @overload def percentile( @@ -545,6 +549,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> Any: ... @overload def percentile( @@ -555,6 +560,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[floating[Any]]: ... @overload def percentile( @@ -565,6 +571,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[complexfloating[Any, Any]]: ... @overload def percentile( @@ -575,6 +582,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[timedelta64]: ... @overload def percentile( @@ -585,6 +593,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[datetime64]: ... @overload def percentile( @@ -595,6 +604,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[object_]: ... @overload def percentile( @@ -605,6 +615,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: bool = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> Any: ... @overload def percentile( @@ -615,6 +626,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: bool = ..., + weights: None | _ArrayLikeFloat_co = ..., ) -> _ArrayType: ... # NOTE: Not an alias, but they do have identical signatures From 8b5f249127648c5b95006988523c29ee72865ab6 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 6 Sep 2023 21:04:58 +0200 Subject: [PATCH 19/40] TST sorted test parameters --- numpy/lib/tests/test_function_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 74c34d35ec20..08410a10de96 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3861,7 +3861,7 @@ def test_quantile_weights_raises_negative_weights(self): @pytest.mark.parametrize( "method", - list(set(quantile_methods) - set(methods_supporting_weights)), + sorted(set(quantile_methods) - set(methods_supporting_weights)), ) def test_quantile_weights_raises_unsupported_methods(self, method): y = [1, 2] From 980120fde5db97537741421da24972759faffd59 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Wed, 6 Sep 2023 21:06:27 +0200 Subject: [PATCH 20/40] TYP keyword only --- numpy/lib/_function_base_impl.pyi | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/numpy/lib/_function_base_impl.pyi b/numpy/lib/_function_base_impl.pyi index 7ff102a3d209..c2f45a4e749f 100644 --- a/numpy/lib/_function_base_impl.pyi +++ b/numpy/lib/_function_base_impl.pyi @@ -505,6 +505,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> floating[Any]: ... @overload @@ -516,6 +517,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> complexfloating[Any, Any]: ... @overload @@ -527,6 +529,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> timedelta64: ... @overload @@ -538,6 +541,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> datetime64: ... @overload @@ -549,6 +553,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> Any: ... @overload @@ -560,6 +565,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[floating[Any]]: ... @overload @@ -571,6 +577,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[complexfloating[Any, Any]]: ... @overload @@ -582,6 +589,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[timedelta64]: ... @overload @@ -593,6 +601,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[datetime64]: ... @overload @@ -604,6 +613,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: L[False] = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> NDArray[object_]: ... @overload @@ -615,6 +625,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: bool = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> Any: ... @overload @@ -626,6 +637,7 @@ def percentile( overwrite_input: bool = ..., method: _MethodKind = ..., keepdims: bool = ..., + *, weights: None | _ArrayLikeFloat_co = ..., ) -> _ArrayType: ... From 610f8f7cb5b19e7f12856e3c6f9299f2327a61a7 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 8 Dec 2023 12:25:43 +0100 Subject: [PATCH 21/40] DOC add release note entry --- doc/release/upcoming_changes/24254.new_feature.rst | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 doc/release/upcoming_changes/24254.new_feature.rst diff --git a/doc/release/upcoming_changes/24254.new_feature.rst b/doc/release/upcoming_changes/24254.new_feature.rst new file mode 100644 index 000000000000..c63db11da98f --- /dev/null +++ b/doc/release/upcoming_changes/24254.new_feature.rst @@ -0,0 +1,5 @@ +``weights`` option for `quantile` and `percentile` +---------------------------------------------------- +The ``weights`` option is now available for `quantile`, `percentile`, +`nanquantile` and `nanpercentile`. Only ``method="inverted_cdf"`` supports +weights. \ No newline at end of file From fd63fad670a45784a789596a9af348077918800e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 8 Dec 2023 16:20:10 +0100 Subject: [PATCH 22/40] DOC update Notes section of quantile and percentile --- numpy/lib/_function_base_impl.py | 86 ++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 71e47473555a..0f8b316f8358 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4034,28 +4034,33 @@ def percentile(a, Notes ----- In general, the percentile at percentage level math:`q` of a cumulative - probability distribution :math:`P` is defined as any number :math:`x` - that fulfills the *coverage conditions* - + distribution function :math:`F(y)=P(Y \\leq y)` with probability measure + math:`P` is defined as any number :math:`x` that fulfills the + *coverage conditions* + .. math:: P(Y < x) \\leq q/100 \\quad\\text{and} \\quad P(Y \\leq x) \\geq q/100 - + with random variable :math:`Y\\sim P`. - For empirical percentiles as considered here, :math:`P` is the empirical - distribution function of the given data vector ``a`` of length ``n``, - i.e. :math:`P(Y \\leq t) = \\frac{1}{n} \\sum_i 1_{a_i \\leq t}`. - Then, the different methods correspond to different choices of :math:`x` - that fulfil the above inequalities. - - To be more concrete, given a vector ``V`` of length ``n``, the empirical - q-th percentile of ``V`` is - the value ``q/100`` of the way from the minimum to the maximum in a - sorted copy of ``V``. The values and distances of the two nearest - neighbors as well as the `method` parameter will determine the - percentile if the normalized ranking does not match the location of - ``q`` exactly. This function is the same as the median if ``q=50``, the - same as the minimum if ``q=0`` and the same as the maximum if - ``q=100``. + Sample percentiles, the result of ``percentile``, provide nonparametric + estimation of the underlying population, represented by the unknown + :math:`F`, given a data vector ``a`` of length ``n``. + + One type of estimators arises when one considers :math:`F` as the empirical + distribution function of the data, i.e. + :math:`F(y) = \\frac{1}{n} \\sum_i 1_{a_i \\leq y}`. + Then, different methods correspond to different choices of :math:`x` that + fulfil the above inequalities. Methods that follow this approach are + ``inverted_cdf`` and ``averaged_inverted_cdf``. + + A more general way to define sample percentile estomators is as follows. + The empirical q-percentile of ``a`` is the ``n * q/100``-th value of the + way from the minimum to the maximum in a sorted copy of ``a``. The values + and distances of the two nearest neighbors as well as the `method` + parameter will determine the percentile if the normalized ranking does not + match the location of ``n * q/100`` exactly. This function is the same as + the median if ``q=50``, the same as the minimum if ``q=0`` and the same + as the maximum if ``q=100``. The optional `method` parameter specifies the method to use when the desired percentile lies between two indexes ``i`` and ``j = i + 1``. @@ -4376,27 +4381,32 @@ def quantile(a, Notes ----- In general, the quantile at probability level math:`q` of a cumulative - probability distribution :math:`P` is defined as any number :math:`x` - that fulfills the *coverage conditions* - + distribution function :math:`F(y)=P(Y \\leq y)` with probability measure + math:`P` is defined as any number :math:`x` that fulfills the + *coverage conditions* + .. math:: P(Y < x) \\leq q \\quad\\text{and}\\quad P(Y \\leq x) \\geq q - + with random variable :math:`Y\\sim P`. - For empirical quantiles as considered here, :math:`P` is the empirical - distribution function of the given data vector ``a`` of length ``n``, - i.e. :math:`P(Y \\leq t) = \\frac{1}{n} \\sum_i 1_{a_i \\leq t}`. - Then, the different methods correspond to different choices of :math:`x` - that fulfil the above inequalities. - - To be more concrete, given a vector ``V`` of length ``n``, the empirical - q-th quantile of ``V`` is - the value ``q`` of the way from the minimum to the maximum in a - sorted copy of ``V``. The values and distances of the two nearest - neighbors as well as the `method` parameter will determine the - quantile if the normalized ranking does not match the location of - ``q`` exactly. This function is the same as the median if ``q=0.5``, the - same as the minimum if ``q=0.0`` and the same as the maximum if - ``q=1.0``. + Sample quantiles, the result of ``quantile``, provide nonparametric + estimation of the underlying population, represented by the unknown + :math:`F`, given a data vector ``a`` of length ``n``. + + One type of estimators arises when one considers :math:`F` as the empirical + distribution function of the data, i.e. + :math:`F(y) = \\frac{1}{n} \\sum_i 1_{a_i \\leq y}`. + Then, different methods correspond to different choices of :math:`x` that + fulfil the above inequalities. Methods that follow this approach are + ``inverted_cdf`` and ``averaged_inverted_cdf``. + + A more general way to define sample quantile estomators is as follows. + The empirical q-quantile of ``a`` is the ``n * q``-th value of the + way from the minimum to the maximum in a sorted copy of ``a``. The values + and distances of the two nearest neighbors as well as the `method` + parameter will determine the quantile if the normalized ranking does not + match the location of ``n * q`` exactly. This function is the same as + the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the same + as the maximumif ``q=1.0``. The optional `method` parameter specifies the method to use when the desired quantile lies between two indexes ``i`` and ``j = i + 1``. From 7fddb2126fd5ae4906391921b1f71960d5b18315 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Thu, 11 Jan 2024 22:38:39 +0100 Subject: [PATCH 23/40] CLN address some review comments like typos --- numpy/lib/_function_base_impl.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 0f8b316f8358..64190190724e 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4053,7 +4053,7 @@ def percentile(a, fulfil the above inequalities. Methods that follow this approach are ``inverted_cdf`` and ``averaged_inverted_cdf``. - A more general way to define sample percentile estomators is as follows. + A more general way to define sample percentile estimators is as follows. The empirical q-percentile of ``a`` is the ``n * q/100``-th value of the way from the minimum to the maximum in a sorted copy of ``a``. The values and distances of the two nearest neighbors as well as the `method` @@ -4259,7 +4259,8 @@ def percentile(a, if weights is not None: if method != "inverted_cdf": - raise ValueError("Only method 'inverted_cdf' supports weights.") + msg = f"Only method 'inverted_cdf' supports weights. Got: {method}." + raise ValueError(msg) weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): raise ValueError("Weights must be non-negative.") @@ -4399,14 +4400,14 @@ def quantile(a, fulfil the above inequalities. Methods that follow this approach are ``inverted_cdf`` and ``averaged_inverted_cdf``. - A more general way to define sample quantile estomators is as follows. + A more general way to define sample quantile estimators is as follows. The empirical q-quantile of ``a`` is the ``n * q``-th value of the way from the minimum to the maximum in a sorted copy of ``a``. The values and distances of the two nearest neighbors as well as the `method` parameter will determine the quantile if the normalized ranking does not match the location of ``n * q`` exactly. This function is the same as the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the same - as the maximumif ``q=1.0``. + as the maximum if ``q=1.0``. The optional `method` parameter specifies the method to use when the desired quantile lies between two indexes ``i`` and ``j = i + 1``. @@ -4572,7 +4573,8 @@ def quantile(a, if weights is not None: if method != "inverted_cdf": - raise ValueError("Only method 'inverted_cdf' supports weights.") + msg = f"Only method 'inverted_cdf' supports weights. Got: {method}." + raise ValueError(msg) weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): raise ValueError("Weights must be non-negative.") @@ -4924,7 +4926,7 @@ def find_cdf_1d(arr, cdf): i = np.searchsorted(cdf, quantiles, side="left") # We might have reached the maximum with i = len(arr), e.g. for # quantiles == 1. - i = np.minimum(i, values_count - 1) + i = min(i, values_count - 1) result = take(arr, i, axis=0, out=out) return result From a5011795263bcb6f4c0fdc849e1e8f90b728aae4 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Jan 2024 00:12:59 +0100 Subject: [PATCH 24/40] CLN fix line length --- numpy/lib/_function_base_impl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 34e7db4f1590..2fc8f2b8d4ce 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4291,7 +4291,8 @@ def percentile(a, if weights is not None: if method != "inverted_cdf": - msg = f"Only method 'inverted_cdf' supports weights. Got: {method}." + msg = ("Only method 'inverted_cdf' supports weights. " + f"Got: {method}.") raise ValueError(msg) weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): @@ -4605,7 +4606,8 @@ def quantile(a, if weights is not None: if method != "inverted_cdf": - msg = f"Only method 'inverted_cdf' supports weights. Got: {method}." + msg = ("Only method 'inverted_cdf' supports weights. " + "Got: {method}.") raise ValueError(msg) weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): From b2aeeac701762b70c2223ee61c40d91621839d9a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Jan 2024 10:22:22 +0100 Subject: [PATCH 25/40] FIX axis before _weights_are_valid --- numpy/lib/_function_base_impl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 2fc8f2b8d4ce..624609d74ab9 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4294,6 +4294,8 @@ def percentile(a, msg = ("Only method 'inverted_cdf' supports weights. " f"Got: {method}.") raise ValueError(msg) + if axis is not None: + axis = _nx.normalize_axis_tuple(axis, a.ndim, argname="axis") weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): raise ValueError("Weights must be non-negative.") From af5861964e4805bc20807905cad1b3bda7e1f178 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Jan 2024 12:00:50 +0100 Subject: [PATCH 26/40] FIX axis in weighted quantile --- numpy/lib/_function_base_impl.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 624609d74ab9..b4b25abf0300 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -14,7 +14,7 @@ ) from numpy._core.umath import ( pi, add, arctan2, frompyfunc, cos, less_equal, sqrt, sin, - mod, exp, not_equal, subtract + mod, exp, not_equal, subtract, minimum ) from numpy._core.fromnumeric import ( ravel, nonzero, partition, mean, any, sum @@ -4609,8 +4609,10 @@ def quantile(a, if weights is not None: if method != "inverted_cdf": msg = ("Only method 'inverted_cdf' supports weights. " - "Got: {method}.") + f"Got: {method}.") raise ValueError(msg) + if axis is not None: + axis = _nx.normalize_axis_tuple(axis, a.ndim, argname="axis") weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): raise ValueError("Weights must be non-negative.") @@ -4959,11 +4961,11 @@ def _quantile( # sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i) def find_cdf_1d(arr, cdf): - i = np.searchsorted(cdf, quantiles, side="left") + indices = np.searchsorted(cdf, quantiles, side="left") # We might have reached the maximum with i = len(arr), e.g. for # quantiles == 1. - i = min(i, values_count - 1) - result = take(arr, i, axis=0, out=out) + indices = minimum(indices, values_count - 1) + result = take(arr, indices, axis=0, out=out) return result r_shape = arr.shape[1:] From 423b40124d6505f9b7590aa6abf2a792ec6b2235 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Jan 2024 17:32:55 +0100 Subject: [PATCH 27/40] FIX weights in _percentile_dispatcher --- numpy/lib/_function_base_impl.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index b4b25abf0300..5fb3fad6a6a3 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -3950,8 +3950,9 @@ def _median(a, axis=None, out=None, overwrite_input=False): def _percentile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, - method=None, keepdims=None, *, interpolation=None): - return (a, q, out) + method=None, keepdims=None, *, weights=None, + interpolation=None): + return (a, q, out, weights) @array_function_dispatch(_percentile_dispatcher) From 4f7841a907802cfa19d1123c526ae730ad77dafb Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Jan 2024 17:37:26 +0100 Subject: [PATCH 28/40] CLN replace np.s_[..., ] by (...,) --- numpy/lib/_function_base_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 5fb3fad6a6a3..5964a8945d42 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4977,7 +4977,7 @@ def find_cdf_1d(arr, cdf): # always, so we remove it here. Nk = arr.shape[1:] for kk in np.ndindex(Nk): - result[kk + np.s_[..., ]] = find_cdf_1d( + result[kk + (...,)] = find_cdf_1d( arr[np.s_[:, ] + kk], cdf[np.s_[:, ] + kk] ) From 7cc8771c196030726db465a12b78eae266674172 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Jan 2024 22:42:21 +0100 Subject: [PATCH 29/40] TST add weighted case to test_percentile_out --- numpy/lib/tests/test_function_base.py | 38 ++++++++++++++++++--------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 034f1abd32c8..1327bf6d2178 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3295,29 +3295,41 @@ def test_exception(self): def test_percentile_list(self): assert_equal(np.percentile([1, 2, 3], 0), 1) - def test_percentile_out(self): + @pytest.mark.parametrize( + "percentile, with_weights", + [ + (np.percentile, False), + (partial(np.percentile, method="inverted_cdf"), True), + ] + ) + def test_percentile_out(self, percentile, with_weights): + out_dtype = int if with_weights else float x = np.array([1, 2, 3]) - y = np.zeros((3,)) + y = np.zeros((3,), dtype=out_dtype) p = (1, 2, 3) - np.percentile(x, p, out=y) - assert_equal(np.percentile(x, p), y) + weights = np.ones_like(x) if with_weights else None + percentile(x, p, out=y, weights=weights) + assert_equal(percentile(x, p, weights=weights), y) x = np.array([[1, 2, 3], [4, 5, 6]]) + y = np.zeros((3, 3), dtype=out_dtype) + weights = np.ones_like(x) if with_weights else None + percentile(x, p, axis=0, out=y, weights=weights) + assert_equal(percentile(x, p, weights=weights, axis=0), y) - y = np.zeros((3, 3)) - np.percentile(x, p, axis=0, out=y) - assert_equal(np.percentile(x, p, axis=0), y) - - y = np.zeros((3, 2)) - np.percentile(x, p, axis=1, out=y) - assert_equal(np.percentile(x, p, axis=1), y) + y = np.zeros((3, 2), dtype=out_dtype) + percentile(x, p, axis=1, out=y, weights=weights) + assert_equal(percentile(x, p, weights=weights, axis=1), y) x = np.arange(12).reshape(3, 4) # q.dim > 1, float r0 = np.array([[2., 3., 4., 5.], [4., 5., 6., 7.]]) - out = np.empty((2, 4)) - assert_equal(np.percentile(x, (25, 50), axis=0, out=out), r0) + out = np.empty((2, 4), dtype=out_dtype) + weights = np.ones_like(x) if with_weights else None + assert_equal( + percentile(x, (25, 50), axis=0, out=out, weights=weights), r0 + ) assert_equal(out, r0) r1 = np.array([[0.75, 4.75, 8.75], [1.5, 5.5, 9.5]]) out = np.empty((2, 3)) From 5aa30c9dd662e5c8a2eea6ad2dbd08a2a1672cf2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Fri, 12 Jan 2024 23:00:00 +0100 Subject: [PATCH 30/40] CLN make out argument trivial but inefficient --- numpy/lib/_function_base_impl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 5964a8945d42..19942fa27a11 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4966,7 +4966,7 @@ def find_cdf_1d(arr, cdf): # We might have reached the maximum with i = len(arr), e.g. for # quantiles == 1. indices = minimum(indices, values_count - 1) - result = take(arr, indices, axis=0, out=out) + result = take(arr, indices, axis=0) return result r_shape = arr.shape[1:] @@ -4980,6 +4980,9 @@ def find_cdf_1d(arr, cdf): result[kk + (...,)] = find_cdf_1d( arr[np.s_[:, ] + kk], cdf[np.s_[:, ] + kk] ) + # TODO: Make this more efficient!!! + if out is not None: + np.copyto(out, result) if np.any(slices_having_nans): if result.ndim == 0 and out is None: From fc0b5b7a4aaefde650285e8d74885582b31749a2 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jan 2024 12:38:15 +0100 Subject: [PATCH 31/40] FIX quantile output shape --- numpy/lib/_function_base_impl.py | 4 ++-- numpy/lib/tests/test_function_base.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 19942fa27a11..f16e51fa6418 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4971,13 +4971,13 @@ def find_cdf_1d(arr, cdf): r_shape = arr.shape[1:] if quantiles.ndim > 0: - r_shape += quantiles.shape + r_shape = quantiles.shape + r_shape result = np.empty_like(arr, shape=r_shape) # See apply_along_axis, which we do for axis=0. Note that Ni = (,) # always, so we remove it here. Nk = arr.shape[1:] for kk in np.ndindex(Nk): - result[kk + (...,)] = find_cdf_1d( + result[(...,) + kk] = find_cdf_1d( arr[np.s_[:, ] + kk], cdf[np.s_[:, ] + kk] ) # TODO: Make this more efficient!!! diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 1327bf6d2178..fb7918fa9e74 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3898,20 +3898,20 @@ def test_quantile_with_weights_and_axis(self, method): assert_allclose(q, q_res) # 1d weight and 1d alpha - alpha = [0.25, 0.5, 0.75, 1] # shape (4,) + alpha = [0, 0.2, 0.4, 0.6, 0.8, 1] # shape (6,) q = np.quantile(y, alpha, weights=w, method=method, axis=1) - q_res = np.zeros(shape=(2, 3, 4)) + q_res = np.zeros(shape=(6, 2, 3)) for i in range(2): for j in range(3): - q_res[i, j, :] = np.quantile( + q_res[:, i, j] = np.quantile( y[i, :, j], alpha, method=method, weights=w ) assert_allclose(q, q_res) # 1d weight and 2d alpha - alpha = [[0.25, 0.5], [0.75, 1]] # shape (2, 2) + alpha = [[0, 0.2], [0.4, 0.6], [0.8, 1]] # shape (3, 2) q = np.quantile(y, alpha, weights=w, method=method, axis=1) - q_res = q_res.reshape((2, 3, 2, 2)) + q_res = q_res.reshape((3, 2, 2, 3)) assert_allclose(q, q_res) # shape of weights equals shape of y From 2d96b566401dc4d61221352a1cb03693d8ef5c7a Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jan 2024 12:53:36 +0100 Subject: [PATCH 32/40] FIX & TST - cast cdf to dtype of quantile to avoid surprises - Convert scalar python objects to pure python objects as in unweighted case - Extend weighted case to test_linear_interpolation - Extend weighted case to test_percentile_out --- numpy/lib/_function_base_impl.py | 36 ++++++++++++++++++------- numpy/lib/tests/test_function_base.py | 38 ++++++++++++++++----------- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index f16e51fa6418..57107f9c2a7d 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4934,8 +4934,9 @@ def _quantile( gamma, out=out) else: - # Weighted case, we need to sort anyway. - # This implements method="inverted_cdf". + # Weighted case + # This implements method="inverted_cdf", the only supported weighted + # method, which needs to sort anyway. weights = np.asanyarray(weights) if axis != 0: weights = np.moveaxis(weights, axis, destination=0) @@ -4954,17 +4955,30 @@ def _quantile( else: # weights is 1d weights = weights.reshape(-1)[index_array, ...] - # weights = np.take_along_axis(weights, index_array, axis=axis) - # We use the weights to calculate the empirical distribution function. - cdf = weights.cumsum(axis=0, dtype=float) - cdf /= cdf[-1, ...] # normalization - # Only inverted_cdf is supported. Search index i such that - # sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i) + + # We use the weights to calculate the empirical cumulative + # distribution function cdf + cdf = weights.cumsum(axis=0, dtype=np.float64) + cdf /= cdf[-1, ...] # normalization to 1 + # Search index i such that + # sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i) + # is then equivalent to + # cdf[i-1] < quantile <= cdf[i] + # Unfortunately, searchsorted only accepts 1-d arrays as first + # argument, so we will need to iterate over dimensions. + + # Without the following cast, searchsorted can return surprising + # results, e.g. + # np.searchsorted(np.array([0.2, 0.4, 0.6, 0.8, 1.]), + # np.array(0.4, dtype=np.float32), side="left") + # returns 2 instead of 1 because 0.4 is not binary representable. + if quantiles.dtype.kind == "f": + cdf = cdf.astype(quantiles.dtype) def find_cdf_1d(arr, cdf): indices = np.searchsorted(cdf, quantiles, side="left") # We might have reached the maximum with i = len(arr), e.g. for - # quantiles == 1. + # quantiles = 1, and need to cut it to len(arr) - 1. indices = minimum(indices, values_count - 1) result = take(arr, indices, axis=0) return result @@ -4984,6 +4998,10 @@ def find_cdf_1d(arr, cdf): if out is not None: np.copyto(out, result) + # Make result the same as in unweighted inverted_cdf. + if result.shape == () and result.dtype == np.dtype("O"): + result = result.item() + if np.any(slices_having_nans): if result.ndim == 0 and out is None: # can't write to a scalar, but indexing will be correct diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index fb7918fa9e74..c2b25c843ad1 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3113,21 +3113,23 @@ def test_linear_nan_1D(self, dtype): [(np.quantile, 0.4), (np.percentile, 40.0)]) @pytest.mark.parametrize(["input_dtype", "expected_dtype"], H_F_TYPE_CODES) - @pytest.mark.parametrize(["method", "expected"], - [("inverted_cdf", 20), - ("averaged_inverted_cdf", 27.5), - ("closest_observation", 20), - ("interpolated_inverted_cdf", 20), - ("hazen", 27.5), - ("weibull", 26), - ("linear", 29), - ("median_unbiased", 27), - ("normal_unbiased", 27.125), + @pytest.mark.parametrize(["method", "weighted", "expected"], + [("inverted_cdf", False, 20), + ("inverted_cdf", True, 20), + ("averaged_inverted_cdf", False, 27.5), + ("closest_observation", False, 20), + ("interpolated_inverted_cdf", False, 20), + ("hazen", False, 27.5), + ("weibull", False, 26), + ("linear", False, 29), + ("median_unbiased", False, 27), + ("normal_unbiased", False, 27.125), ]) def test_linear_interpolation(self, function, quantile, method, + weighted, expected, input_dtype, expected_dtype): @@ -3136,6 +3138,7 @@ def test_linear_interpolation(self, expected_dtype = np.promote_types(expected_dtype, np.float64) arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=input_dtype) + weights = np.ones_like(arr) if weighted else None if input_dtype is np.longdouble: if function is np.quantile: # 0.4 is not exactly representable and it matters @@ -3146,12 +3149,12 @@ def test_linear_interpolation(self, else: test_function = np.testing.assert_array_almost_equal_nulp - actual = function(arr, quantile, method=method) + actual = function(arr, quantile, method=method, weights=weights) test_function(actual, expected_dtype.type(expected)) if method in ["inverted_cdf", "closest_observation"]: - if input_dtype == "O": + if input_dtype == "O" : np.testing.assert_equal(np.asarray(actual).dtype, np.float64) else: np.testing.assert_equal(np.asarray(actual).dtype, @@ -3308,14 +3311,16 @@ def test_percentile_out(self, percentile, with_weights): y = np.zeros((3,), dtype=out_dtype) p = (1, 2, 3) weights = np.ones_like(x) if with_weights else None - percentile(x, p, out=y, weights=weights) + r = percentile(x, p, out=y, weights=weights) + assert r is y assert_equal(percentile(x, p, weights=weights), y) x = np.array([[1, 2, 3], [4, 5, 6]]) y = np.zeros((3, 3), dtype=out_dtype) weights = np.ones_like(x) if with_weights else None - percentile(x, p, axis=0, out=y, weights=weights) + r = percentile(x, p, axis=0, out=y, weights=weights) + assert r is y assert_equal(percentile(x, p, weights=weights, axis=0), y) y = np.zeros((3, 2), dtype=out_dtype) @@ -3324,7 +3329,10 @@ def test_percentile_out(self, percentile, with_weights): x = np.arange(12).reshape(3, 4) # q.dim > 1, float - r0 = np.array([[2., 3., 4., 5.], [4., 5., 6., 7.]]) + if with_weights: + r0 = np.array([[0, 1, 2, 3], [4, 5, 6, 7]]) + else: + r0 = np.array([[2., 3., 4., 5.], [4., 5., 6., 7.]]) out = np.empty((2, 4), dtype=out_dtype) weights = np.ones_like(x) if with_weights else None assert_equal( From 5f93d915a8223a9665c3044e52f19e9dd490a030 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jan 2024 12:58:13 +0100 Subject: [PATCH 33/40] CLN repacify linter --- numpy/lib/tests/test_function_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index c2b25c843ad1..28e22b5f8279 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3154,7 +3154,7 @@ def test_linear_interpolation(self, test_function(actual, expected_dtype.type(expected)) if method in ["inverted_cdf", "closest_observation"]: - if input_dtype == "O" : + if input_dtype == "O": np.testing.assert_equal(np.asarray(actual).dtype, np.float64) else: np.testing.assert_equal(np.asarray(actual).dtype, From 5357c7786b6f9dd3fe0d435d99789246af87e24c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jan 2024 13:04:01 +0100 Subject: [PATCH 34/40] ENH make out efficient again --- numpy/lib/_function_base_impl.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 57107f9c2a7d..08cee2071480 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4986,7 +4986,15 @@ def find_cdf_1d(arr, cdf): r_shape = arr.shape[1:] if quantiles.ndim > 0: r_shape = quantiles.shape + r_shape - result = np.empty_like(arr, shape=r_shape) + if out is None: + result = np.empty_like(arr, shape=r_shape) + else: + if out.shape != r_shape: + msg = (f"Wrong shape of argument 'out', shape={r_shape} is " + f"required; got shape={out.shape}.") + raise ValueError(msg) + result = out + # See apply_along_axis, which we do for axis=0. Note that Ni = (,) # always, so we remove it here. Nk = arr.shape[1:] @@ -4994,9 +5002,6 @@ def find_cdf_1d(arr, cdf): result[(...,) + kk] = find_cdf_1d( arr[np.s_[:, ] + kk], cdf[np.s_[:, ] + kk] ) - # TODO: Make this more efficient!!! - if out is not None: - np.copyto(out, result) # Make result the same as in unweighted inverted_cdf. if result.shape == () and result.dtype == np.dtype("O"): From 5581292ea141b01c6ffd31b716f4128e82caa945 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Sun, 14 Jan 2024 13:22:33 +0100 Subject: [PATCH 35/40] FIX nanquantile and nanpercentile --- numpy/lib/_nanfunctions_impl.py | 40 ++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/numpy/lib/_nanfunctions_impl.py b/numpy/lib/_nanfunctions_impl.py index 9cbbf88e880a..d6880daaaa14 100644 --- a/numpy/lib/_nanfunctions_impl.py +++ b/numpy/lib/_nanfunctions_impl.py @@ -23,6 +23,7 @@ import functools import warnings import numpy as np +import numpy._core.numeric as _nx from numpy.lib import _function_base_impl as fnb from numpy.lib._function_base_impl import _weights_are_valid from numpy._core import overrides @@ -1221,8 +1222,8 @@ def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=np._NoValu def _nanpercentile_dispatcher( a, q, axis=None, out=None, overwrite_input=None, - method=None, keepdims=None, *, interpolation=None): - return (a, q, out) + method=None, keepdims=None, *, weights=None, interpolation=None): + return (a, q, out, weights) @array_function_dispatch(_nanpercentile_dispatcher) @@ -1396,7 +1397,11 @@ def nanpercentile( if weights is not None: if method != "inverted_cdf": - raise ValueError("Only method 'inverted_cdf' supports weights.") + msg = ("Only method 'inverted_cdf' supports weights. " + f"Got: {method}.") + raise ValueError(msg) + if axis is not None: + axis = _nx.normalize_axis_tuple(axis, a.ndim, argname="axis") weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): raise ValueError("Weights must be non-negative.") @@ -1406,8 +1411,9 @@ def nanpercentile( def _nanquantile_dispatcher(a, q, axis=None, out=None, overwrite_input=None, - method=None, keepdims=None, *, interpolation=None): - return (a, q, out) + method=None, keepdims=None, *, weights=None, + interpolation=None): + return (a, q, out, weights) @array_function_dispatch(_nanquantile_dispatcher) @@ -1582,7 +1588,11 @@ def nanquantile( if weights is not None: if method != "inverted_cdf": - raise ValueError("Only method 'inverted_cdf' supports weights.") + msg = ("Only method 'inverted_cdf' supports weights. " + f"Got: {method}.") + raise ValueError(msg) + if axis is not None: + axis = _nx.normalize_axis_tuple(axis, a.ndim, argname="axis") weights = _weights_are_valid(weights=weights, a=a, axis=axis) if np.any(weights < 0): raise ValueError("Weights must be non-negative.") @@ -1609,16 +1619,23 @@ def _nanquantile_unchecked( return fnb._ureduce(a, func=_nanquantile_ureduce_func, q=q, + weights=weights, keepdims=keepdims, axis=axis, out=out, overwrite_input=overwrite_input, - method=method, - weights=weights) + method=method) -def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, - method="linear", weights=None): +def _nanquantile_ureduce_func( + a: np.array, + q: np.array, + weights: np.array, + axis: int = None, + out=None, + overwrite_input: bool = False, + method="linear", +): """ Private function that doesn't support extended axis or keepdims. These methods are extended to this function using _ureduce @@ -1626,7 +1643,8 @@ def _nanquantile_ureduce_func(a, q, axis=None, out=None, overwrite_input=False, """ if axis is None or a.ndim == 1: part = a.ravel() - result = _nanquantile_1d(part, q, overwrite_input, method, weights) + wgt = None if weights is None else weights.ravel() + result = _nanquantile_1d(part, q, overwrite_input, method, weights=wgt) else: result = np.apply_along_axis(_nanquantile_1d, axis, a, q, overwrite_input, method, weights) From 0bbe7d9149b02df06b3215181035cbac3526655c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Jan 2024 08:06:57 +0100 Subject: [PATCH 36/40] TST add tests for nanquantile and nanpercentile --- numpy/lib/tests/test_nanfunctions.py | 73 +++++++++++++++++++--------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py index cfa68b037be1..5753c4d375b6 100644 --- a/numpy/lib/tests/test_nanfunctions.py +++ b/numpy/lib/tests/test_nanfunctions.py @@ -1,6 +1,7 @@ import warnings import pytest import inspect +from functools import partial import numpy as np from numpy._core.numeric import normalize_axis_tuple @@ -1087,21 +1088,28 @@ def test_keepdims_out(self, q, axis): assert result is out assert_equal(result.shape, shape_out) - def test_out(self): + @pytest.mark.parametrize("weighted", [False, True]) + def test_out(self, weighted): mat = np.random.rand(3, 3) nan_mat = np.insert(mat, [0, 2], np.nan, axis=1) resout = np.zeros(3) - tgt = np.percentile(mat, 42, axis=1) - res = np.nanpercentile(nan_mat, 42, axis=1, out=resout) + if weighted: + w_args = {"weights": np.ones_like(mat), "method": "inverted_cdf"} + nan_w_args = {"weights": np.ones_like(nan_mat), "method": "inverted_cdf"} + else: + w_args = dict() + nan_w_args = dict() + tgt = np.percentile(mat, 42, axis=1, **w_args) + res = np.nanpercentile(nan_mat, 42, axis=1, out=resout, **nan_w_args) assert_almost_equal(res, resout) assert_almost_equal(res, tgt) # 0-d output: resout = np.zeros(()) - tgt = np.percentile(mat, 42, axis=None) - res = np.nanpercentile(nan_mat, 42, axis=None, out=resout) + tgt = np.percentile(mat, 42, axis=None, **w_args) + res = np.nanpercentile(nan_mat, 42, axis=None, out=resout, **nan_w_args) assert_almost_equal(res, resout) assert_almost_equal(res, tgt) - res = np.nanpercentile(nan_mat, 42, axis=(0, 1), out=resout) + res = np.nanpercentile(nan_mat, 42, axis=(0, 1), out=resout, **nan_w_args) assert_almost_equal(res, resout) assert_almost_equal(res, tgt) @@ -1113,13 +1121,28 @@ def test_complex(self): arr_c = np.array([0.5+3.0j, 2.1+0.5j, 1.6+2.3j], dtype='F') assert_raises(TypeError, np.nanpercentile, arr_c, 0.5) - def test_result_values(self): - tgt = [np.percentile(d, 28) for d in _rdat] - res = np.nanpercentile(_ndat, 28, axis=1) + @pytest.mark.parametrize("weighted", [False, True]) + def test_result_values(self, weighted): + if weighted: + percentile = partial(np.percentile, method="inverted_cdf") + nanpercentile = partial(np.nanpercentile, method="inverted_cdf") + + def gen_weights(d): + return np.ones_like(d) + + else: + percentile = np.percentile + nanpercentile = np.nanpercentile + + def gen_weights(d): + return None + + tgt = [percentile(d, 28, weights=gen_weights(d)) for d in _rdat] + res = nanpercentile(_ndat, 28, axis=1, weights=gen_weights(_ndat)) assert_almost_equal(res, tgt) # Transpose the array to fit the output convention of numpy.percentile - tgt = np.transpose([np.percentile(d, (28, 98)) for d in _rdat]) - res = np.nanpercentile(_ndat, (28, 98), axis=1) + tgt = np.transpose([percentile(d, (28, 98), weights=gen_weights(d)) for d in _rdat]) + res = nanpercentile(_ndat, (28, 98), axis=1, weights=gen_weights(_ndat)) assert_almost_equal(res, tgt) @pytest.mark.parametrize("axis", [None, 0, 1]) @@ -1197,19 +1220,25 @@ def test_multiple_percentiles(self): class TestNanFunctions_Quantile: # most of this is already tested by TestPercentile - def test_regression(self): + @pytest.mark.parametrize("weighted", [False, True]) + def test_regression(self, weighted): ar = np.arange(24).reshape(2, 3, 4).astype(float) ar[0][1] = np.nan - - assert_equal(np.nanquantile(ar, q=0.5), np.nanpercentile(ar, q=50)) - assert_equal(np.nanquantile(ar, q=0.5, axis=0), - np.nanpercentile(ar, q=50, axis=0)) - assert_equal(np.nanquantile(ar, q=0.5, axis=1), - np.nanpercentile(ar, q=50, axis=1)) - assert_equal(np.nanquantile(ar, q=[0.5], axis=1), - np.nanpercentile(ar, q=[50], axis=1)) - assert_equal(np.nanquantile(ar, q=[0.25, 0.5, 0.75], axis=1), - np.nanpercentile(ar, q=[25, 50, 75], axis=1)) + if weighted: + w_args = {"weights": np.ones_like(ar), "method": "inverted_cdf"} + else: + w_args = dict() + + assert_equal(np.nanquantile(ar, q=0.5, **w_args), + np.nanpercentile(ar, q=50, **w_args)) + assert_equal(np.nanquantile(ar, q=0.5, axis=0, **w_args), + np.nanpercentile(ar, q=50, axis=0, **w_args)) + assert_equal(np.nanquantile(ar, q=0.5, axis=1, **w_args), + np.nanpercentile(ar, q=50, axis=1, **w_args)) + assert_equal(np.nanquantile(ar, q=[0.5], axis=1, **w_args), + np.nanpercentile(ar, q=[50], axis=1, **w_args)) + assert_equal(np.nanquantile(ar, q=[0.25, 0.5, 0.75], axis=1, **w_args), + np.nanpercentile(ar, q=[25, 50, 75], axis=1, **w_args)) def test_basic(self): x = np.arange(8) * 0.5 From 63c3536e5de8059e12bb4c2ace667e0f9a706d8c Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Jan 2024 11:26:40 +0100 Subject: [PATCH 37/40] FIX nan handling with slices_having_nans --- numpy/lib/_function_base_impl.py | 30 ++++++++++++------------ numpy/lib/tests/test_function_base.py | 33 ++++++++++++++++++--------- numpy/lib/tests/test_nanfunctions.py | 16 +++++++++---- 3 files changed, 50 insertions(+), 29 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index 08cee2071480..ee31b090599e 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4874,14 +4874,17 @@ def _quantile( values_count = arr.shape[axis] # The dimensions of `q` are prepended to the output shape, so we need the # axis being sampled from `arr` to be last. - if axis != 0: # But moveaxis is slow, so only call it if necessary. arr = np.moveaxis(arr, axis, destination=0) - # --- Computation of indexes - # Index where to find the value in the sorted array. - # Virtual because it is a floating point value, not an valid index. - # The nearest neighbours are used for interpolation + supports_nans = ( + np.issubdtype(arr.dtype, np.inexact) or arr.dtype.kind in 'Mm' + ) + if weights is None: + # --- Computation of indexes + # Index where to find the value in the sorted array. + # Virtual because it is a floating point value, not an valid index. + # The nearest neighbours are used for interpolation try: method = _QuantileMethods[method] except KeyError: @@ -4891,9 +4894,6 @@ def _quantile( virtual_indexes = method["get_virtual_index"](values_count, quantiles) virtual_indexes = np.asanyarray(virtual_indexes) - supports_nans = ( - np.issubdtype(arr.dtype, np.inexact) or arr.dtype.kind in 'Mm') - if np.issubdtype(virtual_indexes.dtype, np.integer): # No interpolation needed, take the points along axis if supports_nans: @@ -4941,11 +4941,6 @@ def _quantile( if axis != 0: weights = np.moveaxis(weights, axis, destination=0) index_array = np.argsort(arr, axis=0, kind="stable") - # TODO: Which value to set to slices_having_nans. - slices_having_nans = None - # TODO: Deal with nan values, e.g. np.isnan(arr(index_array[-1])) by - # be true. - # TODO: Special case quantiles == 0 and quantiles == 1? # arr = arr[index_array, ...] # but this adds trailing dimensions of # 1. @@ -4955,7 +4950,14 @@ def _quantile( else: # weights is 1d weights = weights.reshape(-1)[index_array, ...] - + + if supports_nans: + # may contain nan, which would sort to the end + slices_having_nans = np.isnan(arr[-1, ...]) + else: + # cannot contain nan + slices_having_nans = np.array(False, dtype=bool) + # We use the weights to calculate the empirical cumulative # distribution function cdf cdf = weights.cumsum(axis=0, dtype=np.float64) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 28e22b5f8279..4d9861434958 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3047,6 +3047,11 @@ def test_basic(self): x[1] = np.nan assert_equal(np.percentile(x, 0), np.nan) assert_equal(np.percentile(x, 0, method='nearest'), np.nan) + assert_equal(np.percentile(x, 0, method='inverted_cdf'), np.nan) + assert_equal( + np.percentile(x, 0, method='inverted_cdf', weights=np.ones_like(x)), + np.nan, + ) def test_fraction(self): x = [Fraction(i, 2) for i in range(8)] @@ -3124,7 +3129,7 @@ def test_linear_nan_1D(self, dtype): ("linear", False, 29), ("median_unbiased", False, 27), ("normal_unbiased", False, 27.125), - ]) + ]) def test_linear_interpolation(self, function, quantile, @@ -3511,23 +3516,29 @@ def test_out(self): assert_equal(np.percentile(d, 2, out=o), o) assert_equal(np.percentile(d, 2, method='nearest', out=o), o) - def test_out_nan(self): + @pytest.mark.parametrize("method, weighted", [ + ("linear", False), + ("nearest", False), + ("inverted_cdf", False), + ("inverted_cdf", True), + ]) + def test_out_nan(self, method, weighted): + if weighted: + kwargs = {"weights": np.ones((3, 4)), "method": method} + else: + kwargs = {"method": method} with warnings.catch_warnings(record=True): warnings.filterwarnings('always', '', RuntimeWarning) o = np.zeros((4,)) d = np.ones((3, 4)) d[2, 1] = np.nan - assert_equal(np.percentile(d, 0, 0, out=o), o) - assert_equal( - np.percentile(d, 0, 0, method='nearest', out=o), o) + assert_equal(np.percentile(d, 0, 0, out=o, **kwargs), o) + o = np.zeros((3,)) - assert_equal(np.percentile(d, 1, 1, out=o), o) - assert_equal( - np.percentile(d, 1, 1, method='nearest', out=o), o) + assert_equal(np.percentile(d, 1, 1, out=o, **kwargs), o) + o = np.zeros(()) - assert_equal(np.percentile(d, 1, out=o), o) - assert_equal( - np.percentile(d, 1, method='nearest', out=o), o) + assert_equal(np.percentile(d, 1, out=o, **kwargs), o) def test_nan_behavior(self): a = np.arange(24, dtype=float) diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py index 5753c4d375b6..1526111d2145 100644 --- a/numpy/lib/tests/test_nanfunctions.py +++ b/numpy/lib/tests/test_nanfunctions.py @@ -1095,7 +1095,9 @@ def test_out(self, weighted): resout = np.zeros(3) if weighted: w_args = {"weights": np.ones_like(mat), "method": "inverted_cdf"} - nan_w_args = {"weights": np.ones_like(nan_mat), "method": "inverted_cdf"} + nan_w_args = { + "weights": np.ones_like(nan_mat), "method": "inverted_cdf" + } else: w_args = dict() nan_w_args = dict() @@ -1106,10 +1108,14 @@ def test_out(self, weighted): # 0-d output: resout = np.zeros(()) tgt = np.percentile(mat, 42, axis=None, **w_args) - res = np.nanpercentile(nan_mat, 42, axis=None, out=resout, **nan_w_args) + res = np.nanpercentile( + nan_mat, 42, axis=None, out=resout, **nan_w_args + ) assert_almost_equal(res, resout) assert_almost_equal(res, tgt) - res = np.nanpercentile(nan_mat, 42, axis=(0, 1), out=resout, **nan_w_args) + res = np.nanpercentile( + nan_mat, 42, axis=(0, 1), out=resout, **nan_w_args + ) assert_almost_equal(res, resout) assert_almost_equal(res, tgt) @@ -1214,7 +1220,9 @@ def test_multiple_percentiles(self): assert_equal(nan_val, val) megamat = np.ones((3, 4, 5, 6)) - assert_equal(np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6)) + assert_equal( + np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6) + ) class TestNanFunctions_Quantile: From 798845f44eb26bfb6c365e8f1e27403d9ff45931 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Jan 2024 11:29:00 +0100 Subject: [PATCH 38/40] CLN fix BE/AE inconsistency --- numpy/lib/_function_base_impl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numpy/lib/_function_base_impl.py b/numpy/lib/_function_base_impl.py index ee31b090599e..c9a52a07e8f7 100644 --- a/numpy/lib/_function_base_impl.py +++ b/numpy/lib/_function_base_impl.py @@ -4083,7 +4083,7 @@ def percentile(a, distribution function of the data, i.e. :math:`F(y) = \\frac{1}{n} \\sum_i 1_{a_i \\leq y}`. Then, different methods correspond to different choices of :math:`x` that - fulfil the above inequalities. Methods that follow this approach are + fulfill the above inequalities. Methods that follow this approach are ``inverted_cdf`` and ``averaged_inverted_cdf``. A more general way to define sample percentile estimators is as follows. @@ -4433,7 +4433,7 @@ def quantile(a, distribution function of the data, i.e. :math:`F(y) = \\frac{1}{n} \\sum_i 1_{a_i \\leq y}`. Then, different methods correspond to different choices of :math:`x` that - fulfil the above inequalities. Methods that follow this approach are + fulfill the above inequalities. Methods that follow this approach are ``inverted_cdf`` and ``averaged_inverted_cdf``. A more general way to define sample quantile estimators is as follows. From 01b6ad4b6942f1fa0868dfdc7933610549dda34e Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Jan 2024 12:26:04 +0100 Subject: [PATCH 39/40] CLN tame the linter --- numpy/lib/tests/test_function_base.py | 3 ++- numpy/lib/tests/test_nanfunctions.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py index 4d9861434958..27fdb10c1a02 100644 --- a/numpy/lib/tests/test_function_base.py +++ b/numpy/lib/tests/test_function_base.py @@ -3049,7 +3049,8 @@ def test_basic(self): assert_equal(np.percentile(x, 0, method='nearest'), np.nan) assert_equal(np.percentile(x, 0, method='inverted_cdf'), np.nan) assert_equal( - np.percentile(x, 0, method='inverted_cdf', weights=np.ones_like(x)), + np.percentile(x, 0, method='inverted_cdf', + weights=np.ones_like(x)), np.nan, ) diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py index 1526111d2145..a57b26557ae9 100644 --- a/numpy/lib/tests/test_nanfunctions.py +++ b/numpy/lib/tests/test_nanfunctions.py @@ -1147,7 +1147,8 @@ def gen_weights(d): res = nanpercentile(_ndat, 28, axis=1, weights=gen_weights(_ndat)) assert_almost_equal(res, tgt) # Transpose the array to fit the output convention of numpy.percentile - tgt = np.transpose([percentile(d, (28, 98), weights=gen_weights(d)) for d in _rdat]) + tgt = np.transpose([percentile(d, (28, 98), weights=gen_weights(d)) + for d in _rdat]) res = nanpercentile(_ndat, (28, 98), axis=1, weights=gen_weights(_ndat)) assert_almost_equal(res, tgt) From bbdd595dcfca968637edd7aa6eb94ed73e4968d8 Mon Sep 17 00:00:00 2001 From: Christian Lorentzen Date: Mon, 15 Jan 2024 12:30:43 +0100 Subject: [PATCH 40/40] CLN linter again --- numpy/lib/tests/test_nanfunctions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py index a57b26557ae9..f77e2c0aac74 100644 --- a/numpy/lib/tests/test_nanfunctions.py +++ b/numpy/lib/tests/test_nanfunctions.py @@ -1149,7 +1149,8 @@ def gen_weights(d): # Transpose the array to fit the output convention of numpy.percentile tgt = np.transpose([percentile(d, (28, 98), weights=gen_weights(d)) for d in _rdat]) - res = nanpercentile(_ndat, (28, 98), axis=1, weights=gen_weights(_ndat)) + res = nanpercentile(_ndat, (28, 98), axis=1, + weights=gen_weights(_ndat)) assert_almost_equal(res, tgt) @pytest.mark.parametrize("axis", [None, 0, 1])