Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

[MRG] Add interpolation to _weighted_percentile (Addresses #6189) #7662

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 68 additions & 9 deletions 77 sklearn/utils/stats.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,77 @@
import numpy as np
"""Statistical utilities including weighted percentile"""

from .extmath import stable_cumsum
import numpy as np
from sklearn.utils.extmath import stable_cumsum


def _weighted_percentile(array, sample_weight, percentile=50):
    """Compute the weighted ``percentile`` of ``array`` with ``sample_weight``.

    The values are sorted and the n-th sorted value is assigned the
    percent rank

        p_n = 100 / S_N * (S_n - w_n / 2)

    where ``w_n`` is the weight of the n-th sorted value, ``S_n`` is the
    cumulative weight ``w_1 + ... + w_n`` and ``S_N`` is the total weight.
    The result is linearly interpolated between neighbouring values

        v = v_k + (v_{k + 1} - v_k) * (P - p_k) / (p_{k + 1} - p_k)

    following
    https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method.
    A ``percentile`` below the first percent rank or above the last one
    yields the smallest or the largest value respectively.

    Parameters
    ----------
    array : array-like, shape = (n_samples,)
        Array of data on which to calculate the weighted percentile

    sample_weight : array-like, shape = (n_samples,)
        Array of corresponding non-negative sample weights with which to
        calculate the weighted percentile

    percentile : int or float, optional (default: 50)
        Non-negative value of the Pth percentile to compute

    Returns
    -------
    v : float
        Linearly interpolated weighted percentile. If all weights are
        zero, the minimum of ``array`` is returned instead.

    Raises
    ------
    ValueError
        If ``sample_weight`` contains a negative value or if
        ``percentile`` is negative.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.stats import _weighted_percentile
    >>> weight = np.array([1, 1])
    >>> data = np.array([0, 1])
    >>> _weighted_percentile(data, weight, percentile=0)
    0.0
    >>> _weighted_percentile(data, weight, percentile=50)
    0.5
    >>> _weighted_percentile(data, weight, percentile=90)
    1.0
    """
    array = np.asarray(array)
    sample_weight = np.asarray(sample_weight)

    if (sample_weight < 0).any():
        raise ValueError("sample_weight must contain positive or 0 weights")

    if percentile < 0:
        raise ValueError("percentile must be positive or 0")

    sorted_idx = np.argsort(array)
    sorted_array = array[sorted_idx]

    # if there are no weights, return the min of ``array``
    if sample_weight.sum() == 0:
        return sorted_array[0]

    # The weights must follow the same ordering as the sorted values;
    # mixing the sorted cumulative sum with unsorted weights would give
    # wrong percent ranks whenever ``array`` is not already sorted.
    sorted_weight = sample_weight[sorted_idx]
    weight_cdf = stable_cumsum(sorted_weight)

    # Percent rank (0-100) attached to each sorted value
    p_n = 100. / weight_cdf[-1] * (weight_cdf - sorted_weight / 2.)
    return np.interp(percentile, p_n, sorted_array)
98 changes: 98 additions & 0 deletions 98 sklearn/utils/tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import numpy as np
from sklearn.utils.testing import assert_equal, assert_raises
from sklearn.utils.stats import _weighted_percentile


def test_weighted_percentile_negative_weights_raises():
    """A negative entry in the weights must raise ``ValueError``."""
    values = np.array([0, 1])
    bad_weights = np.array([1, -1])
    assert_raises(ValueError, _weighted_percentile, values, bad_weights)


def test_weighted_percentile_negative_percentile_raises():
    """A negative ``percentile`` must raise ``ValueError``."""
    # Weights are deliberately valid: with a negative weight the error
    # would come from the weight check and the percentile check would
    # never be exercised.
    weight = np.array([1, 1])
    data = np.array([0, 1])
    percentile = -50
    assert_raises(ValueError, _weighted_percentile, data, weight,
                  percentile=percentile)


def test_weighted_percentile_no_weights():
    """All-zero weights: the expected result is 0 for data ``[0, 1]``."""
    values = np.array([0, 1])
    zero_weights = np.array([0, 0])
    result = _weighted_percentile(values, zero_weights, percentile=50)
    assert_equal(0, result)


def test_weighted_percentile_median_interpolated_list():
    """Plain lists are accepted; the median of ``[0, 1]`` is 0.5."""
    values, weights = [0, 1], [1, 1]
    result = _weighted_percentile(values, weights, percentile=50)
    assert_equal(0.5, result)


def test_weighted_percentile_median_interpolated_tuple():
    """Tuples are accepted; the median of ``(0, 1)`` is 0.5."""
    values, weights = (0, 1), (1, 1)
    result = _weighted_percentile(values, weights, percentile=50)
    assert_equal(0.5, result)


def test_weighted_percentile_median_interpolated():
    """Median of two equally weighted points lies halfway between them."""
    values = np.array([0, 1])
    weights = np.array([1, 1])
    result = _weighted_percentile(values, weights, percentile=50)
    assert_equal(0.5, result)


def test_weighted_percentile_median_regular():
    """Median of three equally weighted points is the middle point."""
    values = np.array([0, 1, 2])
    weights = np.array([1, 1, 1])
    result = _weighted_percentile(values, weights, percentile=50)
    assert_equal(1.0, result)


def test_weighted_percentile_0_regular():
    """The 0th percentile of ``[0, 1, 2]`` is expected to be 0.0."""
    values = np.array([0, 1, 2])
    weights = np.array([1, 1, 1])
    result = _weighted_percentile(values, weights, percentile=0)
    assert_equal(0.0, result)


def test_weighted_percentile_90_regular():
    """The 90th percentile of ``[0, 1, 2]`` is expected to be 2.0."""
    values = np.array([0, 1, 2])
    weights = np.array([1, 1, 1])
    result = _weighted_percentile(values, weights, percentile=90)
    assert_equal(2.0, result)


def test_weighted_percentile_70_interpolated():
    """The 70th percentile of ``[0, 1, 2, 3]`` with equal weights is 2.3."""
    values = np.arange(0, 4, 1)
    weights = np.array([1, 1, 1, 1])
    result = _weighted_percentile(values, weights, percentile=70)
    assert_equal(2.3, result)


def test_weighted_percentile_70_mixed_weights():
    """Percentile with one zero-weight sample among ``[0, 1, 2, 3]``."""
    # NOTE: despite the "70" in the name, this checks the 50th percentile.
    values = np.arange(0, 4, 1)
    weights = np.array([1, 0, 1, 1])
    result = _weighted_percentile(values, weights, percentile=50)
    assert_equal(2.0, result)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.