From 6dedffde926b636fc8bb8089f15117f3d13f51d7 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 2 Jun 2019 20:43:45 -0700 Subject: [PATCH 1/2] bpo-36546: Add design notes to aid future discussions --- Lib/statistics.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/Lib/statistics.py b/Lib/statistics.py index 19db8e828010134..194212084f58e60 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -564,6 +564,45 @@ def multimode(data): maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, [])) return list(map(itemgetter(0), mode_items)) +# Notes on methods for computing quantiles +# ---------------------------------------- +# +# There is no one perfect way to compute quantiles. Here we offer +# two methods that serve common needs. Most other packages +# surveyed offered at least one or both of these two, making them +# "standard" in the sense of "widely-adopted and reproducible". +# They are also easy to explain, easy to compute manually, and have +# straight-forward interpretations that aren't surprising. + +# The default method is known as "R6", "PERCENTILE.EXC", or "expected +# value of rank order statistics". The alternative method is known as +# "R7", "PERCENTILE.INC", or "mode of rank order statistics". + +# For sample data where there is a positive probability for values +# beyond the range of the data, the R6 exclusive method is a +# reasonable choice. Consider a random sample of nine values from a +# population with a uniform distribution from 0.0 to 1.0. The +# distribution of the third ranked sample point is described by +# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and +# mean=0.300. Only the latter (which corresponds with R6) gives the +# desired cut point with 30% of the population falling below that +# value, making it comparable to a result from an inv_cdf() function. 
+ +# For describing population data where the end points are known to +# be included in the data, the R7 inclusive method is a reasonable +# choice. Instead of the mean, it uses the mode of beta +# distribution for the interior points. Per Hyndman & Fan, "One nice +# property is that the vertices of Q7(p) divide the range into n - 1 +# intervals, and exactly 100p% of the intervals lie to the left of +# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)." + +# If the need arises, we could add method="median" for a median +# unbiased, distribution-free alternative. Also if needed, the +# distribution-free approaches could be augmented by adding +# method='normal'. However, for now, the position is that fewer +# options make for easier choices and than external packages can be +# used for anything more advanced. + def quantiles(dist, *, n=4, method='exclusive'): '''Divide *dist* into *n* continuous intervals with equal probability. From 613e2e3e50d43d5c9b7798508035ee1f9f19f3b5 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Sun, 2 Jun 2019 20:48:47 -0700 Subject: [PATCH 2/2] Fix two typos --- Lib/statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/statistics.py b/Lib/statistics.py index 194212084f58e60..012845b8d2ef4c3 100644 --- a/Lib/statistics.py +++ b/Lib/statistics.py @@ -590,7 +590,7 @@ def multimode(data): # For describing population data where the end points are known to # be included in the data, the R7 inclusive method is a reasonable -# choice. Instead of the mean, it uses the mode of beta +# choice. Instead of the mean, it uses the mode of the beta # distribution for the interior points. Per Hyndman & Fan, "One nice # property is that the vertices of Q7(p) divide the range into n - 1 # intervals, and exactly 100p% of the intervals lie to the left of @@ -600,7 +600,7 @@ def multimode(data): # unbiased, distribution-free alternative. 
Also if needed, the # distribution-free approaches could be augmented by adding # method='normal'. However, for now, the position is that fewer -# options make for easier choices and than external packages can be +# options make for easier choices and that external packages can be # used for anything more advanced. def quantiles(dist, *, n=4, method='exclusive'):