138
138
from itertools import count , groupby , repeat
139
139
from bisect import bisect_left , bisect_right
140
140
from math import hypot , sqrt , fabs , exp , erf , tau , log , fsum , sumprod
141
- from math import isfinite , isinf , pi , cos , cosh
141
+ from math import isfinite , isinf , pi , cos , sin , cosh , atan
142
142
from functools import reduce
143
143
from operator import itemgetter
144
144
from collections import Counter , namedtuple , defaultdict
@@ -803,9 +803,9 @@ def multimode(data):
803
803
return [value for value , count in counts .items () if count == maxcount ]
804
804
805
805
806
- def kde (data , h , kernel = 'normal' ):
807
- """Kernel Density Estimation: Create a continuous probability
808
- density function from discrete samples.
806
+ def kde (data , h , kernel = 'normal' , * , cumulative = False ):
807
+ """Kernel Density Estimation: Create a continuous probability density
808
+ function or cumulative distribution function from discrete samples.
809
809
810
810
The basic idea is to smooth the data using a kernel function
811
811
to help draw inferences about a population from a sample.
@@ -820,20 +820,22 @@ def kde(data, h, kernel='normal'):
820
820
821
821
Kernels that give some weight to every sample point:
822
822
823
- normal or gauss
823
+ normal ( gauss)
824
824
logistic
825
825
sigmoid
826
826
827
827
Kernels that only give weight to sample points within
828
828
the bandwidth:
829
829
830
- rectangular or uniform
830
+ rectangular ( uniform)
831
831
triangular
832
- parabolic or epanechnikov
833
- quartic or biweight
832
+ parabolic ( epanechnikov)
833
+ quartic ( biweight)
834
834
triweight
835
835
cosine
836
836
837
+ If *cumulative* is true, will return a cumulative distribution function.
838
+
837
839
A StatisticsError will be raised if the data sequence is empty.
838
840
839
841
Example
@@ -847,7 +849,8 @@ def kde(data, h, kernel='normal'):
847
849
848
850
Compute the area under the curve:
849
851
850
- >>> sum(f_hat(x) for x in range(-20, 20))
852
+ >>> area = sum(f_hat(x) for x in range(-20, 20))
853
+ >>> round(area, 4)
851
854
1.0
852
855
853
856
Plot the estimated probability density function at
@@ -876,6 +879,13 @@ def kde(data, h, kernel='normal'):
876
879
9: 0.009 x
877
880
10: 0.002 x
878
881
882
+ Estimate P(4.5 < X <= 7.5), the probability that a new sample value
883
+ will be between 4.5 and 7.5:
884
+
885
+ >>> cdf = kde(sample, h=1.5, cumulative=True)
886
+ >>> round(cdf(7.5) - cdf(4.5), 2)
887
+ 0.22
888
+
879
889
References
880
890
----------
881
891
@@ -888,6 +898,9 @@ def kde(data, h, kernel='normal'):
888
898
Interactive graphical demonstration and exploration:
889
899
https://demonstrations.wolfram.com/KernelDensityEstimation/
890
900
901
+ Kernel estimation of cumulative distribution function of a random variable with bounded support
902
+ https://www.econstor.eu/bitstream/10419/207829/1/10.21307_stattrans-2016-037.pdf
903
+
891
904
"""
892
905
893
906
n = len (data )
@@ -903,45 +916,56 @@ def kde(data, h, kernel='normal'):
903
916
match kernel :
904
917
905
918
case 'normal' | 'gauss' :
906
- c = 1 / sqrt (2 * pi )
907
- K = lambda t : c * exp (- 1 / 2 * t * t )
919
+ sqrt2pi = sqrt (2 * pi )
920
+ sqrt2 = sqrt (2 )
921
+ K = lambda t : exp (- 1 / 2 * t * t ) / sqrt2pi
922
+ I = lambda t : 1 / 2 * (1.0 + erf (t / sqrt2 ))
908
923
support = None
909
924
910
925
case 'logistic' :
911
926
# 1.0 / (exp(t) + 2.0 + exp(-t))
912
927
K = lambda t : 1 / 2 / (1.0 + cosh (t ))
928
+ I = lambda t : 1.0 - 1.0 / (exp (t ) + 1.0 )
913
929
support = None
914
930
915
931
case 'sigmoid' :
916
932
# (2/pi) / (exp(t) + exp(-t))
917
- c = 1 / pi
918
- K = lambda t : c / cosh (t )
933
+ c1 = 1 / pi
934
+ c2 = 2 / pi
935
+ K = lambda t : c1 / cosh (t )
936
+ I = lambda t : c2 * atan (exp (t ))
919
937
support = None
920
938
921
939
case 'rectangular' | 'uniform' :
922
940
K = lambda t : 1 / 2
941
+ I = lambda t : 1 / 2 * t + 1 / 2
923
942
support = 1.0
924
943
925
944
case 'triangular' :
926
945
K = lambda t : 1.0 - abs (t )
946
+ I = lambda t : t * t * (1 / 2 if t < 0.0 else - 1 / 2 ) + t + 1 / 2
927
947
support = 1.0
928
948
929
949
case 'parabolic' | 'epanechnikov' :
930
950
K = lambda t : 3 / 4 * (1.0 - t * t )
951
+ I = lambda t : - 1 / 4 * t ** 3 + 3 / 4 * t + 1 / 2
931
952
support = 1.0
932
953
933
954
case 'quartic' | 'biweight' :
934
955
K = lambda t : 15 / 16 * (1.0 - t * t ) ** 2
956
+ I = lambda t : 3 / 16 * t ** 5 - 5 / 8 * t ** 3 + 15 / 16 * t + 1 / 2
935
957
support = 1.0
936
958
937
959
case 'triweight' :
938
960
K = lambda t : 35 / 32 * (1.0 - t * t ) ** 3
961
+ I = lambda t : 35 / 32 * (- 1 / 7 * t ** 7 + 3 / 5 * t ** 5 - t ** 3 + t ) + 1 / 2
939
962
support = 1.0
940
963
941
964
case 'cosine' :
942
965
c1 = pi / 4
943
966
c2 = pi / 2
944
967
K = lambda t : c1 * cos (c2 * t )
968
+ I = lambda t : 1 / 2 * sin (c2 * t ) + 1 / 2
945
969
support = 1.0
946
970
947
971
case _:
@@ -952,6 +976,9 @@ def kde(data, h, kernel='normal'):
952
976
def pdf (x ):
953
977
return sum (K ((x - x_i ) / h ) for x_i in data ) / (n * h )
954
978
979
+ def cdf (x ):
980
+ return sum (I ((x - x_i ) / h ) for x_i in data ) / n
981
+
955
982
else :
956
983
957
984
sample = sorted (data )
@@ -963,9 +990,19 @@ def pdf(x):
963
990
supported = sample [i : j ]
964
991
return sum (K ((x - x_i ) / h ) for x_i in supported ) / (n * h )
965
992
966
- pdf .__doc__ = f'PDF estimate with { h = !r} and { kernel = !r} '
993
+ def cdf (x ):
994
+ i = bisect_left (sample , x - bandwidth )
995
+ j = bisect_right (sample , x + bandwidth )
996
+ supported = sample [i : j ]
997
+ return sum ((I ((x - x_i ) / h ) for x_i in supported ), i ) / n
967
998
968
- return pdf
999
+ if cumulative :
1000
+ cdf .__doc__ = f'CDF estimate with { h = !r} and { kernel = !r} '
1001
+ return cdf
1002
+
1003
+ else :
1004
+ pdf .__doc__ = f'PDF estimate with { h = !r} and { kernel = !r} '
1005
+ return pdf
969
1006
970
1007
971
1008
# Notes on methods for computing quantiles
0 commit comments