@@ -3672,12 +3672,12 @@ def stineman_interp(xi,x,y,yp=None):
3672
3672
1 / (dy1 + dy2 ),))
3673
3673
return yi
3674
3674
3675
- def ksdensity ( dataset , bw_method = None ):
3675
+ class gaussian_kde ( object ):
3676
3676
"""
3677
3677
Representation of a kernel-density estimate using Gaussian kernels.
3678
3678
3679
3679
Call signature::
3680
- kde_dict = ksdensity (dataset, 'silverman')
3680
+ kde = gaussian_kde (dataset, 'silverman')
3681
3681
3682
3682
Parameters
3683
3683
----------
@@ -3692,10 +3692,10 @@ def ksdensity(dataset, bw_method=None):
3692
3692
Attributes
3693
3693
----------
3694
3694
dataset : ndarray
3695
- The dataset with which `ksdensity ` was initialized.
3696
- d : int
3695
+ The dataset with which `gaussian_kde ` was initialized.
3696
+ dim : int
3697
3697
Number of dimensions.
3698
- n : int
3698
+ num_dp : int
3699
3699
Number of datapoints.
3700
3700
factor : float
3701
3701
The bandwidth factor, obtained from `kde.covariance_factor`, with which
@@ -3706,117 +3706,135 @@ def ksdensity(dataset, bw_method=None):
3706
3706
inv_cov : ndarray
3707
3707
The inverse of `covariance`.
3708
3708
3709
- Returns
3709
+ Methods
3710
3710
-------
3711
- A dictionary mapping each various aspects of the computed KDE.
3712
- The dictionary has the following keys:
3713
-
3714
- xmin : number
3715
- The min of the input dataset
3716
- xmax : number
3717
- The max of the input dataset
3718
- mean : number
3719
- The mean of the result
3720
- median: number
3721
- The median of the result
3722
- result: (# of points,)-array
3723
- The array of the evaluated PDF estimation
3724
-
3725
- Raises
3726
- ------
3727
- ValueError : if the dimensionality of the input points is different than
3728
- the dimensionality of the KDE.
3711
+ kde.evaluate(points) : ndarray
3712
+ Evaluate the estimated pdf on a provided set of points.
3713
+ kde(points) : ndarray
3714
+ Same as kde.evaluate(points)
3715
+ kde.set_bandwidth(bw_method='scott') : None
3716
+ Computes the bandwidth, i.e. the coefficient that multiplies the data
3717
+ covariance matrix to obtain the kernel covariance matrix.
3718
+ .. versionadded:: 0.11.0
3719
+ kde.covariance_factor : float
3720
+ Computes the coefficient (`kde.factor`) that multiplies the data
3721
+ covariance matrix to obtain the kernel covariance matrix.
3722
+ The default is `scotts_factor`. A subclass can overwrite this method
3723
+ to provide a different method, or set it through a call to
3724
+ `kde.set_bandwidth`.
3729
3725
3730
3726
"""
3731
3727
3732
3728
# This implementation with minor modification was too good to pass up.
3733
3729
# from scipy: https://github.com/scipy/scipy/blob/master/scipy/stats/kde.py
3734
3730
3735
- dataset = np .array (np .atleast_2d (dataset ))
3736
- xmin = dataset .min ()
3737
- xmax = dataset .max ()
3731
+ def __init__ (self , dataset , bw_method = None ):
3732
+ self .dataset = np .atleast_2d (dataset )
3733
+ if not self .dataset .size > 1 :
3734
+ raise ValueError ("`dataset` input should have multiple elements." )
3738
3735
3739
- if not dataset . size > 1 :
3740
- raise ValueError ( "`dataset` input should have multiple elements." )
3736
+ self . dim , self . num_dp = self . dataset . shape
3737
+ self . set_bandwidth ( bw_method = bw_method )
3741
3738
3742
- dim , num_dp = dataset .shape
3739
+ def scotts_factor (self ):
3740
+ return np .power (self .num_dp , - 1. / (self .dim + 4 ))
3743
3741
3744
- # ----------------------------------------------
3745
- # Set Bandwidth, defaulted to Scott's Factor
3746
- # ----------------------------------------------
3747
- scotts_factor = lambda : np .power (num_dp , - 1. / (dim + 4 ))
3748
- silverman_factor = lambda : np .power (num_dp * (dim + 2.0 )/ 4.0 , - 1. / (dim + 4 ))
3742
+ def silverman_factor (self ):
3743
+ return np .power (self .num_dp * (self .dim + 2.0 )/ 4.0 , - 1. / (self .dim + 4 ))
3749
3744
3750
- # Default method to calculate bandwidth, can be overwritten by subclass
3745
+ # Default method to calculate bandwidth, can be overwritten by subclass
3751
3746
covariance_factor = scotts_factor
3752
3747
3753
- if bw_method is None :
3754
- pass
3755
- elif bw_method == 'scott' :
3756
- covariance_factor = scotts_factor
3757
- elif bw_method == 'silverman' :
3758
- covariance_factor = silverman_factor
3759
- elif np .isscalar (bw_method ) and not isinstance (bw_method , six .string_types ):
3760
- covariance_factor = lambda : bw_method
3761
- else :
3762
- msg = "`bw_method` should be 'scott', 'silverman', or a scalar"
3763
- raise ValueError (msg )
3764
-
3765
- # ---------------------------------------------------------------
3766
- # Computes covariance matrix for each Gaussian kernel with factor
3767
- # ---------------------------------------------------------------
3768
- factor = covariance_factor ()
3769
-
3770
- # Cache covariance and inverse covariance of the data
3771
- data_covariance = np .atleast_2d (np .cov (dataset , rowvar = 1 , bias = False ))
3772
- data_inv_cov = np .linalg .inv (data_covariance )
3773
-
3774
- covariance = data_covariance * factor ** 2
3775
- inv_cov = data_inv_cov / factor ** 2
3776
- norm_factor = np .sqrt (np .linalg .det (2 * np .pi * covariance )) * num_dp
3777
-
3778
- # ----------------------------------------------
3779
- # Evaluate the estimated pdf on a set of points.
3780
- # ----------------------------------------------
3781
- points = np .atleast_2d (np .arange (xmin , xmax , (xmax - xmin )/ 100. ))
3782
-
3783
- dim_pts , num_dp_pts = np .array (points ).shape
3784
- if dim_pts != dim :
3785
- if dim_pts == 1 and num_dp_pts == num_dp :
3786
- # points was passed in as a row vector
3787
- points = np .reshape (points , (dim , 1 ))
3788
- num_dp_pts = 1
3748
+ def set_bandwidth (self , bw_method = None ):
3749
+ if bw_method is None :
3750
+ pass
3751
+ elif bw_method == 'scott' :
3752
+ self .covariance_factor = self .scotts_factor
3753
+ elif bw_method == 'silverman' :
3754
+ self .covariance_factor = self .silverman_factor
3755
+ elif np .isscalar (bw_method ) and not isinstance (bw_method , six .string_types ):
3756
+ self ._bw_method = 'use constant'
3757
+ self .covariance_factor = lambda : bw_method
3758
+ elif callable (bw_method ):
3759
+ self ._bw_method = bw_method
3760
+ self .covariance_factor = lambda : self ._bw_method (self )
3789
3761
else :
3790
- msg = "points have dimension %s, \
3791
- dataset has dimension %s" % ( dim_pts , dim )
3762
+ msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
3763
+ "or a callable."
3792
3764
raise ValueError (msg )
3793
3765
3794
- result = np . zeros (( num_dp_pts ,), dtype = np . float )
3766
+ self . _compute_covariance ( )
3795
3767
3796
- if num_dp_pts >= num_dp :
3797
- # there are more points than data, so loop over data
3798
- for i in range (num_dp ):
3799
- diff = dataset [:, i , np .newaxis ] - points
3800
- tdiff = np .dot (inv_cov , diff )
3801
- energy = np .sum (diff * tdiff , axis = 0 ) / 2.0
3802
- result = result + np .exp (- energy )
3803
- else :
3804
- # loop over points
3805
- for i in range (num_dp_pts ):
3806
- diff = dataset - points [:, i , np .newaxis ]
3807
- tdiff = np .dot (inv_cov , diff )
3808
- energy = np .sum (diff * tdiff , axis = 0 ) / 2.0
3809
- result [i ] = np .sum (np .exp (- energy ), axis = 0 )
3810
-
3811
- result = result / norm_factor
3812
-
3813
- return {
3814
- 'xmin' : xmin ,
3815
- 'xmax' : xmax ,
3816
- 'mean' : np .mean (dataset ),
3817
- 'median' : np .median (dataset ),
3818
- 'result' : result
3819
- }
3768
+ def _compute_covariance (self ):
3769
+ """Computes the covariance matrix for each Gaussian kernel using
3770
+ covariance_factor().
3771
+ """
3772
+ self .factor = self .covariance_factor ()
3773
+ # Cache covariance and inverse covariance of the data
3774
+ if not hasattr (self , '_data_inv_cov' ):
3775
+ self ._data_covariance = np .atleast_2d (np .cov (self .dataset , rowvar = 1 ,
3776
+ bias = False ))
3777
+ self ._data_inv_cov = np .linalg .inv (self ._data_covariance )
3778
+
3779
+ self .covariance = self ._data_covariance * self .factor ** 2
3780
+ self .inv_cov = self ._data_inv_cov / self .factor ** 2
3781
+ self ._norm_factor = np .sqrt (np .linalg .det (2 * np .pi * self .covariance )) * self .num_dp
3782
+
3783
+ def evaluate (self , points ):
3784
+ """Evaluate the estimated pdf on a set of points.
3785
+
3786
+ Parameters
3787
+ ----------
3788
+ points : (# of dimensions, # of points)-array
3789
+ Alternatively, a (# of dimensions,) vector can be passed in and
3790
+ treated as a single point.
3791
+
3792
+ Returns
3793
+ -------
3794
+ values : (# of points,)-array
3795
+ The values at each point.
3796
+
3797
+ Raises
3798
+ ------
3799
+ ValueError : if the dimensionality of the input points is different than
3800
+ the dimensionality of the KDE.
3801
+
3802
+ """
3803
+ points = np .atleast_2d (points )
3804
+
3805
+ d , m = points .shape
3806
+ if d != self .dim :
3807
+ if d == 1 and m == self .dim :
3808
+ # points was passed in as a row vector
3809
+ points = np .reshape (points , (self .dim , 1 ))
3810
+ m = 1
3811
+ else :
3812
+ msg = "points have dimension %s, dataset has dimension %s" % (d ,
3813
+ self .dim )
3814
+ raise ValueError (msg )
3815
+
3816
+ result = np .zeros ((m ,), dtype = np .float )
3817
+
3818
+ if m >= self .num_dp :
3819
+ # there are more points than data, so loop over data
3820
+ for i in range (self .num_dp ):
3821
+ diff = self .dataset [:, i , np .newaxis ] - points
3822
+ tdiff = np .dot (self .inv_cov , diff )
3823
+ energy = np .sum (diff * tdiff ,axis = 0 ) / 2.0
3824
+ result = result + np .exp (- energy )
3825
+ else :
3826
+ # loop over points
3827
+ for i in range (m ):
3828
+ diff = self .dataset - points [:, i , np .newaxis ]
3829
+ tdiff = np .dot (self .inv_cov , diff )
3830
+ energy = np .sum (diff * tdiff , axis = 0 ) / 2.0
3831
+ result [i ] = np .sum (np .exp (- energy ), axis = 0 )
3832
+
3833
+ result = result / self ._norm_factor
3834
+
3835
+ return result
3836
+
3837
+ __call__ = evaluate
3820
3838
3821
3839
##################################################
3822
3840
# Code related to things in and around polygons
0 commit comments