|
1 | 1 | """
2 |  | -.. redirect-from:: /gallery/statistics/histogram_features
| 2 | +======================= |
| 3 | +Histogram normalization |
| 4 | +======================= |
3 | 5 |
4 |  | -===================================
5 |  | -Histogram bins, density, and weight
6 |  | -===================================
| 6 | +Histogram normalization rescales data into probabilities and therefore is a popular |
| 7 | +technique for comparing populations of different sizes or histograms computed using |
| 8 | +different bin edges. For more information on using `.Axes.hist` see |
| 9 | +:ref:`histogram_features`. |
7 | 10 |
8 |  | -The `.Axes.hist` method can flexibly create histograms in a few different ways,
9 |  | -which is flexible and helpful, but can also lead to confusion. In particular,
10 |  | -you can:
| 11 | +Irregularly spaced bins |
| 12 | +----------------------- |
| 13 | +In this example, the bins below ``x=-1.25`` are six times wider than the rest of the |
| 14 | +bins :: |
11 | 15 |
12 |  | -- bin the data as you want, either with an automatically chosen number of
13 |  | -  bins, or with fixed bin edges,
14 |  | -- normalize the histogram so that its integral is one,
15 |  | -- and assign weights to the data points, so that each data point affects the
16 |  | -  count in its bin differently.
| 16 | +    dx = 0.1
| 17 | +    xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
17 | 18 |
18 |  | -The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results,
19 |  | -therefore users should consult the numpy documentation for a definitive guide.
20 |  | -
21 |  | -Histograms are created by defining bin edges, and taking a dataset of values
22 |  | -and sorting them into the bins, and counting or summing how much data is in
23 |  | -each bin. In this simple example, 9 numbers between 1 and 4 are sorted into 3
24 |  | -bins:
| 19 | +By normalizing by density, we preserve the shape of the distribution, whereas if we do |
| 20 | +not, then the wider bins have much higher counts than the thinner bins: |
25 | 21 | """
26 | 22 |
27 | 23 | import matplotlib.pyplot as plt
28 | 24 | import numpy as np
29 | 25 |
30 | 26 | rng = np.random.default_rng(19680801)
31 | 27 |
32 |  | -xdata = np.array([1.2, 2.3, 3.3, 3.1, 1.7, 3.4, 2.1, 1.25, 1.3])
33 |  | -xbins = np.array([1, 2, 3, 4])
34 |  | -
35 |  | -# changing the style of the histogram bars just to make it
36 |  | -# very clear where the boundaries of the bins are:
37 |  | -style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
38 |  | -
39 |  | -fig, ax = plt.subplots()
40 |  | -ax.hist(xdata, bins=xbins, **style)
41 |  | -
42 |  | -# plot the xdata locations on the x axis:
43 |  | -ax.plot(xdata, 0*xdata, 'd')
44 |  | -ax.set_ylabel('Number per bin')
45 |  | -ax.set_xlabel('x bins (dx=1.0)')
46 |  | -
47 |  | -# %%
48 |  | -# Modifying bins
49 |  | -# ==============
50 |  | -#
51 |  | -# Changing the bin size changes the shape of this sparse histogram, so its a
52 |  | -# good idea to choose bins with some care with respect to your data. Here we
53 |  | -# make the bins half as wide.
54 |  | -
55 |  | -xbins = np.arange(1, 4.5, 0.5)
56 |  | -
57 |  | -fig, ax = plt.subplots()
58 |  | -ax.hist(xdata, bins=xbins, **style)
59 |  | -ax.plot(xdata, 0*xdata, 'd')
60 |  | -ax.set_ylabel('Number per bin')
61 |  | -ax.set_xlabel('x bins (dx=0.5)')
62 |  | -
63 |  | -# %%
64 |  | -# We can also let numpy (via Matplotlib) choose the bins automatically, or
65 |  | -# specify a number of bins to choose automatically:
66 |  | -
67 |  | -fig, ax = plt.subplot_mosaic([['auto', 'n4']],
68 |  | -                             sharex=True, sharey=True, layout='constrained')
69 |  | -
70 |  | -ax['auto'].hist(xdata, **style)
71 |  | -ax['auto'].plot(xdata, 0*xdata, 'd')
72 |  | -ax['auto'].set_ylabel('Number per bin')
73 |  | -ax['auto'].set_xlabel('x bins (auto)')
74 |  | -
75 |  | -ax['n4'].hist(xdata, bins=4, **style)
76 |  | -ax['n4'].plot(xdata, 0*xdata, 'd')
77 |  | -ax['n4'].set_xlabel('x bins ("bins=4")')
78 |  | -
79 |  | -# %%
80 |  | -# Normalizing histograms: density and weight
81 |  | -# ==========================================
82 |  | -#
83 |  | -# Counts-per-bin is the default length of each bar in the histogram. However,
84 |  | -# we can also normalize the bar lengths as a probability density function using
85 |  | -# the ``density`` parameter:
86 |  | -
87 |  | -fig, ax = plt.subplots()
88 |  | -ax.hist(xdata, bins=xbins, density=True, **style)
89 |  | -ax.set_ylabel('Probability density [$V^{-1}$])')
90 |  | -ax.set_xlabel('x bins (dx=0.5 $V$)')
91 |  | -
92 |  | -# %%
93 |  | -# This normalization can be a little hard to interpret when just exploring the
94 |  | -# data. The value attached to each bar is divided by the total number of data
95 |  | -# points *and* the width of the bin, and thus the values _integrate_ to one
96 |  | -# when integrating across the full range of data.
97 |  | -# e.g. ::
98 |  | -#
99 |  | -#     density = counts / (sum(counts) * np.diff(bins))
100 |  | -#     np.sum(density * np.diff(bins)) == 1
101 |  | -#
102 |  | -# This normalization is how `probability density functions
103 |  | -# <https://en.wikipedia.org/wiki/Probability_density_function>`_ are defined in
104 |  | -# statistics. If :math:`X` is a random variable on :math:`x`, then :math:`f_X`
105 |  | -# is is the probability density function if :math:`P[a<X<b] = \int_a^b f_X dx`.
106 |  | -# If the units of x are Volts, then the units of :math:`f_X` are :math:`V^{-1}`
107 |  | -# or probability per change in voltage.
108 |  | -#
109 |  | -# The usefulness of this normalization is a little more clear when we draw from
110 |  | -# a known distribution and try to compare with theory. So, choose 1000 points
111 |  | -# from a `normal distribution
112 |  | -# <https://en.wikipedia.org/wiki/Normal_distribution>`_, and also calculate the
113 |  | -# known probability density function:
114 |  | -
115 | 28 | xdata = rng.normal(size=1000)
116 | 29 | xpdf = np.arange(-4, 4, 0.1)
117 | 30 | pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
118 | 31 |
119 |  | -# %%
120 |  | -# If we don't use ``density=True``, we need to scale the expected probability
121 |  | -# distribution function by both the length of the data and the width of the
122 |  | -# bins:
| 32 | +dx = 0.1 |
| 33 | +xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)]) |
123 | 34 |
124 | 35 | fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
125 |  | -dx = 0.1
126 |  | -xbins = np.arange(-4, 4, dx)
127 |  | -ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
128 | 36 |
129 |  | -# scale and plot the expected pdf:
130 |  | -ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
131 |  | -ax['False'].set_ylabel('Count per bin')
132 |  | -ax['False'].set_xlabel('x bins [V]')
133 |  | -ax['False'].legend()
| 37 | +fig.suptitle("Histogram with irregularly spaced bins") |
| 38 | + |
| 39 | + |
| 40 | +ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts') |
| 41 | +ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$', |
| 42 | +                 alpha=.5)
134 | 43 |
135 | 44 | ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
136 |  | -ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
137 |  | -ax['True'].set_ylabel('Probability density [$V^{-1}$]')
138 |  | -ax['True'].set_xlabel('x bins [$V$]')
139 |  | -ax['True'].legend()
| 45 | +ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5) |
140 | 46 |
141 |  | -# %%
142 |  | -# One advantage of using the density is therefore that the shape and amplitude
143 |  | -# of the histogram does not depend on the size of the bins. Consider an
144 |  | -# extreme case where the bins do not have the same width. In this example, the
145 |  | -# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
146 |  | -# normalizing by density, we preserve the shape of the distribution, whereas if
147 |  | -# we do not, then the wider bins have much higher counts than the thinner bins:
148 | 47 |
149 |  | -fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
150 |  | -dx = 0.1
151 |  | -xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
152 |  | -ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
153 |  | -ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
154 |  | -ax['False'].set_ylabel('Count per bin')
155 |  | -ax['False'].set_xlabel('x bins [V]')
| 48 | +ax['False'].set(xlabel='x [V]', ylabel='Count per bin', title="density=False") |
| 49 | + |
| 50 | +# add the bin widths on the minor axes to highlight irregularity |
| 51 | +ax['False'].set_xticks(xbins, minor=True) |
156 | 52 | ax['False'].legend()
157 | 53 |
158 |  | -ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
159 |  | -ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
160 |  | -ax['True'].set_ylabel('Probability density [$V^{-1}$]')
161 |  | -ax['True'].set_xlabel('x bins [$V$]')
| 54 | +ax['True'].set(xlabel='x [$V$]', ylabel='Probability density [$V^{-1}$]', |
| 55 | +               title="density=True")
| 56 | +ax['True'].set_xticks(xbins, minor=True)
162 | 57 | ax['True'].legend()
163 | 58 |
| 59 | + |
164 | 60 | # %%
165 |  | -# Similarly, if we want to compare histograms with different bin widths, we may
166 |  | -# want to use ``density=True``:
| 61 | +# Different bin widths |
| 62 | +# -------------------- |
| 63 | +# |
| 64 | +# Here we use normalization to compare histograms with bin widths of 0.1, 0.4, and 1.2:
167 | 65 |
168 | 66 | fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
169 | 67 |
| 68 | +fig.suptitle("Comparing histograms with different bin widths") |
170 | 69 | # expected PDF
171 | 70 | ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
172 | 71 |
173 | 72 | for nn, dx in enumerate([0.1, 0.4, 1.2]):
174 | 73 |     xbins = np.arange(-4, 4, dx)
175 | 74 |     # expected histogram:
176 |  | -    ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
177 |  | -    ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
178 |  | -
179 |  | -    ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
| 75 | +    ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}', alpha=.5)
| 76 | +    ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label=dx)
180 | 77 |
181 |  | -# Labels:
182 |  | -ax['False'].set_xlabel('x bins [$V$]')
183 |  | -ax['False'].set_ylabel('Count per bin')
184 |  | -ax['True'].set_ylabel('Probability density [$V^{-1}$]')
185 |  | -ax['True'].set_xlabel('x bins [$V$]')
186 |  | -ax['True'].legend(fontsize='small', title='bin width:')
| 78 | +    ax['True'].hist(xdata, bins=xbins, density=True, histtype='step')
187 | 79 |
| 80 | +ax['False'].set(xlabel='x [$V$]', ylabel='Count per bin',
| 81 | +                title="density=False")
| 82 | +ax['True'].set(xlabel='x [$V$]', ylabel='Probability density [$V^{-1}$]',
| 83 | +               title='density=True')
| 84 | +ax['False'].legend(fontsize='small', title='bin width:') |
188 | 85 | # %%
189 |  | -# Sometimes people want to normalize so that the sum of counts is one. This is
190 |  | -# analogous to a `probability mass function
191 |  | -# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
192 |  | -# variable where the sum of probabilities for all the values equals one. Using
193 |  | -# ``hist``, we can get this normalization if we set the *weights* to 1/N.
194 |  | -# Note that the amplitude of this normalized histogram still depends on
195 |  | -# width and/or number of the bins:
| 86 | +# Populations of different sizes |
| 87 | +# ------------------------------ |
| 88 | +# |
| 89 | +# Here we compare the distribution of ``xdata`` with a population of 1000, and |
| 90 | +# ``xdata2`` with 100 members. We demonstrate using *density* to generate the |
| 91 | +# probability density function (`pdf`_) and *weight* to generate an analog to the
| 92 | +# probability mass function (`pmf`_). |
| 93 | +# |
| 94 | +# .. _pdf: https://en.wikipedia.org/wiki/Probability_density_function |
| 95 | +# .. _pmf: https://en.wikipedia.org/wiki/Probability_mass_function |
196 | 96 |
197 |  | -fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
| 97 | +xdata2 = rng.normal(size=100) |
198 | 98 |
199 |  | -for nn, dx in enumerate([0.1, 0.4, 1.2]):
200 |  | -    xbins = np.arange(-4, 4, dx)
201 |  | -    ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
202 |  | -            histtype='step', label=f'{dx}')
203 |  | -ax.set_xlabel('x bins [$V$]')
204 |  | -ax.set_ylabel('Bin count / N')
205 |  | -ax.legend(fontsize='small', title='bin width:')
| 99 | +fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']], layout='constrained') |
206 | 100 |
207 |  | -# %%
208 |  | -# The value of normalizing histograms is comparing two distributions that have
209 |  | -# different sized populations. Here we compare the distribution of ``xdata``
210 |  | -# with a population of 1000, and ``xdata2`` with 100 members.
| 101 | +fig.suptitle("Comparing histograms of populations of different sizes") |
211 | 102 |
212 |  | -xdata2 = rng.normal(size=100)
| 103 | +xbins = np.arange(-4, 4, 0.25) |
213 | 104 |
214 |  | -fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
215 |  | -                             layout='constrained', figsize=(8, 4))
| 105 | +for xd in [xdata, xdata2]: |
| 106 | +    ax['no_norm'].hist(xd, bins=xbins, histtype='step')
| 107 | +    ax['density'].hist(xd, bins=xbins, histtype='step', density=True)
| 108 | +    ax['weight'].hist(xd, bins=xbins, histtype='step', weights=np.ones(len(xd))/len(xd),
| 109 | +                      label=f'N={len(xd)}')
216 | 110 |
217 |  | -xbins = np.arange(-4, 4, 0.25)
218 | 111 |
219 |  | -ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
220 |  | -ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
221 |  | -ax['no_norm'].set_ylabel('Counts')
222 |  | -ax['no_norm'].set_xlabel('x bins [$V$]')
223 |  | -ax['no_norm'].set_title('No normalization')
224 |  | -
225 |  | -ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
226 |  | -ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
227 |  | -ax['density'].set_ylabel('Probability density [$V^{-1}$]')
228 |  | -ax['density'].set_title('Density=True')
229 |  | -ax['density'].set_xlabel('x bins [$V$]')
230 |  | -
231 |  | -ax['weight'].hist(xdata, bins=xbins, histtype='step',
232 |  | -                  weights=1 / len(xdata) * np.ones(len(xdata)),
233 |  | -                  label='N=1000')
234 |  | -ax['weight'].hist(xdata2, bins=xbins, histtype='step',
235 |  | -                  weights=1 / len(xdata2) * np.ones(len(xdata2)),
236 |  | -                  label='N=100')
237 |  | -ax['weight'].set_xlabel('x bins [$V$]')
238 |  | -ax['weight'].set_ylabel('Counts / N')
| 112 | +ax['no_norm'].set(xlabel='x [$V$]', ylabel='Counts', title='No normalization') |
| 113 | +ax['density'].set(xlabel='x [$V$]', |
| 114 | +                  ylabel='Probability density [$V^{-1}$]', title='Density=True')
| 115 | +ax['weight'].set(xlabel='x [$V$]', ylabel='Counts / N', title='Weight = 1/N')
| 116 | + |
239 | 117 | ax['weight'].legend(fontsize='small')
240 |  | -ax['weight'].set_title('Weight = 1/N')
241 | 118 |
242 | 119 | plt.show()
243 | 120 |
244 | 121 | # %%
245 | 122 | #
| 123 | +# .. tags:: plot-type: histogram |
| 124 | +# |
246 | 125 | # .. admonition:: References
247 | 126 | #
248 | 127 | # The use of the following functions, methods, classes and modules is shown
249 | 128 | # in this example:
250 | 129 | #
251 | 130 | # - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist`
252 |  | -# - `matplotlib.axes.Axes.set_title`
253 |  | -# - `matplotlib.axes.Axes.set_xlabel`
254 |  | -# - `matplotlib.axes.Axes.set_ylabel`
| 131 | +# - `matplotlib.axes.Axes.set` |
255 | 132 | # - `matplotlib.axes.Axes.legend`
| 133 | +#