@@ -86,8 +86,8 @@
 
 fig, ax = plt.subplots()
 ax.hist(xdata, bins=xbins, density=True, **style)
-ax.set_ylabel('Probability (per dx)')
-ax.set_xlabel('x bins (dx=0.5)')
+ax.set_ylabel('Probability density [$V^{-1}$]')
+ax.set_xlabel('x bins (dx=0.5 $V$)')
 
 # %%
 # This normalization can be a little hard to interpret when just exploring the
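A quick way to see what the new ``Probability density [$V^{-1}$]`` label means: with ``density=True`` the bar *areas*, not the bar heights, sum to one. A minimal standalone check (the seed and sample size here are illustrative assumptions; the script defines its own ``xdata`` and ``xbins``):

import numpy as np

rng = np.random.default_rng(0)   # assumed seed, for reproducibility
xdata = rng.normal(size=1000)    # standard normal draws, as in the script
xbins = np.arange(-4, 4, 0.5)    # dx = 0.5, matching the xlabel above
density, edges = np.histogram(xdata, bins=xbins, density=True)
# height times width, summed over the bins, is one by construction:
print(np.sum(density * np.diff(edges)))  # -> 1.0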
@@ -115,32 +115,58 @@
 pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
 
 # %%
-# to make the point very obvious, consider bins that do not have the same
-# spacing. By normalizing by density, we preserve the shape of the
-# distribution, whereas if we do not, then the wider bins have much higher
-# values than the thin bins:
+# If we don't use ``density=True``, we need to scale the expected probability
+# distribution function by both the length of the data and the width of the
+# bins:
+
+fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
+dx = 0.1
+xbins = np.arange(-4, 4, dx)
+ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
+
+# scale and plot the expected pdf:
+ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
+ax['False'].set_ylabel('Count per bin')
+ax['False'].set_xlabel('x bins [$V$]')
+ax['False'].legend()
+
+ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
+ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
+ax['True'].set_ylabel('Probability density [$V^{-1}$]')
+ax['True'].set_xlabel('x bins [$V$]')
+ax['True'].legend()
+
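The scaling in the ``False`` panel can also be checked numerically: the raw counts fluctuate around N * f_X(x) * dx. A sketch under the same illustrative assumptions as above (seed and sizes are not from the script):

import numpy as np

rng = np.random.default_rng(0)
xdata = rng.normal(size=1000)
dx = 0.1
xbins = np.arange(-4, 4, dx)
counts, edges = np.histogram(xdata, bins=xbins)

centers = edges[:-1] + dx / 2
pdf = np.exp(-centers**2 / 2) / np.sqrt(2 * np.pi)  # standard normal f_X
expected = len(xdata) * pdf * dx                    # N * f_X(x) * dx
# the totals agree to within sampling noise:
print(counts.sum(), expected.sum())                 # both ~1000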
+# %%
+# One advantage of using the density is therefore that the shape and amplitude
+# of the histogram do not depend on the size of the bins. Consider an
+# extreme case where the bins do not have the same width. In this example, the
+# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
+# normalizing by density, we preserve the shape of the distribution, whereas if
+# we do not, then the wider bins have much higher counts than the thinner bins:
 
 fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
 dx = 0.1
 xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
-ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
+ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
+ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
 ax['False'].set_ylabel('Count per bin')
-ax['False'].set_xlabel('x bins (below -1.25 bins are wider)')
+ax['False'].set_xlabel('x bins [$V$]')
+ax['False'].legend()
 
-ax['True'].hist(xdata, bins=xbins, density=True, histtype='step')
-ax['True'].plot(xpdf, pdf)
-ax['True'].set_ylabel('Probability (per dx)')
-ax['True'].set_xlabel('x bins (below -1.25 bins are wider)')
+ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
+ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
+ax['True'].set_ylabel('Probability density [$V^{-1}$]')
+ax['True'].set_xlabel('x bins [$V$]')
+ax['True'].legend()
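The claim in this hunk is easy to verify: even with the mixed bin widths, ``density=True`` keeps the total area at one, because wider bins get proportionally lower heights. A standalone check (seed and size assumed, as above):

import numpy as np

rng = np.random.default_rng(0)
xdata = rng.normal(size=1000)
dx = 0.1
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
density, edges = np.histogram(xdata, bins=xbins, density=True)
# the area still integrates to one despite the unequal widths:
print(np.sum(density * np.diff(edges)))  # -> 1.0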
 
 # %%
-# Using *density* also makes it easier to compare histograms with different bin
-# widths. Note that in order to get the theoretical distribution, we must
-# multiply the distribution by the number of data points and the bin width
+# Similarly, if we want to compare histograms with different bin widths, we may
+# want to use ``density=True``:
 
 fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
 
 # expected PDF
-ax['True'].plot(xpdf, pdf, '--', label='PDF', color='k')
+ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
 
 for nn, dx in enumerate([0.1, 0.4, 1.2]):
     xbins = np.arange(-4, 4, dx)

@@ -151,33 +177,35 @@
     ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
 
 # Labels:
-ax['False'].set_xlabel('x bins')
+ax['False'].set_xlabel('x bins [$V$]')
 ax['False'].set_ylabel('Count per bin')
-ax['True'].set_ylabel('Probability (per dx)')
-ax['True'].set_xlabel('x bins')
-ax['True'].legend(fontsize='small')
+ax['True'].set_ylabel('Probability density [$V^{-1}$]')
+ax['True'].set_xlabel('x bins [$V$]')
+ax['True'].legend(fontsize='small', title='bin width:')
 
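One concrete consequence of using ``density=True`` here: the peak height stays near f_X(0) = 1/sqrt(2*pi) ~ 0.40 for every bin width, instead of growing with dx. A sketch under the same assumed seed and size:

import numpy as np

rng = np.random.default_rng(0)
xdata = rng.normal(size=1000)
for dx in [0.1, 0.4, 1.2]:
    xbins = np.arange(-4, 4, dx)
    density, _ = np.histogram(xdata, bins=xbins, density=True)
    # stays near 0.40 (up to sampling noise and bin placement) for all
    # three bin widths:
    print(dx, density.max())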
 # %%
-
 # Sometimes people want to normalize so that the sum of counts is one. This is
-# not done with the *density* kwarg, but rather we can get this effects if we
-# set the *weights* to 1/N. Note, however, that the amplitude of the histogram
-# still depends on width of the bins:
+# analogous to a `probability mass function
+# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
+# variable where the sum of probabilities for all the values equals one. Using
+# ``hist``, we can get this normalization if we set the *weights* to 1/N.
+# Note that the amplitude of this normalized histogram still depends on the
+# width and/or number of the bins:
 
 fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
 
 for nn, dx in enumerate([0.1, 0.4, 1.2]):
     xbins = np.arange(-4, 4, dx)
     ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
             histtype='step', label=f'{dx}')
-ax.set_xlabel('x bins')
+ax.set_xlabel('x bins [$V$]')
 ax.set_ylabel('Bin count / N')
-ax.legend(fontsize='small')
+ax.legend(fontsize='small', title='bin width:')
 
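The comment above can be checked directly: with weights of 1/N the bar heights sum to (approximately) one whatever the bin width, but the typical height still grows with dx. A sketch under the same illustrative assumptions:

import numpy as np

rng = np.random.default_rng(0)
xdata = rng.normal(size=1000)
weights = np.ones(len(xdata)) / len(xdata)
for dx in [0.1, 0.4, 1.2]:
    xbins = np.arange(-4, 4, dx)
    counts, _ = np.histogram(xdata, bins=xbins, weights=weights)
    # the sum is ~1 for every dx; the max height scales roughly with dx:
    print(dx, counts.sum(), counts.max())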
 # %%
-# The true value of normalizing is if you do want to compare two distributions
-# that have different sized populations. Here we compare the distribution of
-# ``xdata`` with a population of 1000, and ``xdata2`` with 100 members.
+# The value of normalizing histograms lies in comparing two distributions that
+# have different-sized populations. Here we compare the distribution of
+# ``xdata``, with a population of 1000, and ``xdata2``, with 100 members.
 
 xdata2 = rng.normal(size=100)
 

@@ -189,22 +217,22 @@
 ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
 ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
 ax['no_norm'].set_ylabel('Counts')
-ax['no_norm'].set_xlabel('x bins')
+ax['no_norm'].set_xlabel('x bins [$V$]')
 ax['no_norm'].set_title('No normalization')
 
 ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
 ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
-ax['density'].set_ylabel('Probability (per dx)')
+ax['density'].set_ylabel('Probability density [$V^{-1}$]')
 ax['density'].set_title('Density=True')
-ax['density'].set_xlabel('x bins')
+ax['density'].set_xlabel('x bins [$V$]')
 
 ax['weight'].hist(xdata, bins=xbins, histtype='step',
                   weights=1 / len(xdata) * np.ones(len(xdata)),
                   label='N=1000')
 ax['weight'].hist(xdata2, bins=xbins, histtype='step',
                   weights=1 / len(xdata2) * np.ones(len(xdata2)),
                   label='N=100')
-ax['weight'].set_xlabel('x bins')
+ax['weight'].set_xlabel('x bins [$V$]')
 ax['weight'].set_ylabel('Counts / N')
 ax['weight'].legend(fontsize='small')
 ax['weight'].set_title('Weight = 1/N')
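For the two-population comparison, either normalization puts the histograms on a common scale; raw counts would differ by roughly a factor of ten. A standalone sketch (seed, sizes, and the 0.25 bin width are assumptions; the script's own ``xbins`` for this figure is defined in lines not shown in this diff):

import numpy as np

rng = np.random.default_rng(0)
xdata = rng.normal(size=1000)
xdata2 = rng.normal(size=100)
xbins = np.arange(-4, 4, 0.25)
d1, _ = np.histogram(xdata, bins=xbins, density=True)
d2, _ = np.histogram(xdata2, bins=xbins, density=True)
# similar heights near 0.4 despite the 10x population gap, with larger
# sampling noise for the smaller sample:
print(d1.max(), d2.max())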