Commit b1afe65

DOC: more updates
1 parent aa52a1d commit b1afe65

1 file changed: 61 additions & 33 deletions

galleries/examples/statistics/histogram_normalization.py

@@ -86,8 +86,8 @@
 
 fig, ax = plt.subplots()
 ax.hist(xdata, bins=xbins, density=True, **style)
-ax.set_ylabel('Probability (per dx)')
-ax.set_xlabel('x bins (dx=0.5)')
+ax.set_ylabel('Probability density [$V^{-1}$]')
+ax.set_xlabel('x bins (dx=0.5 $V$)')
 
 # %%
 # This normalization can be a little hard to interpret when just exploring the
@@ -115,32 +115,58 @@
 pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
 
 # %%
-# to make the point very obvious, consider bins that do not have the same
-# spacing. By normalizing by density, we preserve the shape of the
-# distribution, whereas if we do not, then the wider bins have much higher
-# values than the thin bins:
+# If we don't use ``density=True``, we need to scale the expected probability
+# distribution function by both the length of the data and the width of the
+# bins:
+
+fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
+dx = 0.1
+xbins = np.arange(-4, 4, dx)
+ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
+
+# scale and plot the expected pdf:
+ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
+ax['False'].set_ylabel('Count per bin')
+ax['False'].set_xlabel('x bins [V]')
+ax['False'].legend()
+
+ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
+ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
+ax['True'].set_ylabel('Probability density [$V^{-1}$]')
+ax['True'].set_xlabel('x bins [$V$]')
+ax['True'].legend()
+
+# %%
+# One advantage of using the density is therefore that the shape and amplitude
+# of the histogram does not depend on the size of the bins. Consider an
+# extreme case where the bins do not have the same width. In this example, the
+# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
+# normalizing by density, we preserve the shape of the distribution, whereas if
+# we do not, then the wider bins have much higher counts than the thinner bins:
 
 fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
 dx = 0.1
 xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
-ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
+ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
+ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
 ax['False'].set_ylabel('Count per bin')
-ax['False'].set_xlabel('x bins (below -1.25 bins are wider)')
+ax['False'].set_xlabel('x bins [V]')
+ax['False'].legend()
 
-ax['True'].hist(xdata, bins=xbins, density=True, histtype='step')
-ax['True'].plot(xpdf, pdf)
-ax['True'].set_ylabel('Probability (per dx)')
-ax['True'].set_xlabel('x bins (below -1.25 bins are wider)')
+ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
+ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
+ax['True'].set_ylabel('Probability density [$V^{-1}$]')
+ax['True'].set_xlabel('x bins [$V$]')
+ax['True'].legend()
 
 # %%
-# Using *density* also makes it easier to compare histograms with different bin
-# widths. Note that in order to get the theoretical distribution, we must
-# multiply the distribution by the number of data points and the bin width
+# Similarly, if we want to compare histograms with different bin widths, we may
+# want to use ``density=True``:
 
 fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
 
 # expected PDF
-ax['True'].plot(xpdf, pdf, '--', label='PDF', color='k')
+ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
 
 for nn, dx in enumerate([0.1, 0.4, 1.2]):
     xbins = np.arange(-4, 4, dx)
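
As a rough numerical check of the scaling described in the hunk above, here is a minimal sketch (not taken from the example file): it assumes plain NumPy, np.histogram rather than Axes.hist, and an arbitrary seed, and verifies that density=True is just the raw counts rescaled by the total count and the bin width.

    import numpy as np

    rng = np.random.default_rng(19680801)  # arbitrary seed, for reproducibility
    xdata = rng.normal(size=1000)

    dx = 0.1
    xbins = np.arange(-4, 4, dx)
    counts, edges = np.histogram(xdata, bins=xbins, density=False)
    density, _ = np.histogram(xdata, bins=xbins, density=True)

    # density is the raw counts divided by (total count in range) * (bin width);
    # the total count equals len(xdata) whenever no sample falls outside the bins
    assert np.allclose(density, counts / (counts.sum() * dx))

    # consequently the density integrates to one over the binned range
    print(np.sum(density * np.diff(edges)))  # ~1.0
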
@@ -151,33 +177,35 @@
     ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
 
 # Labels:
-ax['False'].set_xlabel('x bins')
+ax['False'].set_xlabel('x bins [$V$]')
 ax['False'].set_ylabel('Count per bin')
-ax['True'].set_ylabel('Probability (per dx)')
-ax['True'].set_xlabel('x bins')
-ax['True'].legend(fontsize='small')
+ax['True'].set_ylabel('Probability density [$V^{-1}$]')
+ax['True'].set_xlabel('x bins [$V$]')
+ax['True'].legend(fontsize='small', title='bin width:')
 
 # %%
-
 # Sometimes people want to normalize so that the sum of counts is one. This is
-# not done with the *density* kwarg, but rather we can get this effects if we
-# set the *weights* to 1/N. Note, however, that the amplitude of the histogram
-# still depends on width of the bins:
+# analogous to a `probability mass function
+# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
+# variable where the sum of probabilities for all the values equals one. Using
+# ``hist``, we can get this normalization if we set the *weights* to 1/N.
+# Note that the amplitude of this normalized histogram still depends on
+# width and/or number of the bins:
 
 fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
 
 for nn, dx in enumerate([0.1, 0.4, 1.2]):
     xbins = np.arange(-4, 4, dx)
     ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
             histtype='step', label=f'{dx}')
-ax.set_xlabel('x bins')
+ax.set_xlabel('x bins [$V$]')
 ax.set_ylabel('Bin count / N')
-ax.legend(fontsize='small')
+ax.legend(fontsize='small', title='bin width:')
 
 # %%
-# The true value of normalizing is if you do want to compare two distributions
-# that have different sized populations. Here we compare the distribution of
-# ``xdata`` with a population of 1000, and ``xdata2`` with 100 members.
+# The value of normalizing histograms is comparing two distributions that have
+# different sized populations. Here we compare the distribution of ``xdata``
+# with a population of 1000, and ``xdata2`` with 100 members.
 
 xdata2 = rng.normal(size=100)
 
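The weights-based normalization discussed in the hunk above can be sketched the same way. This is an illustrative snippet only (again np.histogram and an arbitrary seed): the bar heights sum to one, like a probability mass function, but the height of an individual bar still grows with the bin width.

    import numpy as np

    rng = np.random.default_rng(19680801)  # arbitrary seed
    xdata = rng.normal(size=1000)

    for dx in (0.1, 0.4, 1.2):
        xbins = np.arange(-4, 4, dx)
        # weights of 1/N make every sample contribute 1/N to its bin
        frac, _ = np.histogram(xdata, bins=xbins,
                               weights=np.ones(len(xdata)) / len(xdata))
        # the heights sum to (at most) one, but the tallest bar grows with dx
        print(f'dx={dx}: sum={frac.sum():.3f}, max height={frac.max():.3f}')
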
@@ -189,22 +217,22 @@
 ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
 ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
 ax['no_norm'].set_ylabel('Counts')
-ax['no_norm'].set_xlabel('x bins')
+ax['no_norm'].set_xlabel('x bins [$V$]')
 ax['no_norm'].set_title('No normalization')
 
 ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
 ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
-ax['density'].set_ylabel('Probability (per dx)')
+ax['density'].set_ylabel('Probability density [$V^{-1}$]')
 ax['density'].set_title('Density=True')
-ax['density'].set_xlabel('x bins')
+ax['density'].set_xlabel('x bins [$V$]')
 
 ax['weight'].hist(xdata, bins=xbins, histtype='step',
                   weights=1 / len(xdata) * np.ones(len(xdata)),
                   label='N=1000')
 ax['weight'].hist(xdata2, bins=xbins, histtype='step',
                   weights=1 / len(xdata2) * np.ones(len(xdata2)),
                   label='N=100')
-ax['weight'].set_xlabel('x bins')
+ax['weight'].set_xlabel('x bins [$V$]')
 ax['weight'].set_ylabel('Counts / N')
 ax['weight'].legend(fontsize='small')
 ax['weight'].set_title('Weight = 1/N')

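Finally, the point of the last hunk, that density or 1/N weights put samples of different sizes on a common scale, can be checked numerically. A minimal sketch, assuming np.histogram and sample sizes matching the example (1000 and 100):

    import numpy as np

    rng = np.random.default_rng(19680801)  # arbitrary seed
    xdata = rng.normal(size=1000)
    xdata2 = rng.normal(size=100)

    xbins = np.arange(-4, 4, 0.4)
    for data in (xdata, xdata2):
        density, edges = np.histogram(data, bins=xbins, density=True)
        frac, _ = np.histogram(data, bins=xbins,
                               weights=np.ones(len(data)) / len(data))
        # both normalizations total ~1 regardless of the sample size, so the
        # two histograms are directly comparable
        print(f'N={len(data)}: density integral = '
              f'{np.sum(density * np.diff(edges)):.2f}, weighted sum = {frac.sum():.2f}')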