Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit 9fdb0cd

Browse filesBrowse files
committed
doc: follow up for normalizing histogram
1 parent f2da1f0 commit 9fdb0cd
Copy full SHA for 9fdb0cd

File tree

Expand file treeCollapse file tree

2 files changed

+143
-74
lines changed
Filter options
Expand file treeCollapse file tree

2 files changed

+143
-74
lines changed

‎galleries/examples/statistics/histogram_normalization.py renamed to ‎galleries/tutorials/histogram_normalization.py

Copy file name to clipboardExpand all lines: galleries/tutorials/histogram_normalization.py
+141-74Lines changed: 141 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""
22
.. redirect-from:: /gallery/statistics/histogram_features
33
4+
.. _histogram_normalization:
5+
46
===================================
57
Histogram bins, density, and weight
68
===================================
@@ -34,60 +36,84 @@
3436

3537
# changing the style of the histogram bars just to make it
3638
# very clear where the boundaries of the bins are:
37-
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
39+
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3, 'alpha': .5}
40+
41+
fig, ax = plt.subplots(figsize=(6, 3))
3842

39-
fig, ax = plt.subplots()
43+
# count the number of values in xdata between each value in xbins
4044
ax.hist(xdata, bins=xbins, **style)
4145

4246
# plot the xdata locations on the x axis:
43-
ax.plot(xdata, 0*xdata, 'd')
44-
ax.set_ylabel('Number per bin')
45-
ax.set_xlabel('x bins (dx=1.0)')
47+
ax.stem(xdata, [.5]*len(xdata), 'd')
48+
49+
ax.set(ylabel='Number per bin', xlabel='x bins (dx=1.0)',
50+
title='histogram',)
4651

4752
# %%
4853
# Modifying bins
4954
# ==============
5055
#
5156
# Changing the bin size changes the shape of this sparse histogram, so it's a
52-
# good idea to choose bins with some care with respect to your data. Here we
53-
# make the bins half as wide.
57+
# good idea to choose bins with some care with respect to your data. The `.Axes.hist`
58+
# *bins* parameter accepts either the number of bins or a list of bin edges.
59+
#
60+
#
61+
# Set *bins* using fixed edges
62+
# -----------------------------
63+
#
64+
# Here the bins are set to the list of edges [1, 1.5, 2, 2.5, 3, 3.5, 4].
65+
# This is half as wide as the previous example.
5466

5567
xbins = np.arange(1, 4.5, 0.5)
5668

57-
fig, ax = plt.subplots()
69+
fig, ax = plt.subplots(figsize=(6, 3))
70+
5871
ax.hist(xdata, bins=xbins, **style)
59-
ax.plot(xdata, 0*xdata, 'd')
60-
ax.set_ylabel('Number per bin')
61-
ax.set_xlabel('x bins (dx=0.5)')
72+
73+
ax.stem(xdata, [.5]*len(xdata), 'd')
74+
75+
ax.set(ylabel='Number per bin', xlabel='x bins (dx=0.5)',
76+
title='fixed bin edges',)
6277

6378
# %%
79+
#
80+
# Set *bins* using number of bins
81+
# -------------------------------
82+
#
6483
# We can also let numpy (via Matplotlib) choose the bins automatically, or
6584
# specify a number of bins to choose automatically:
6685

67-
fig, ax = plt.subplot_mosaic([['auto', 'n4']],
68-
sharex=True, sharey=True, layout='constrained')
86+
fig, ax = plt.subplot_mosaic([['auto'], ['n4']],
87+
sharex=True, sharey=True,
88+
layout='constrained', figsize=(6, 6))
6989

7090
ax['auto'].hist(xdata, **style)
71-
ax['auto'].plot(xdata, 0*xdata, 'd')
72-
ax['auto'].set_ylabel('Number per bin')
73-
ax['auto'].set_xlabel('x bins (auto)')
91+
ax['auto'].stem(xdata, [.5]*len(xdata), 'd')
92+
93+
ax['auto'].set(ylabel='Number per bin', xlabel='x bins (auto)',
94+
title='dynamically computed bin edges')
7495

7596
ax['n4'].hist(xdata, bins=4, **style)
76-
ax['n4'].plot(xdata, 0*xdata, 'd')
77-
ax['n4'].set_xlabel('x bins ("bins=4")')
97+
ax['n4'].stem(xdata, [.5]*len(xdata), 'd')
98+
99+
ax['n4'].set(ylabel='Number per bin', xlabel='x bins ("bins=4")',
100+
title='fixed number of bins',)
78101

79102
# %%
80-
# Normalizing histograms: density and weight
81-
# ==========================================
103+
# Normalize histograms using density
104+
# ==================================
82105
#
83106
# Counts-per-bin is the default length of each bar in the histogram. However,
84107
# we can also normalize the bar lengths as a probability density function using
85108
# the ``density`` parameter:
86109

87-
fig, ax = plt.subplots()
110+
fig, ax = plt.subplots(figsize=(6, 3))
111+
88112
ax.hist(xdata, bins=xbins, density=True, **style)
89-
ax.set_ylabel('Probability density [$V^{-1}$])')
90-
ax.set_xlabel('x bins (dx=0.5 $V$)')
113+
114+
ax.set(ylabel='Probability density [$V^{-1}$]',
115+
xlabel='x bins (dx=0.5 $V$)',
116+
title='normalizing histogram using density')
91117

92118
# %%
93119
# This normalization can be a little hard to interpret when just exploring the
@@ -117,55 +143,83 @@
117143
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
118144

119145
# %%
146+
# *density* parameter
147+
# -------------------
148+
#
120149
# If we don't use ``density=True``, we need to scale the expected probability
121150
# distribution function by both the length of the data and the width of the
122151
# bins:
123152

124-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
125153
dx = 0.1
126154
xbins = np.arange(-4, 4, dx)
127-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
128155

156+
fig, ax = plt.subplot_mosaic([['False'], ['True']], layout='constrained',
157+
figsize=(6, 6))
158+
159+
160+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step',
161+
label='Counts', alpha=style['alpha'])
129162
# scale and plot the expected pdf:
130163
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
131-
ax['False'].set_ylabel('Count per bin')
132-
ax['False'].set_xlabel('x bins [V]')
133-
ax['False'].legend()
134164

135-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
165+
166+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step',
167+
label='density', alpha=style['alpha'])
136168
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
137-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
138-
ax['True'].set_xlabel('x bins [$V$]')
169+
170+
171+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
172+
title="normalization using scaling, density=False")
173+
ax['False'].legend()
174+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
175+
title="density=True")
139176
ax['True'].legend()
140177

141178
# %%
142-
# One advantage of using the density is therefore that the shape and amplitude
143-
# of the histogram does not depend on the size of the bins. Consider an
144-
# extreme case where the bins do not have the same width. In this example, the
145-
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
179+
# Preserving distribution shape
180+
# -----------------------------
181+
# One advantage of using the density is that the shape and amplitude of the histogram
182+
# do not depend on the size of the bins.
183+
#
184+
# Irregularly spaced bins
185+
# ^^^^^^^^^^^^^^^^^^^^^^^
186+
# Consider an extreme case where the bins do not have the same width. In this example,
187+
# the bins below ``x=-1.25`` are six times wider than the rest of the bins. By
146188
# normalizing by density, we preserve the shape of the distribution, whereas if
147189
# we do not, then the wider bins have much higher counts than the thinner bins:
148190

149-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
150191
dx = 0.1
151192
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
152-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
193+
194+
fig, ax = plt.subplot_mosaic([['False'], ['True']],
195+
layout='constrained', figsize=(6, 6))
196+
197+
198+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step',
199+
label='Counts', alpha=style['alpha'])
153200
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
154-
ax['False'].set_ylabel('Count per bin')
155-
ax['False'].set_xlabel('x bins [V]')
156-
ax['False'].legend()
157201

158-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
202+
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step',
203+
label='density', alpha=style['alpha'])
159204
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
160-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
161-
ax['True'].set_xlabel('x bins [$V$]')
205+
206+
207+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
208+
title="irregularly spaced bins, density=False")
209+
ax['False'].legend()
210+
211+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
212+
title="irregularly spaced bins, density=True",)
162213
ax['True'].legend()
163214

164215
# %%
216+
# Histograms with different bin widths
217+
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
165218
# Similarly, if we want to compare histograms with different bin widths, we may
166219
# want to use ``density=True``:
167220

168-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
221+
fig, ax = plt.subplot_mosaic([['False'], ['True']],
222+
layout='constrained', figsize=(6, 6))
169223

170224
# expected PDF
171225
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
@@ -174,70 +228,83 @@
174228
xbins = np.arange(-4, 4, dx)
175229
# expected histogram:
176230
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
177-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
231+
ax['False'].hist(xdata, bins=xbins, density=False,
232+
histtype='step', alpha=style['alpha'])
178233

179-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
234+
ax['True'].hist(xdata, bins=xbins, density=True,
235+
histtype='step', label=dx, alpha=style['alpha'])
180236

181237
# Labels:
182-
ax['False'].set_xlabel('x bins [$V$]')
183-
ax['False'].set_ylabel('Count per bin')
184-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
185-
ax['True'].set_xlabel('x bins [$V$]')
238+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [$V$]',
239+
title="density=False")
240+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
241+
title='density=True')
186242
ax['True'].legend(fontsize='small', title='bin width:')
187243

188244
# %%
245+
# Normalize histograms using weights
246+
# ==================================
247+
#
189248
# Sometimes people want to normalize so that the sum of counts is one. This is
190249
# analogous to a `probability mass function
191250
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
192-
# variable where the sum of probabilities for all the values equals one. Using
193-
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
251+
# variable where the sum of probabilities for all the values equals one.
252+
#
253+
# *weights* parameter
254+
# -------------------
255+
# Using ``hist``, we can get this normalization if we set the *weights* to 1/N.
194256
# Note that the amplitude of this normalized histogram still depends on
195-
# width and/or number of the bins:
257+
# width and/or number of bins:
196258

197259
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
198260

199261
for nn, dx in enumerate([0.1, 0.4, 1.2]):
200262
xbins = np.arange(-4, 4, dx)
201263
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
202-
histtype='step', label=f'{dx}')
203-
ax.set_xlabel('x bins [$V$]')
204-
ax.set_ylabel('Bin count / N')
264+
histtype='step', label=f'{dx}', alpha=style['alpha'])
265+
266+
ax.set(ylabel='Bin count / N', xlabel='x bins [$V$]',
267+
title="histogram normalization using weights")
205268
ax.legend(fontsize='small', title='bin width:')
206269

207270
# %%
271+
# Populations of different sizes
272+
# ------------------------------
208273
# The value of normalizing histograms is comparing two distributions that have
209-
# different sized populations. Here we compare the distribution of ``xdata``
274+
# different sized populations. Here we compare the distribution of ``xdata``
210275
# with a population of 1000, and ``xdata2`` with 100 members.
211276

212277
xdata2 = rng.normal(size=100)
213278

214-
fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
215-
layout='constrained', figsize=(8, 4))
279+
fig, ax = plt.subplot_mosaic([['no_norm'], ['density'], ['weight']],
280+
layout='constrained', figsize=(6, 9))
216281

217282
xbins = np.arange(-4, 4, 0.25)
218283

219-
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
220-
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
221-
ax['no_norm'].set_ylabel('Counts')
222-
ax['no_norm'].set_xlabel('x bins [$V$]')
223-
ax['no_norm'].set_title('No normalization')
284+
ax['no_norm'].hist(xdata, bins=xbins, histtype='step', alpha=style['alpha'])
285+
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step', alpha=style['alpha'])
286+
224287

225-
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
226-
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
227-
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
228-
ax['density'].set_title('Density=True')
229-
ax['density'].set_xlabel('x bins [$V$]')
288+
ax['density'].hist(xdata, bins=xbins, histtype='step',
289+
density=True, alpha=style['alpha'])
290+
ax['density'].hist(xdata2, bins=xbins, histtype='step',
291+
density=True, alpha=style['alpha'])
230292

231293
ax['weight'].hist(xdata, bins=xbins, histtype='step',
232294
weights=1 / len(xdata) * np.ones(len(xdata)),
233-
label='N=1000')
295+
label='N=1000', alpha=style['alpha'])
234296
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
235297
weights=1 / len(xdata2) * np.ones(len(xdata2)),
236-
label='N=100')
237-
ax['weight'].set_xlabel('x bins [$V$]')
238-
ax['weight'].set_ylabel('Counts / N')
298+
label='N=100', alpha=style['alpha'])
299+
300+
301+
ax['no_norm'].set(ylabel='Counts', xlabel='x bins [$V$]',
302+
title='No normalization')
303+
ax['density'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
304+
title='Density=True')
305+
ax['weight'].set(ylabel='Counts / N', xlabel='x bins [$V$]',
306+
title='Weight = 1/N')
239307
ax['weight'].legend(fontsize='small')
240-
ax['weight'].set_title('Weight = 1/N')
241308

242309
plt.show()
243310

‎galleries/tutorials/index.rst

Copy file name to clipboardExpand all lines: galleries/tutorials/index.rst
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ a :ref:`FAQ <faq-index>` in our :ref:`user guide <users-guide-index>`.
9595
/tutorials/images
9696
/tutorials/lifecycle
9797
/tutorials/artists
98+
/tutorials/histogram_normalization
9899

99100
.. only:: html
100101

@@ -134,6 +135,7 @@ Intermediate
134135
- :ref:`arranging_axes`
135136
- :ref:`autoscale`
136137
- :ref:`imshow_extent`
138+
- :ref:`histogram_normalization`
137139

138140
Advanced
139141
^^^^^^^^

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.