Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Appearance settings

Commit aec07ce

Browse filesBrowse files
committed
doc: follow up for normalizing histogram
1 parent 60d2f95 commit aec07ce
Copy full SHA for aec07ce

File tree

Expand file treeCollapse file tree

2 files changed

+142
-85
lines changed
Filter options
Expand file treeCollapse file tree

2 files changed

+142
-85
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
"""
22
.. redirect-from:: /gallery/statistics/histogram_features
33
4+
.. _histogram_normalization:
5+
46
===================================
57
Histogram bins, density, and weight
68
===================================
@@ -34,65 +36,90 @@
3436

3537
# changing the style of the histogram bars just to make it
3638
# very clear where the boundaries of the bins are:
37-
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}
39+
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3, 'alpha': .5}
40+
41+
fig, ax = plt.subplots(figsize=(6, 3))
42+
43+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))
3844

39-
fig, ax = plt.subplots()
45+
# count the number of values in xdata between each value in xbins
4046
ax.hist(xdata, bins=xbins, **style)
4147

42-
# plot the xdata locations on the x axis:
43-
ax.plot(xdata, 0*xdata, 'd')
44-
ax.set_ylabel('Number per bin')
45-
ax.set_xlabel('x bins (dx=1.0)')
48+
# plot the xdata events:
49+
ax.eventplot(xdata, orientation='vertical', color='C1', alpha=.5)
50+
51+
ax.set(ylabel='Number per bin', xlabel='x bins (dx=1.0)', title='histogram')
4652

4753
# %%
48-
# Modifying bins
49-
# ==============
54+
# Choose bins
55+
# ===========
5056
#
5157
# Changing the bin size changes the shape of this sparse histogram, so it's a
52-
# good idea to choose bins with some care with respect to your data. Here we
53-
# make the bins half as wide.
58+
# good idea to choose bins with some care with respect to your data. The `.Axes.hist`
59+
# *bins* parameter accepts either the number of bins or a list of bin edges.
60+
#
61+
#
62+
# Set *bins* using fixed edges
63+
# ----------------------------
64+
#
65+
# Here the bins are set to the list of edges [1, 1.5, 2, 2.5, 3, 3.5, 4].
66+
# This is half as wide as the previous example.
5467

5568
xbins = np.arange(1, 4.5, 0.5)
5669

57-
fig, ax = plt.subplots()
70+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))
71+
5872
ax.hist(xdata, bins=xbins, **style)
59-
ax.plot(xdata, 0*xdata, 'd')
60-
ax.set_ylabel('Number per bin')
61-
ax.set_xlabel('x bins (dx=0.5)')
73+
74+
ax.eventplot(xdata, orientation='vertical', color='C1', alpha=.5)
75+
76+
ax.set(ylabel='count', xlabel='x bins (dx=0.5)',
77+
title='fixed bin edges: bins=np.arange(1, 4.5, .5)',)
6278

6379
# %%
80+
#
81+
# Set *bins* using number of bins
82+
# -------------------------------
83+
#
6484
# We can also let numpy (via Matplotlib) choose the bins automatically, or
6585
# specify a number of bins to choose automatically:
6686

67-
fig, ax = plt.subplot_mosaic([['auto', 'n4']],
68-
sharex=True, sharey=True, layout='constrained')
87+
fig, ax = plt.subplot_mosaic([['auto'], ['n4']],
88+
sharex=True, sharey=True,
89+
layout='constrained', figsize=(8, 4))
6990

7091
ax['auto'].hist(xdata, **style)
71-
ax['auto'].plot(xdata, 0*xdata, 'd')
72-
ax['auto'].set_ylabel('Number per bin')
73-
ax['auto'].set_xlabel('x bins (auto)')
92+
ax['auto'].eventplot(xdata, orientation='vertical', color='C1', alpha=.5)
93+
94+
ax['auto'].set(ylabel='count', xlabel='x bins',
95+
title='dynamically computed bin edges: bins="auto"')
7496

7597
ax['n4'].hist(xdata, bins=4, **style)
76-
ax['n4'].plot(xdata, 0*xdata, 'd')
77-
ax['n4'].set_xlabel('x bins ("bins=4")')
98+
ax['n4'].eventplot(xdata, orientation='vertical', color='C1', alpha=.5)
99+
100+
ax['n4'].set(ylabel='count', xlabel='x bins',
101+
title='fixed number of bins: bins=4',)
78102

79103
# %%
80-
# Normalizing histograms: density and weight
81-
# ==========================================
104+
# Normalize histogram
105+
# ===================
82106
#
83107
# Counts-per-bin is the default length of each bar in the histogram. However,
84108
# we can also normalize the bar lengths as a probability density function using
85109
# the ``density`` parameter:
86110

87-
fig, ax = plt.subplots()
111+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))
112+
88113
ax.hist(xdata, bins=xbins, density=True, **style)
89-
ax.set_ylabel('Probability density [$V^{-1}$])')
90-
ax.set_xlabel('x bins (dx=0.5 $V$)')
114+
115+
ax.set(ylabel='Probability density [$V^{-1}$]',
116+
xlabel='x bins (dx=0.5 $V$)',
117+
title='normalizing histogram using density')
91118

92119
# %%
93120
# This normalization can be a little hard to interpret when just exploring the
94121
# data. The value attached to each bar is divided by the total number of data
95-
# points *and* the width of the bin, and thus the values _integrate_ to one
122+
# points *and* the width of the bin, and thus the values *integrate* to one
96123
# when integrating across the full range of data.
97124
# e.g. ::
98125
#
@@ -117,127 +144,154 @@
117144
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)
118145

119146
# %%
147+
# *density* parameter
148+
# -------------------
149+
#
120150
# If we don't use ``density=True``, we need to scale the expected probability
121151
# distribution function by both the length of the data and the width of the
122152
# bins:
123153

124-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
125154
dx = 0.1
126155
xbins = np.arange(-4, 4, dx)
127-
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
128156

157+
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained',
158+
figsize=(8, 4))
159+
160+
161+
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
129162
# scale and plot the expected pdf:
130-
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
131-
ax['False'].set_ylabel('Count per bin')
132-
ax['False'].set_xlabel('x bins [V]')
133-
ax['False'].legend()
163+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$', alpha=.5)
164+
134165

135166
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
136-
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
137-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
138-
ax['True'].set_xlabel('x bins [$V$]')
167+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5)
168+
169+
170+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
171+
title="normalization using scaling, density=False")
172+
ax['False'].legend()
173+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
174+
title="density=True")
139175
ax['True'].legend()
140176

141177
# %%
142-
# One advantage of using the density is therefore that the shape and amplitude
143-
# of the histogram does not depend on the size of the bins. Consider an
144-
# extreme case where the bins do not have the same width. In this example, the
145-
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
178+
# Preserving distribution shape
179+
# -----------------------------
180+
# One advantage of using the density is that the shape and amplitude of the histogram
181+
# do not depend on the size of the bins.
182+
#
183+
# Irregularly spaced bins
184+
# ^^^^^^^^^^^^^^^^^^^^^^^
185+
# Consider an extreme case where the bins do not have the same width. In this example,
186+
# the bins below ``x=-1.25`` are six times wider than the rest of the bins. By
146187
# normalizing by density, we preserve the shape of the distribution, whereas if
147188
# we do not, then the wider bins have much higher counts than the thinner bins:
148189

149-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
150190
dx = 0.1
151191
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
192+
193+
fig, ax = plt.subplot_mosaic([['False', 'True']],
194+
layout='constrained', figsize=(8, 4))
195+
196+
152197
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
153-
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
154-
ax['False'].set_ylabel('Count per bin')
155-
ax['False'].set_xlabel('x bins [V]')
156-
ax['False'].legend()
198+
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$', alpha=.5)
157199

158200
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
159-
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
160-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
161-
ax['True'].set_xlabel('x bins [$V$]')
201+
ax['True'].plot(xpdf, pdf, label='$f_X(x)$', alpha=.5)
202+
203+
204+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [V]',
205+
title="irregularly spaced bins, density=False")
206+
ax['False'].legend()
207+
208+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
209+
title="irregularly spaced bins, density=True",)
162210
ax['True'].legend()
163211

164212
# %%
213+
# Histograms with different bin widths
214+
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
165215
# Similarly, if we want to compare histograms with different bin widths, we may
166216
# want to use ``density=True``:
167217

168-
fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
218+
fig, ax = plt.subplot_mosaic([['False', 'True']],
219+
layout='constrained', figsize=(8, 4))
169220

170221
# expected PDF
171222
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')
172223

173224
for nn, dx in enumerate([0.1, 0.4, 1.2]):
174225
xbins = np.arange(-4, 4, dx)
175226
# expected histogram:
176-
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
227+
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}', alpha=.5)
177228
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')
178229

179-
ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)
230+
ax['True'].hist(xdata, bins=xbins, density=True,
231+
histtype='step', label=dx, alpha=style['alpha'])
180232

181233
# Labels:
182-
ax['False'].set_xlabel('x bins [$V$]')
183-
ax['False'].set_ylabel('Count per bin')
184-
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
185-
ax['True'].set_xlabel('x bins [$V$]')
234+
ax['False'].set(ylabel='Count per bin', xlabel='x bins [$V$]',
235+
title="density=False")
236+
ax['True'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
237+
title='density=True')
186238
ax['True'].legend(fontsize='small', title='bin width:')
187239

188240
# %%
241+
# Assign weights
242+
# ==============
243+
#
189244
# Sometimes people want to normalize so that the sum of counts is one. This is
190245
# analogous to a `probability mass function
191246
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
192-
# variable where the sum of probabilities for all the values equals one. Using
193-
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
247+
# variable where the sum of probabilities for all the values equals one.
248+
#
249+
# *weights* parameter
250+
# -------------------
251+
# Using ``hist``, we can get this normalization if we set the *weights* to 1/N.
194252
# Note that the amplitude of this normalized histogram still depends on
195-
# width and/or number of the bins:
253+
# width and/or number of bins:
196254

197-
fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))
255+
fig, ax = plt.subplots(layout='constrained', figsize=(8, 4))
198256

199257
for nn, dx in enumerate([0.1, 0.4, 1.2]):
200258
xbins = np.arange(-4, 4, dx)
201259
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
202260
histtype='step', label=f'{dx}')
203-
ax.set_xlabel('x bins [$V$]')
204-
ax.set_ylabel('Bin count / N')
261+
262+
ax.set(ylabel='Bin count / N', xlabel='x bins [$V$]',
263+
title="histogram normalization using weights")
205264
ax.legend(fontsize='small', title='bin width:')
206265

207266
# %%
267+
# Populations of different sizes
268+
# ------------------------------
208269
# The value of normalizing histograms is comparing two distributions that have
209-
# different sized populations. Here we compare the distribution of ``xdata``
270+
# different sized populations. Here we compare the distribution of ``xdata``
210271
# with a population of 1000, and ``xdata2`` with 100 members.
211272

212273
xdata2 = rng.normal(size=100)
213274

214-
fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
215-
layout='constrained', figsize=(8, 4))
275+
fig, ax = plt.subplot_mosaic([['no_norm'], ['density'], ['weight']],
276+
layout='constrained', figsize=(8,2))
216277

217278
xbins = np.arange(-4, 4, 0.25)
218279

219-
ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
220-
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
221-
ax['no_norm'].set_ylabel('Counts')
222-
ax['no_norm'].set_xlabel('x bins [$V$]')
223-
ax['no_norm'].set_title('No normalization')
224-
225-
ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
226-
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
227-
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
228-
ax['density'].set_title('Density=True')
229-
ax['density'].set_xlabel('x bins [$V$]')
230-
231-
ax['weight'].hist(xdata, bins=xbins, histtype='step',
232-
weights=1 / len(xdata) * np.ones(len(xdata)),
233-
label='N=1000')
234-
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
235-
weights=1 / len(xdata2) * np.ones(len(xdata2)),
236-
label='N=100')
237-
ax['weight'].set_xlabel('x bins [$V$]')
238-
ax['weight'].set_ylabel('Counts / N')
280+
for xd in [xdata, xdata2]:
281+
ax['no_norm'].hist(xd, bins=xbins, histtype='step')
282+
ax['density'].hist(xd, bins=xbins, histtype='step', density=True)
283+
ax['weight'].hist(xd, bins=xbins, histtype='step',
284+
weights=1 / len(xd) * np.ones(len(xd)),
285+
label=f'N={len(xd)}')
286+
287+
288+
ax['no_norm'].set(ylabel='Counts', xlabel='x bins [$V$]',
289+
title='No normalization')
290+
ax['density'].set(ylabel='Probability density [$V^{-1}$]', xlabel='x bins [$V$]',
291+
title='Density=True')
292+
ax['weight'].set(ylabel='Counts / N', xlabel='x bins [$V$]',
293+
title='Weight = 1/N')
239294
ax['weight'].legend(fontsize='small')
240-
ax['weight'].set_title('Weight = 1/N')
241295

242296
plt.show()
243297

@@ -253,3 +307,4 @@
253307
# - `matplotlib.axes.Axes.set_xlabel`
254308
# - `matplotlib.axes.Axes.set_ylabel`
255309
# - `matplotlib.axes.Axes.legend`
310+
#

‎galleries/tutorials/index.rst

Copy file name to clipboardExpand all lines: galleries/tutorials/index.rst
+2Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ a :ref:`FAQ <faq-index>` in our :ref:`user guide <users-guide-index>`.
9595
/tutorials/images
9696
/tutorials/lifecycle
9797
/tutorials/artists
98+
/tutorials/histogram_normalization
9899

99100
.. only:: html
100101

@@ -134,6 +135,7 @@ Intermediate
134135
- :ref:`arranging_axes`
135136
- :ref:`autoscale`
136137
- :ref:`imshow_extent`
138+
- :ref:`histogram_normalization`
137139

138140
Advanced
139141
^^^^^^^^

0 commit comments

Comments
0 (0)
Morty Proxy This is a proxified and sanitized view of the page, visit original site.