diff --git a/.github/workflows/cache.yml b/.github/workflows/cache.yml index 3a6d5b5..034f9a9 100644 --- a/.github/workflows/cache.yml +++ b/.github/workflows/cache.yml @@ -55,14 +55,14 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Setup Anaconda - uses: goanpeca/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true auto-activate-base: true miniconda-version: 'latest' - python-version: 3.7 + python-version: 3.8 environment-file: environment.yml - activate-environment: qe-lectures + activate-environment: lecture-python - name: Checkout QuantEcon theme uses: actions/checkout@v2 with: @@ -78,5 +78,6 @@ jobs: - name: Build Website files shell: bash -l {0} run: | + ls theme/lecture-python.theme make website THEMEPATH=theme/lecture-python.theme ls _build/website/jupyter_html/* \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1385420..dc98584 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -7,14 +7,14 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Setup Anaconda - uses: goanpeca/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true auto-activate-base: true miniconda-version: 'latest' - python-version: 3.7 + python-version: 3.8 environment-file: environment.yml - activate-environment: qe-lectures + activate-environment: lecture-python - name: Display Conda Environment Versions shell: bash -l {0} run: conda list diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml index 2932d30..e73acc4 100644 --- a/.github/workflows/coverage.yml +++ b/.github/workflows/coverage.yml @@ -10,14 +10,14 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Setup Anaconda - uses: goanpeca/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true auto-activate-base: true miniconda-version: 'latest' - python-version: 3.7 + python-version: 3.8 environment-file: 
environment.yml - activate-environment: qe-lectures + activate-environment: lecture-python - name: Run Execution Tests shell: bash -l {0} run: make coverage @@ -28,14 +28,14 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Setup Anaconda - uses: goanpeca/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true auto-activate-base: true miniconda-version: 'latest' - python-version: 3.7 + python-version: 3.8 environment-file: environment.yml - activate-environment: qe-lectures + activate-environment: lecture-python - name: Run Linkchecker shell: bash -l {0} run: make linkcheck \ No newline at end of file diff --git a/.github/workflows/pdf.yml b/.github/workflows/pdf.yml index 9de309a..30d4437 100644 --- a/.github/workflows/pdf.yml +++ b/.github/workflows/pdf.yml @@ -28,14 +28,20 @@ jobs: source ~/.bash_profile xelatex --version - name: Setup Anaconda - uses: goanpeca/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true auto-activate-base: true miniconda-version: 'latest' - python-version: 3.7 + python-version: 3.8 environment-file: environment.yml - activate-environment: qe-lectures + activate-environment: lecture-python + - name: Checkout QuantEcon theme + uses: actions/checkout@v2 + with: + repository: QuantEcon/lecture-python.theme + token: ${{ secrets.ACTIONS_PAT }} + path: theme/lecture-python.theme - name: Build PDF shell: bash -l {0} run: | @@ -43,6 +49,7 @@ jobs: echo 'export PATH=/tmp/texlive/bin/x86_64-linux:$PATH' >> ~/.bash_profile source ~/.bash_profile more ~/.bash_profile + ls theme/lecture-python.theme make pdf - uses: actions/upload-artifact@v2 with: diff --git a/.github/workflows/preview.yml b/.github/workflows/preview.yml index a883918..a57f9f4 100644 --- a/.github/workflows/preview.yml +++ b/.github/workflows/preview.yml @@ -10,14 +10,14 @@ jobs: - name: Checkout uses: actions/checkout@v2 - name: Setup Anaconda - uses: goanpeca/setup-miniconda@v1 + uses: 
conda-incubator/setup-miniconda@v2 with: auto-update-conda: true auto-activate-base: true miniconda-version: 'latest' - python-version: 3.7 + python-version: 3.8 environment-file: environment.yml - activate-environment: qe-lectures + activate-environment: lecture-python - name: Get Changed Files id: files uses: jitterbit/get-changed-files@v1 @@ -43,7 +43,7 @@ jobs: run: | bash scripts/build-website.sh "${{ steps.files.outputs.added_modified }}" "${{ github.event.pull_request.head.repo.full_name == github.repository }}" - name: Preview Deploy to Netlify - uses: nwtgck/actions-netlify@v1.0 + uses: nwtgck/actions-netlify@v1.1 if: env.BUILD_NETLIFY == 'true' && github.event.pull_request.head.repo.full_name == github.repository with: publish-dir: './_build/website/jupyter_html' diff --git a/Makefile b/Makefile index 18892b0..02e8414 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ SHELL := bash # You can set these variables from the command line. SPHINXOPTS = -c "./" SPHINXBUILD = python -msphinx -SPHINXPROJ = lecture-python-programming +SPHINXPROJ = lecture-python SOURCEDIR = source/rst BUILDDIR = _build BUILDWEBSITE = _build/website @@ -57,10 +57,10 @@ endif website: echo "Theme: $(THEMEPATH)" ifneq ($(strip $(parallel)),) - @$(SPHINXBUILD) -M jupyter "$(SOURCEDIR)" "$(BUILDWEBSITE)" $(FILES) $(SPHINXOPTS) $(O) -D jupyter_make_site=1 -D jupyter_generate_html=1 -D jupyter_download_nb=1 -D jupyter_execute_notebooks=1 -D jupyter_target_html=1 -D jupyter_download_nb_image_urlpath="https://s3-ap-southeast-2.amazonaws.com/python-programming.quantecon.org/_static/" -D jupyter_images_markdown=0 -D jupyter_theme_path="$(THEMEPATH)" -D jupyter_template_path="$(TEMPLATEPATH)" -D jupyter_html_template="html.tpl" -D jupyter_download_nb_urlpath="https://python-programming.quantecon.org/" -D jupyter_coverage_dir=$(BUILDCOVERAGE) -D jupyter_number_workers=$(parallel) + @$(SPHINXBUILD) -M jupyter "$(SOURCEDIR)" "$(BUILDWEBSITE)" $(FILES) $(SPHINXOPTS) $(O) -D jupyter_make_site=1 
-D jupyter_generate_html=1 -D jupyter_download_nb=1 -D jupyter_execute_notebooks=1 -D jupyter_target_html=1 -D jupyter_download_nb_image_urlpath="https://s3-ap-southeast-2.amazonaws.com/python.quantecon.org/_static/" -D jupyter_images_markdown=0 -D jupyter_theme_path="$(THEMEPATH)" -D jupyter_template_path="$(TEMPLATEPATH)" -D jupyter_html_template="html.tpl" -D jupyter_download_nb_urlpath="https://python-programming.quantecon.org/" -D jupyter_coverage_dir=$(BUILDCOVERAGE) -D jupyter_number_workers=$(parallel) else - @$(SPHINXBUILD) -M jupyter "$(SOURCEDIR)" "$(BUILDWEBSITE)" $(FILES) $(SPHINXOPTS) $(O) -D jupyter_make_site=1 -D jupyter_generate_html=1 -D jupyter_download_nb=1 -D jupyter_execute_notebooks=1 -D jupyter_target_html=1 -D jupyter_download_nb_image_urlpath="https://s3-ap-southeast-2.amazonaws.com/python-programming.quantecon.org/_static/" -D jupyter_images_markdown=0 -D jupyter_theme_path="$(THEMEPATH)" -D jupyter_template_path="$(TEMPLATEPATH)" -D jupyter_html_template="html.tpl" -D jupyter_download_nb_urlpath="https://python-programming.quantecon.org/" -D jupyter_coverage_dir=$(BUILDCOVERAGE) + @$(SPHINXBUILD) -M jupyter "$(SOURCEDIR)" "$(BUILDWEBSITE)" $(FILES) $(SPHINXOPTS) $(O) -D jupyter_make_site=1 -D jupyter_generate_html=1 -D jupyter_download_nb=1 -D jupyter_execute_notebooks=1 -D jupyter_target_html=1 -D jupyter_download_nb_image_urlpath="https://s3-ap-southeast-2.amazonaws.com/python.quantecon.org/_static/" -D jupyter_images_markdown=0 -D jupyter_theme_path="$(THEMEPATH)" -D jupyter_template_path="$(TEMPLATEPATH)" -D jupyter_html_template="html.tpl" -D jupyter_download_nb_urlpath="https://python-programming.quantecon.org/" -D jupyter_coverage_dir=$(BUILDCOVERAGE) endif pdf: diff --git a/environment.yml b/environment.yml index df2d510..4957b11 100644 --- a/environment.yml +++ b/environment.yml @@ -1,26 +1,14 @@ -name: qe-lectures + +name: lecture-python channels: - default - - conda-forge dependencies: + - python=3.8 + - anaconda=2020.07 - pip 
- - python - - jupyter - - jupyterlab - - nbconvert - - pandoc - - pandas - - numba - - numpy - - matplotlib - - networkx - - sphinx=2.4.4 - - scikit-learn - - statsmodels - - seaborn - pip: + - quantecon - interpolation - sphinxcontrib-jupyter - - sphinxcontrib-bibtex - - quantecon + - sphinxcontrib-bibtex==1.0 - joblib diff --git a/scripts/build-website.sh b/scripts/build-website.sh index 9331fde..833dff2 100644 --- a/scripts/build-website.sh +++ b/scripts/build-website.sh @@ -15,10 +15,10 @@ done echo "List of Changed RST Files: $RST_FILES" echo "Building with Private theme: $PRIVATE_THEME" if [ -z "$RST_FILES" ]; then - echo "::set-env name=BUILD_NETLIFY::false" + echo "BUILD_NETLIFY=false" >> $GITHUB_ENV echo "No RST Files have changed -- nothing to do in this PR" else - echo "::set-env name=BUILD_NETLIFY::true" + echo "BUILD_NETLIFY=true" >> $GITHUB_ENV RST_FILES="$RST_FILES source/rst/index_toc.rst" if [ "$PRIVATE_THEME" = true ]; then make website THEMEPATH=theme/lecture-python.theme FILES="$RST_FILES" diff --git a/source/_static/lecture_specific/optgrowth/cd_analytical.py b/source/_static/lecture_specific/optgrowth/cd_analytical.py index ec713ca..3538114 100644 --- a/source/_static/lecture_specific/optgrowth/cd_analytical.py +++ b/source/_static/lecture_specific/optgrowth/cd_analytical.py @@ -1,4 +1,3 @@ - def v_star(y, α, β, μ): """ True value function @@ -13,5 +12,4 @@ def σ_star(y, α, β): """ True optimal policy """ - return (1 - α * β) * y - + return (1 - α * β) * y \ No newline at end of file diff --git a/source/_static/lecture_specific/optgrowth_fast/ogm.py b/source/_static/lecture_specific/optgrowth_fast/ogm.py index 8dccd96..63d72ee 100644 --- a/source/_static/lecture_specific/optgrowth_fast/ogm.py +++ b/source/_static/lecture_specific/optgrowth_fast/ogm.py @@ -1,4 +1,3 @@ - opt_growth_data = [ ('α', float64), # Production parameter ('β', float64), # Discount factor @@ -51,6 +50,4 @@ def u_prime(self, c): def u_prime_inv(self, c): "Inverse of u'" - 
return 1/c - - + return 1/c \ No newline at end of file diff --git a/source/_static/lecture_specific/optgrowth_fast/ogm_crra.py b/source/_static/lecture_specific/optgrowth_fast/ogm_crra.py index 606c8c1..1bd22e4 100644 --- a/source/_static/lecture_specific/optgrowth_fast/ogm_crra.py +++ b/source/_static/lecture_specific/optgrowth_fast/ogm_crra.py @@ -1,5 +1,3 @@ - - opt_growth_data = [ ('α', float64), # Production parameter ('β', float64), # Discount factor @@ -51,5 +49,4 @@ def u_prime(self, c): return c**(-self.γ) def u_prime_inv(c): - return c**(-1 / self.γ) - + return c**(-1 / self.γ) \ No newline at end of file diff --git a/source/_static/quant-econ.bib b/source/_static/quant-econ.bib index a2addb5..d96b494 100644 @@ -790,7 +790,7 @@ @article{HarrisonKreps1979 @article{HansenRichard1987, author={Hansen, Lars Peter and Richard, Scott F}, - title={{The Role of Conditioning Information in Deducing Testable}}, + title={The Role of Conditioning Information in Deducing Testable Restrictions Implied by Dynamic Asset Pricing Models}, journal={Econometrica}, year=1987, volume={55}, diff --git a/source/rst/aiyagari.rst b/source/rst/aiyagari.rst index d9b6f31..4d7b291 100644 --- a/source/rst/aiyagari.rst +++ b/source/rst/aiyagari.rst @@ -16,7 +16,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ============ diff --git a/source/rst/ar1_processes.rst b/source/rst/ar1_processes.rst index 4e340e2..1d96051 100644 --- a/source/rst/ar1_processes.rst +++ b/source/rst/ar1_processes.rst @@ -101,7 +101,7 @@ series :math:`\{ X_t\}`. To see this, we first note that :math:`X_t` is normally distributed for each :math:`t`.
-This is immediate form :eq:`ar1_ma`, since linear combinations of independent +This is immediate from :eq:`ar1_ma`, since linear combinations of independent normal random variables are normal. Given that :math:`X_t` is normally distributed, we will know the full distribution @@ -209,7 +209,7 @@ In fact it's easy to show that such convergence will occur, regardless of the in To see this, we just have to look at the dynamics of the first two moments, as given in :eq:`dyn_tm`. -When :math:`|a| < 1`, these sequence converge to the respective limits +When :math:`|a| < 1`, these sequences converge to the respective limits .. math:: :label: mu_sig_star diff --git a/source/rst/cake_eating_numerical.rst b/source/rst/cake_eating_numerical.rst index a9f7c45..8a6c6dd 100644 --- a/source/rst/cake_eating_numerical.rst +++ b/source/rst/cake_eating_numerical.rst @@ -14,12 +14,9 @@ In addition to what's in Anaconda, this lecture will require the following libra !pip install interpolation - - Overview ======== - In this lecture we continue the study of :doc:`the cake eating problem `. @@ -88,7 +85,8 @@ The basic idea is: 2. Obtain an update :math:`w` defined by - .. math:: + .. math:: + w(x) = \max_{0\leq c \leq x} \{u(c) + \beta v(x-c)\} 3. Stop if :math:`w` is approximately equal to :math:`v`, otherwise set @@ -100,7 +98,7 @@ The Bellman Operator -------------------- We introduce the **Bellman operator** :math:`T` that takes a function `v` as an -argument and returns a new function :math:`Tv` defined by. +argument and returns a new function :math:`Tv` defined by .. math:: @@ -116,12 +114,10 @@ As we discuss in more detail in later lectures, one can use Banach's contraction mapping theorem to prove that the sequence of functions :math:`T^n v` converges to the solution to the Bellman equation. - - Fitted Value Function Iteration ------------------------------- -Both consumption :math:`c` and the state variable :math:`x` are continous. 
+Both consumption :math:`c` and the state variable :math:`x` are continuous. This causes complications when it comes to numerical work. @@ -138,18 +134,19 @@ The process looks like this: #. Begin with an array of values :math:`\{ v_0, \ldots, v_I \}` representing the values of some initial function :math:`v` on the grid points :math:`\{ x_0, \ldots, x_I \}`. + #. Build a function :math:`\hat v` on the state space :math:`\mathbb R_+` by linear interpolation, based on these data points. + #. Obtain and record the value :math:`T \hat v(x_i)` on each grid point :math:`x_i` by repeatedly solving the maximization problem in the Bellman equation. + #. Unless some stopping condition is satisfied, set :math:`\{ v_0, \ldots, v_I \} = \{ T \hat v(x_0), \ldots, T \hat v(x_I) \}` and go to step 2. In step 2 we'll use continuous piecewise linear interpolation. - - Implementation -------------- @@ -361,8 +358,6 @@ The reason is that the utility function and hence value function is very steep near the lower boundary, and hence hard to approximate. - - Policy Function --------------- @@ -372,6 +367,7 @@ In the :doc:`first lecture on cake eating `, the optimal consumption policy was shown to be .. math:: + \sigma^*(x) = \left(1-\beta^{1/\gamma} \right) x Let's see if our numerical results lead to something similar. @@ -379,6 +375,7 @@ Let's see if our numerical results lead to something similar. Our numerical strategy will be to compute .. math:: + \sigma(x) = \arg \max_{0 \leq c \leq x} \{u(c) + \beta v(x - c)\} on a grid of :math:`x` points and then interpolate. @@ -433,7 +430,7 @@ Let's plot this next to the true analytical solution plt.show() -The fit is reasoable but not perfect. +The fit is reasonable but not perfect. We can improve it by increasing the grid size or reducing the error tolerance in the value function iteration routine. 
@@ -498,8 +495,6 @@ This is due to - - Exercises ========= @@ -513,6 +508,7 @@ Instead of the cake size changing according to :math:`x_{t+1} = x_t - c_t`, let it change according to .. math:: + x_{t+1} = (x_t - c_t)^{\alpha} where :math:`\alpha` is a parameter satisfying :math:`0 < \alpha < 1`. @@ -531,16 +527,14 @@ Implement time iteration, returning to the original case (i.e., dropping the modification in the exercise above). - - Solutions -========== +========= Exercise 1 ------------ +---------- -We need to create a class to hold our primitives and return the right hand side of the bellman equation. +We need to create a class to hold our primitives and return the right hand side of the Bellman equation. We will use `inheritance `__ to maximize code reuse. @@ -613,15 +607,11 @@ the standard cake eating case :math:`\alpha=1`. Consumption is higher when :math:`\alpha < 1` because, at least for large :math:`x`, the return to savings is lower. - - - Exercise 2 ---------- Here's one way to implement time iteration. - .. code-block:: python3 def K(σ_array, ce): @@ -706,5 +696,3 @@ Here's one way to implement time iteration. ax.legend(fontsize=12) plt.show() - - diff --git a/source/rst/cake_eating_problem.rst b/source/rst/cake_eating_problem.rst index ad61b82..0b4da80 100644 --- a/source/rst/cake_eating_problem.rst +++ b/source/rst/cake_eating_problem.rst @@ -1,5 +1,3 @@ - - .. 
highlight:: python3 ********************************************** @@ -32,7 +30,7 @@ Readers might find it helpful to review the following lectures before reading th * The :doc:`shortest paths lecture ` * The :doc:`basic McCall model ` * The :doc:`McCall model with separation ` -* The :doc:`McCall model with separation and a continuous wage distribution ` +* The :doc:`McCall model with separation and a continuous wage distribution ` In what follows, we require the following imports: @@ -46,7 +44,7 @@ In what follows, we require the following imports: The Model -================== +========= We consider an infinite time horizon :math:`t=0, 1, 2, 3..` @@ -60,7 +58,8 @@ We choose how much of the cake to eat in any given period :math:`t`. After choosing to consume :math:`c_t` of the cake in period :math:`t` there is .. math:: - x_{t+1} = x_t - c_t + + x_{t+1} = x_t - c_t left in period :math:`t+1`. @@ -70,9 +69,11 @@ Consuming quantity :math:`c` of the cake gives current utility :math:`u(c)`. We adopt the CRRA utility function .. math:: - u(c) = \frac{c^{1-\gamma}}{1-\gamma} \qquad (\gamma \gt 0, \, \gamma \neq 1) :label: crra_utility + u(c) = \frac{c^{1-\gamma}}{1-\gamma} \qquad (\gamma \gt 0, \, \gamma \neq 1) + + In Python this is .. code-block:: python3 @@ -89,16 +90,19 @@ In particular, consumption of :math:`c` units :math:`t` periods hence has presen The agent's problem can be written as .. math:: - \max_{\{c_t\}} \sum_{t=0}^\infty \beta^t u(c_t) :label: cake_objective + \max_{\{c_t\}} \sum_{t=0}^\infty \beta^t u(c_t) + + subject to .. math:: - x_{t+1} = x_t - c_t + :label: cake_feasible + + x_{t+1} = x_t - c_t \quad \text{and} \quad 0\leq c_t\leq x_t - :label: cake_feasible for all :math:`t`. @@ -169,9 +173,11 @@ the current time when :math:`x` units of cake are left. That is, .. 
math:: - v(x) = \max \sum_{t=0}^{\infty} \beta^t u(c_t) :label: value_fun + v(x) = \max \sum_{t=0}^{\infty} \beta^t u(c_t) + + where the maximization is over all paths :math:`\{ c_t \}` that are feasible from :math:`x_0 = x`. @@ -181,7 +187,7 @@ make inferences about it. For example, as was the case with the :doc:`McCall model `, the value function will satisfy a version of the *Bellman equation*. -In the present case, this equation states that :math:`v` satisfies +In the present case, this equation states that :math:`v` satisfies .. math:: :label: bellman-cep @@ -214,9 +220,11 @@ It has been shown that, with :math:`u` as the CRRA utility function in :eq:`crra_utility`, the function .. math:: - v^*(x_t) = \left( 1-\beta^{1/\gamma} \right)^{-\gamma}u(x_t) :label: crra_vstar + v^*(x_t) = \left( 1-\beta^{1/\gamma} \right)^{-\gamma}u(x_t) + + solves the Bellman equation and hence is equal to the value function. You are asked to confirm that this is true in the exercises below. @@ -269,6 +277,7 @@ We should choose consumption to maximize the right hand side of the Bellman equation :eq:`bellman-cep`. .. math:: + c^* = \arg \max_{c} \{u(c) + \beta v(x - c)\} We can think of this optimal choice as a function of the state :math:`x`, in @@ -277,16 +286,19 @@ which case we call it the **optimal policy**. We denote the optimal policy by :math:`\sigma^*`, so that .. math:: + \sigma^*(x) := \arg \max_{c} \{u(c) + \beta v(x - c)\} \quad \text{for all } x If we plug the analytical expression :eq:`crra_vstar` for the value function -into the right hand side and compute the optimum, we find that +into the right hand side and compute the optimum, we find that .. math:: - \sigma^*(x) = \left( 1-\beta^{1/\gamma} \right) x :label: crra_opt_pol + \sigma^*(x) = \left( 1-\beta^{1/\gamma} \right) x + + Now let's recall our intuition on the impact of parameters. We guessed that the consumption rate would be decreasing in both parameters. 
@@ -338,14 +350,14 @@ provides key insights that are hard to obtain by other methods. Statement and Implications -------------------------- -The Euler equation for the present problem can be stated as +The Euler equation for the present problem can be stated as .. math:: :label: euler-cep u^{\prime} (c^*_{t})=\beta u^{\prime}(c^*_{t+1}) -This is necessary condition for the optimal path. +This is a necessary condition for the optimal path. It says that, along the optimal path, marginal rewards are equalized across time, after appropriate discounting. @@ -383,7 +395,7 @@ In the exercises, you are asked to verify that the optimal policy For a proof of sufficiency of the Euler equation in a very general setting, see proposition 2.2 of :cite:`ma2020income`. -The following arguments focus on necessity, explaining why an optimal path or +The following arguments focus on necessity, explaining why an optimal path or policy should satisfy the Euler equation. @@ -397,7 +409,8 @@ Let's write :math:`c` as a shorthand for consumption path :math:`\{c_t\}_{t=0}^\ The overall cake-eating maximization problem can be written as .. math:: - \max_{c \in F} U(c) + + \max_{c \in F} U(c) \quad \text{where } U(c) := \sum_{t=0}^\infty \beta^t u(c_t) and :math:`F` is the set of feasible consumption paths. @@ -408,6 +421,7 @@ So the optimal path :math:`c^* := \{c^*_t\}_{t=0}^\infty` must satisfy :math:`U'(c^*) = 0`. .. note:: + If you want to know exactly how the derivative :math:`U'(c^*)` is defined, given that the argument :math:`c^*` is a vector of infinite length, you can start by learning about `Gateaux derivatives @@ -417,7 +431,7 @@ So the optimal path :math:`c^* := \{c^*_t\}_{t=0}^\infty` must satisfy In other words, the rate of change in :math:`U` must be zero for any infinitesimally small (and feasible) perturbation away from the optimal path.
-So consider a feasible perturbation that reduces consumption at time :math:`t` to +So consider a feasible perturbation that reduces consumption at time :math:`t` to :math:`c^*_t - h` and increases it in the next period to :math:`c^*_{t+1} + h`. @@ -428,6 +442,7 @@ We call this perturbed path :math:`c^h`. By the preceding argument about zero gradients, we have .. math:: + \lim_{h \to 0} \frac{U(c^h) - U(c^*)}{h} = U'(c^*) = 0 @@ -435,21 +450,24 @@ Recalling that consumption only changes at :math:`t` and :math:`t+1`, this becomes .. math:: - \lim_{h \to 0} - \frac{\beta^t u(c^*_t - h) + \beta^{t+1} u(c^*_{t+1} + h) + + \lim_{h \to 0} + \frac{\beta^t u(c^*_t - h) + \beta^{t+1} u(c^*_{t+1} + h) - \beta^t u(c^*_t) - \beta^{t+1} u(c^*_{t+1}) }{h} = 0 After rearranging, the same expression can be written as .. math:: - \lim_{h \to 0} + + \lim_{h \to 0} \frac{u(c^*_t - h) - u(c^*_t) }{h} - + \lim_{h \to 0} + + \lim_{h \to 0} \frac{ \beta u(c^*_{t+1} + h) - u(c^*_{t+1}) }{h} = 0 or, taking the limit, .. math:: + - u'(c^*_t) + \beta u'(c^*_{t+1}) = 0 This is just the Euler equation. @@ -458,7 +476,7 @@ This is just the Euler equation. Derivation II: Using the Bellman Equation ------------------------------------------ -Another way to derive the Euler equation is to use the Bellman equation :eq:`bellman-cep`. +Another way to derive the Euler equation is to use the Bellman equation :eq:`bellman-cep`. Taking the derivative on the right hand side of the Bellman equation with respect to :math:`c` and setting it to zero, we get @@ -470,7 +488,7 @@ respect to :math:`c` and setting it to zero, we get To obtain :math:`v^{\prime}(x - c)`, we set :math:`g(c,x) = u(c) + \beta v(x - c)`, so that, at the optimal choice of -consumption, +consumption, .. math:: :label: bellman_equality @@ -481,24 +499,27 @@ Differentiating both sides while acknowledging that the maximizing consumption w on :math:`x`, we get .. 
math:: - v' (x) = + + v' (x) = \frac{\partial }{\partial c} g(c,x) \frac{\partial c}{\partial x} + \frac{\partial }{\partial x} g(c,x) - + When :math:`g(c,x)` is maximized at :math:`c`, we have :math:`\frac{\partial }{\partial c} g(c,x) = 0`. Hence the derivative simplifies to .. math:: - v' (x) = + :label: bellman_envelope + + v' (x) = \frac{\partial g(c,x)}{\partial x} = \frac{\partial }{\partial x} \beta v(x - c) = \beta v^{\prime}(x - c) - :label: bellman_envelope -(This argument is an example of the `Envelope Theorem `__.) + +(This argument is an example of the `Envelope Theorem `__.) But now an application of :eq:`bellman_FOC` gives @@ -545,9 +566,10 @@ Exercise 1 ----------- We start with the conjecture :math:`c_t^*=\theta x_t`, which leads to a path -for the state variable (cake size) given by +for the state variable (cake size) given by .. math:: + x_{t+1}=x_t(1-\theta) Then :math:`x_t = x_{0}(1-\theta)^t` and hence @@ -556,7 +578,7 @@ Then :math:`x_t = x_{0}(1-\theta)^t` and hence .. math:: \begin{aligned} - v(x_0) + v(x_0) & = \sum_{t=0}^{\infty} \beta^t u(\theta x_t)\\ & = \sum_{t=0}^{\infty} \beta^t u(\theta x_0 (1-\theta)^t ) \\ & = \sum_{t=0}^{\infty} \theta^{1-\gamma} \beta^t (1-\theta)^{t(1-\gamma)} u(x_0) \\ @@ -566,15 +588,16 @@ Then :math:`x_t = x_{0}(1-\theta)^t` and hence From the Bellman equation, then, .. math:: + \begin{aligned} v(x) & = \max_{0\leq c\leq x} \left\{ - u(c) + + u(c) + \beta\frac{\theta^{1-\gamma}}{1-\beta(1-\theta)^{1-\gamma}}\cdot u(x-c) \right\} \\ & = \max_{0\leq c\leq x} \left\{ - \frac{c^{1-\gamma}}{1-\gamma} + + \frac{c^{1-\gamma}}{1-\gamma} + \beta\frac{\theta^{1-\gamma}} {1-\beta(1-\theta)^{1-\gamma}} \cdot\frac{(x-c)^{1-\gamma}}{1-\gamma} @@ -584,41 +607,48 @@ From the Bellman equation, then, From the first order condition, we obtain .. math:: + c^{-\gamma} + \beta\frac{\theta^{1-\gamma}}{1-\beta(1-\theta)^{1-\gamma}}\cdot(x-c)^{-\gamma}(-1) = 0 or .. 
math:: + c^{-\gamma} = \beta\frac{\theta^{1-\gamma}}{1-\beta(1-\theta)^{1-\gamma}}\cdot(x-c)^{-\gamma} With :math:`c = \theta x` we get .. math:: + \left(\theta x\right)^{-\gamma} = \beta\frac{\theta^{1-\gamma}}{1-\beta(1-\theta)^{1-\gamma}}\cdot(x(1-\theta))^{- \gamma} Some rearrangement produces .. math:: + \theta = 1-\beta^{\frac{1}{\gamma}} This confirms our earlier expression for the optimal policy: .. math:: + c_t^* = \left(1-\beta^{\frac{1}{\gamma}}\right)x_t Substituting :math:`\theta` into the value function above gives .. math:: + v^*(x_t) = \frac{\left(1-\beta^{\frac{1}{\gamma}}\right)^{1-\gamma}} {1-\beta\left(\beta^{\frac{{1-\gamma}}{\gamma}}\right)} u(x_t) \\ Rearranging gives .. math:: + v^*(x_t) = \left(1-\beta^\frac{1}{\gamma}\right)^{-\gamma}u(x_t) diff --git a/source/rst/career.rst b/source/rst/career.rst index 5bac975..fb87927 100644 --- a/source/rst/career.rst +++ b/source/rst/career.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ============ @@ -64,9 +64,8 @@ In what follows we distinguish between a career and a job, where For workers, wages can be decomposed into the contribution of job and career * :math:`w_t = \theta_t + \epsilon_t`, where - - * :math:`\theta_t` is the contribution of career at time :math:`t` - * :math:`\epsilon_t` is the contribution of the job at time :math:`t` + * :math:`\theta_t` is the contribution of career at time :math:`t` + * :math:`\epsilon_t` is the contribution of the job at time :math:`t` At the start of time :math:`t`, a worker has the following options diff --git a/source/rst/cass_koopmans_1.rst b/source/rst/cass_koopmans_1.rst index fdad5eb..75c008e 100644 --- a/source/rst/cass_koopmans_1.rst +++ b/source/rst/cass_koopmans_1.rst @@ -14,14 +14,14 @@ Cass-Koopmans Planning Problem Overview ========= -This lecture and in :doc:`Cass-Koopmans Competitive Equilibrium ` describe a model that Tjalling Koopmans :cite:`Koopmans` +This lecture and lecture :doc:`Cass-Koopmans Competitive Equilibrium ` describe a model that Tjalling Koopmans :cite:`Koopmans` and David Cass :cite:`Cass` used to analyze optimal growth. The model can be viewed as an extension of the model of Robert Solow described in `an earlier lecture `__ but adapted to make the saving rate the outcome of an optimal choice. -(Solow assumed a constant saving rate determined outside the model). +(Solow assumed a constant saving rate determined outside the model.) We describe two versions of the model, one in this lecture and the other in :doc:`Cass-Koopmans Competitive Equilibrium `. @@ -30,7 +30,7 @@ Together, the two lectures illustrate what is, in fact, a more general connection between a **planned economy** and a decentralized economy organized as a **competitive equilibrium**. -This lecture is devoted to the planned economy version. +This lecture is devoted to the planned economy version. 
The lecture uses important ideas including @@ -46,7 +46,8 @@ Let's start with some standard imports: .. code-block:: ipython - from numba import njit, jitclass, float64 + from numba import njit, float64 + from numba.experimental import jitclass import numpy as np import matplotlib.pyplot as plt %matplotlib inline @@ -110,18 +111,18 @@ capital stock. There is an economy-wide production function .. math:: - :label: production-function + :label: production-function - F(K_t,N_t) = A K_t^{\alpha}N_t^{1-\alpha} + F(K_t,N_t) = A K_t^{\alpha}N_t^{1-\alpha} with :math:`0 < \alpha<1`, :math:`A > 0`. A feasible allocation :math:`\vec{C}, \vec{K}` satisfies .. math:: - :label: allocation + :label: allocation - C_t + K_{t+1} \leq F(K_t,N_t) + (1-\delta) K_t, \quad \text{for all } t \in [0, T] + C_t + K_{t+1} \leq F(K_t,N_t) + (1-\delta) K_t, \quad \text{for all } t \in [0, T] where :math:`\delta \in (0,1)` is a depreciation rate of capital. @@ -138,29 +139,29 @@ To find an optimal allocation, form a Lagrangian .. math:: - \mathcal{L}(\vec{C} ,\vec{K} ,\vec{\mu} ) = - \sum_{t=0}^T \beta^t\left\{ u(C_t)+ \mu_t - \left(F(K_t,1) + (1-\delta) K_t- C_t - K_{t+1} \right)\right\} + \mathcal{L}(\vec{C} ,\vec{K} ,\vec{\mu} ) = + \sum_{t=0}^T \beta^t\left\{ u(C_t)+ \mu_t + \left(F(K_t,1) + (1-\delta) K_t- C_t - K_{t+1} \right)\right\} and then pose the following min-max problem: .. math:: - :label: min-max-prob + :label: min-max-prob - \min_{\vec{\mu}} \max_{\vec{C},\vec{K}} \mathcal{L}(\vec{C},\vec{K},\vec{\mu} ) + \min_{\vec{\mu}} \max_{\vec{C},\vec{K}} \mathcal{L}(\vec{C},\vec{K},\vec{\mu} ) - **Extremization** means maximization with respect to :math:`\vec{C}, \vec{K}` and - minimization with respect to :math:`\vec{\mu}`. + minimization with respect to :math:`\vec{\mu}`. + - - Our problem satisfies conditions that assure that required second-order conditions are satisfied at an allocation that satisfies the - first-order conditions that we are about to compute. 
+ first-order conditions that we are about to compute. -Before computing first-order conditions, we present some handy formulas. +Before computing first-order conditions, we present some handy formulas. @@ -173,13 +174,13 @@ Notice that .. math:: - F(K_t,N_t) = A K_t^\alpha N_t^{1-\alpha} = N_t A\left(\frac{K_t}{N_t}\right)^\alpha + F(K_t,N_t) = A K_t^\alpha N_t^{1-\alpha} = N_t A\left(\frac{K_t}{N_t}\right)^\alpha Define the **output per-capita production function** .. math:: - \frac{F(K_t,N_t)}{N_t} \equiv f\left(\frac{K_t}{N_t}\right) = A\left(\frac{K_t}{N_t}\right)^\alpha + \frac{F(K_t,N_t)}{N_t} \equiv f\left(\frac{K_t}{N_t}\right) = A\left(\frac{K_t}{N_t}\right)^\alpha whose argument is **capital per-capita**. @@ -188,34 +189,34 @@ It is useful to recall the following calculations for the marginal product of ca .. math:: - :label: useful-calc1 - - \begin{aligned} - \frac{\partial F(K_t,N_t)}{\partial K_t} - & = - \frac{\partial N_t f\left( \frac{K_t}{N_t}\right)}{\partial K_t} - \\ &= - N_t f'\left(\frac{K_t}{N_t}\right)\frac{1}{N_t} \quad \text{(Chain rule)} - \\ &= - f'\left.\left(\frac{K_t}{N_t}\right)\right|_{N_t=1} - \\ &= f'(K_t) - \end{aligned} + :label: useful-calc1 + + \begin{aligned} + \frac{\partial F(K_t,N_t)}{\partial K_t} + & = + \frac{\partial N_t f\left( \frac{K_t}{N_t}\right)}{\partial K_t} + \\ &= + N_t f'\left(\frac{K_t}{N_t}\right)\frac{1}{N_t} \quad \text{(Chain rule)} + \\ &= + f'\left.\left(\frac{K_t}{N_t}\right)\right|_{N_t=1} + \\ &= f'(K_t) + \end{aligned} and the marginal product of labor .. 
math:: - \begin{aligned} - \frac{\partial F(K_t,N_t)}{\partial N_t} - &= - \frac{\partial N_t f\left( \frac{K_t}{N_t}\right)}{\partial N_t} \quad \text{(Product rule)} - \\ &= - f\left(\frac{K_t}{N_t}\right){+} N_t f'\left(\frac{K_t}{N_t}\right) \frac{-K_t}{N_t^2} \quad \text{(Chain rule)} - \\ &= - f\left(\frac{K_t}{N_t}\right){-}\frac{K_t}{N_t}f'\left.\left(\frac{K_t}{N_t}\right)\right|_{N_t=1} - \\ &= - f(K_t) - f'(K_t) K_t - \end{aligned} + \begin{aligned} + \frac{\partial F(K_t,N_t)}{\partial N_t} + &= + \frac{\partial N_t f\left( \frac{K_t}{N_t}\right)}{\partial N_t} \quad \text{(Product rule)} + \\ &= + f\left(\frac{K_t}{N_t}\right){+} N_t f'\left(\frac{K_t}{N_t}\right) \frac{-K_t}{N_t^2} \quad \text{(Chain rule)} + \\ &= + f\left(\frac{K_t}{N_t}\right){-}\frac{K_t}{N_t}f'\left.\left(\frac{K_t}{N_t}\right)\right|_{N_t=1} + \\ &= + f(K_t) - f'(K_t) K_t + \end{aligned} First-order necessary conditions @@ -245,9 +246,8 @@ We now compute **first order necessary conditions** for extremization of the Lag K_{T+1}: \qquad -\mu_T \leq 0, \ \leq 0 \text{ if } K_{T+1}=0; \ =0 \text{ if } K_{T+1}>0 -In computing :eq:`constraint3` we recognize that -of :math:`K_t` appears in both the time :math:`t` and time :math:`t-1` -feasibility constraints. +In computing :eq:`constraint3` we recognize that :math:`K_t` appears +in both the time :math:`t` and time :math:`t-1` feasibility constraints. :eq:`constraint4` comes from differentiating with respect to :math:`K_{T+1}` and applying the following **Karush-Kuhn-Tucker condition** (KKT) @@ -263,6 +263,7 @@ to :math:`K_{T+1}` and applying the following **Karush-Kuhn-Tucker condition** ( Combining :eq:`constraint1` and :eq:`constraint2` gives .. math:: + u'\left(C_t\right)\left[(1-\delta)+f'\left(K_t\right)\right]-u'\left(C_{t-1}\right)=0 \quad \text{ for all } t=1,2,\dots, T+1 @@ -278,16 +279,17 @@ Applying the inverse of the utility function on both sides of the above equation gives .. 
math:: - C_{t+1} =u'^{-1}\left(\left(\frac{\beta}{u'(C_t)}[f'(K_{t+1}) +(1-\delta)]\right)^{-1}\right) + + C_{t+1} =u'^{-1}\left(\left(\frac{\beta}{u'(C_t)}[f'(K_{t+1}) +(1-\delta)]\right)^{-1}\right) which for our utility function :eq:`utility-oneperiod` becomes the consumption **Euler equation** .. math:: - \begin{aligned} C_{t+1} =\left(\beta C_t^{\gamma}[f'(K_{t+1}) + - (1-\delta)]\right)^{1/\gamma} \notag\\= C_t\left(\beta [f'(K_{t+1}) + - (1-\delta)]\right)^{1/\gamma} \end{aligned} + \begin{aligned} C_{t+1} =\left(\beta C_t^{\gamma}[f'(K_{t+1}) + + (1-\delta)]\right)^{1/\gamma} \notag\\= C_t\left(\beta [f'(K_{t+1}) + + (1-\delta)]\right)^{1/\gamma} \end{aligned} Below we define a ``jitclass`` that stores parameters and functions that define our economy. @@ -381,29 +383,29 @@ We use **shooting** to compute an optimal allocation :math:`\vec{C}, \vec{K}` and an associated Lagrange multiplier sequence :math:`\vec{\mu}`. -The first-order necessary conditions +The first-order necessary conditions :eq:`constraint1`, :eq:`constraint2`, and :eq:`constraint3` for the planning problem form a system of **difference equations** with two boundary conditions: -- :math:`K_0` is a given **initial condition** for capital +- :math:`K_0` is a given **initial condition** for capital -- :math:`K_{T+1} =0` is a **terminal condition** for capital that we - deduced from the first-order necessary condition for :math:`K_{T+1}` - the KKT condition :eq:`kkt` +- :math:`K_{T+1} =0` is a **terminal condition** for capital that we + deduced from the first-order necessary condition for :math:`K_{T+1}` + the KKT condition :eq:`kkt` We have no initial condition for the Lagrange multiplier :math:`\mu_0`. If we did, our job would be easy: -- Given :math:`\mu_0` and :math:`k_0`, we could compute :math:`c_0` from - equation :eq:`constraint1` and then :math:`k_1` from equation - :eq:`constraint3` and :math:`\mu_1` from equation - :eq:`constraint2`. 
+- Given :math:`\mu_0` and :math:`k_0`, we could compute :math:`c_0` from + equation :eq:`constraint1` and then :math:`k_1` from equation + :eq:`constraint3` and :math:`\mu_1` from equation + :eq:`constraint2`. -- We could continue in this way to compute the remaining elements of - :math:`\vec{C}, \vec{K}, \vec{\mu}`. +- We could continue in this way to compute the remaining elements of + :math:`\vec{C}, \vec{K}, \vec{\mu}`. But we don't have an initial condition for :math:`\mu_0`, so this won't work. @@ -417,24 +419,24 @@ It is called the **shooting algorithm**. It is an instance of a **guess and verify** algorithm that consists of the following steps: -- Guess an initial Lagrange multiplier :math:`\mu_0`. +- Guess an initial Lagrange multiplier :math:`\mu_0`. -- Apply the **simple algorithm** described above. +- Apply the **simple algorithm** described above. -- Compute :math:`k_{T+1}` and check whether it - equals zero. +- Compute :math:`k_{T+1}` and check whether it + equals zero. -- If :math:`K_{T+1} =0`, we have solved the problem. +- If :math:`K_{T+1} =0`, we have solved the problem. -- If :math:`K_{T+1} > 0`, lower :math:`\mu_0` and try again. +- If :math:`K_{T+1} > 0`, lower :math:`\mu_0` and try again. -- If :math:`K_{T+1} < 0`, raise :math:`\mu_0` and try again. +- If :math:`K_{T+1} < 0`, raise :math:`\mu_0` and try again. The following Python code implements the shooting algorithm for the planning problem. -We actually modify the algorithm slightly by starting with a guess for -:math:`c_0` instead of :math:`\mu_0` in the following code. +We actually modify the algorithm slightly by starting with a guess for +:math:`c_0` instead of :math:`\mu_0` in the following code. .. code-block:: python3 @@ -516,7 +518,7 @@ If :math:`K_{T+1}<0`, we take it to be our new **upper** bound. Make a new guess for :math:`C_0` that is halfway between our new upper and lower bounds. -Shoot forward again, iterating on these steps until we converge. 
+Shoot forward again, iterating on these steps until we converge. When :math:`K_{T+1}` gets close enough to :math:`0` (i.e., within an error tolerance bounds), we stop. @@ -616,29 +618,41 @@ Evalauating the feasibility constraint :eq:`allocation` at :math:`\bar K` gives Substituting :math:`K_t = \bar K` and :math:`C_t=\bar C` for all :math:`t` into :eq:`l12` gives -.. math:: 1=\beta \frac{u'(\bar{C})}{u'(\bar{C})}[f'(\bar{K})+(1-\delta)] +.. math:: + + 1=\beta \frac{u'(\bar{C})}{u'(\bar{C})}[f'(\bar{K})+(1-\delta)] Defining :math:`\beta = \frac{1}{1+\rho}`, and cancelling gives -.. math:: 1+\rho = 1[f'(\bar{K}) + (1-\delta)] +.. math:: + + 1+\rho = 1[f'(\bar{K}) + (1-\delta)] Simplifying gives -.. math:: f'(\bar{K}) = \rho +\delta +.. math:: + + f'(\bar{K}) = \rho +\delta and -.. math:: \bar{K} = f'^{-1}(\rho+\delta) +.. math:: + + \bar{K} = f'^{-1}(\rho+\delta) For the production function :eq:`production-function` this becomes -.. math:: \alpha \bar{K}^{\alpha-1} = \rho + \delta +.. math:: + + \alpha \bar{K}^{\alpha-1} = \rho + \delta As an example, after setting :math:`\alpha= .33`, :math:`\rho = 1/\beta-1 =1/(19/20)-1 = 20/19-19/19 = 1/19`, :math:`\delta = 1/50`, we get -.. math:: \bar{K} = \left(\frac{\frac{33}{100}}{\frac{1}{50}+\frac{1}{19}}\right)^{\frac{67}{100}} \approx 9.57583 +.. math:: + + \bar{K} = \left(\frac{\frac{33}{100}}{\frac{1}{50}+\frac{1}{19}}\right)^{\frac{67}{100}} \approx 9.57583 Let's verify this with Python and then use this steady state :math:`\bar K` as our initial capital stock :math:`K_0`. @@ -677,8 +691,8 @@ The following graphs compare optimal outcomes as we vary :math:`T`. plot_paths(pp, 0.3, k_ss/3, [150, 75, 50, 25], k_ss=k_ss); -A Turnpike Property -====================== +A Turnpike Property +=================== The following calculation indicates that when :math:`T` is very large, the optimal capital stock stays close to @@ -688,7 +702,7 @@ its steady state value most of the time. 
plot_paths(pp, 0.3, k_ss/3, [250, 150, 50, 25], k_ss=k_ss); -Different colors in the above graphs are associated +Different colors in the above graphs are associated with different horizons :math:`T`. Notice that as the horizon increases, the planner puts :math:`K_t` @@ -698,8 +712,8 @@ This pattern reflects a **turnpike** property of the steady state. A rule of thumb for the planner is -- from :math:`K_0`, push :math:`K_t` toward - the steady state and stay close to the steady state until time approaches :math:`T`. +- from :math:`K_0`, push :math:`K_t` toward + the steady state and stay close to the steady state until time approaches :math:`T`. The planner accomplishes this by adjusting the saving rate :math:`\frac{f(K_t) - C_t}{f(K_t)}` @@ -784,7 +798,7 @@ state. Since :math:`K_0<\bar K`, :math:`f'(K_0)>\rho +\delta`. The planner chooses a positive saving rate that is higher than the steady state -saving rate. +saving rate. Note, :math:`f''(K)<0`, so as :math:`K` rises, :math:`f'(K)` declines. @@ -792,13 +806,13 @@ The planner slowly lowers the saving rate until reaching a steady state in which :math:`f'(K)=\rho +\delta`. Exercise ---------- +-------- -- Plot the optimal consumption, capital, and saving paths when the - initial capital level begins at 1.5 times the steady state level - as we shoot towards the steady state at :math:`T=130`. +- Plot the optimal consumption, capital, and saving paths when the + initial capital level begins at 1.5 times the steady state level + as we shoot towards the steady state at :math:`T=130`. -- Why does the saving rate respond as it does? +- Why does the saving rate respond as it does? Solution ---------- @@ -816,7 +830,7 @@ technology and preference structure as deployed here. In that lecture, we replace the planner of this lecture with Adam Smith's **invisible hand** In place of quantity choices made by the planner, there are market prices somewhat produced by -the invisible hand. +the invisible hand. 
Market prices must adjust to reconcile distinct decisions that are made independently by a representative household and a representative firm. diff --git a/source/rst/cass_koopmans_2.rst b/source/rst/cass_koopmans_2.rst index d5d133a..1a853cf 100644 --- a/source/rst/cass_koopmans_2.rst +++ b/source/rst/cass_koopmans_2.rst @@ -5,14 +5,14 @@ .. highlight:: python3 -************************************** -Cass-Koopmans Competitive Equilibrium -************************************** +************************************* +Cass-Koopmans Competitive Equilibrium +************************************* .. contents:: :depth: 2 Overview -========= +======== This lecture continues our analysis in this lecture :doc:`Cass-Koopmans Planning Model ` about the model that Tjalling Koopmans :cite:`Koopmans` @@ -25,50 +25,48 @@ organized as a **competitive equilibrium**. The earlier lecture :doc:`Cass-Koopmans Planning Model ` studied a planning problem and used ideas including -- A min-max problem for solving the planning problem. +- A min-max problem for solving the planning problem. -- A **shooting algorithm** for solving difference equations subject - to initial and terminal conditions. +- A **shooting algorithm** for solving difference equations subject + to initial and terminal conditions. -- A **turnpike** property that describes optimal paths for - long-but-finite horizon economies. +- A **turnpike** property that describes optimal paths for + long-but-finite horizon economies. The present lecture uses additional ideas including -- Hicks-Arrow prices named after John R. Hicks and Kenneth Arrow. +- Hicks-Arrow prices named after John R. Hicks and Kenneth Arrow. -- A connection between some Lagrange multipliers in the min-max - problem and the Hicks-Arrow prices. +- A connection between some Lagrange multipliers in the min-max + problem and the Hicks-Arrow prices. -- A **Big** :math:`K` **, little** :math:`k` trick widely used in - macroeconomic dynamics. 
+- A **Big** :math:`K` **, little** :math:`k` trick widely used in + macroeconomic dynamics. + * We shall encounter this trick in `this lecture `__ + and also in `this lecture `__. - * We shall encounter this trick in `this lecture `__ - and also in `this lecture `__. +- A non-stochastic version of a theory of the **term structure of + interest rates**. -- A non-stochastic version of a theory of the **term structure of - interest rates**. - -- An intimate connection between the cases for the optimality of two - competing visions of good ways to organize an economy, namely: - - * **socialism** in which a central planner commands the - allocation of resources, and - - * **capitalism** (also known as **a market economy**) in - which competitive equilibrium **prices** induce individual - consumers and producers to choose a socially optimal allocation - as an unintended consequence of their selfish - decisions +- An intimate connection between the cases for the optimality of two + competing visions of good ways to organize an economy, namely: + * **socialism** in which a central planner commands the + allocation of resources, and + * **capitalism** (also known as **a market economy**) in + which competitive equilibrium **prices** induce individual + consumers and producers to choose a socially optimal allocation + as an unintended consequence of their selfish + decisions Let's start with some standard imports: .. code-block:: ipython - from numba import njit, jitclass, float64 + from numba import njit, float64 + from numba.experimental import jitclass import numpy as np import matplotlib.pyplot as plt %matplotlib inline @@ -105,7 +103,7 @@ The representative household has preferences over consumption bundles ordered by the utility functional: .. math:: - + U(\vec{C}) = \sum_{t=0}^{T} \beta^t \frac{C_t^{1-\gamma}}{1-\gamma} where :math:`\beta \in (0,1)` is a discount factor and :math:`\gamma >0` @@ -116,16 +114,16 @@ We assume that :math:`K_0 > 0`. 
There is an economy-wide production function .. math:: - - F(K_t,N_t) = A K_t^{\alpha}N_t^{1-\alpha} + + F(K_t,N_t) = A K_t^{\alpha}N_t^{1-\alpha} with :math:`0 < \alpha<1`, :math:`A > 0`. A feasible allocation :math:`\vec{C}, \vec{K}` satisfies .. math:: - - C_t + K_{t+1} \leq F(K_t,N_t) + (1-\delta) K_t, \quad \text{for all } t \in [0, T] + + C_t + K_{t+1} \leq F(K_t,N_t) + (1-\delta) K_t, \quad \text{for all } t \in [0, T] where :math:`\delta \in (0,1)` is a depreciation rate of capital. @@ -138,7 +136,6 @@ maximize :eq:`utility-functional` subject to :eq:`allocation`. The allocation that solves the planning problem plays an important role in a competitive equilibrium as we shall see below. - Competitive Equilibrium ======================== @@ -152,41 +149,39 @@ But now there is no planner. Market prices adjust to reconcile distinct decisions that are made separately by a representative household and a representative firm. - - There is a representative consumer who has the same preferences over consumption plans as did the consumer in the planned economy. Instead of being told what to consume and save by a planner, the household chooses for itself subject to a budget constraint -- At each time :math:`t`, the household receives wages and rentals - of capital from a firm -- these comprise its **income** at - time :math:`t`. +- At each time :math:`t`, the household receives wages and rentals + of capital from a firm -- these comprise its **income** at + time :math:`t`. -- The consumer decides how much income to allocate to consumption or - to savings. +- The consumer decides how much income to allocate to consumption or + to savings. -- The household can save either by acquiring additional physical - capital (it trades one for one with time :math:`t` consumption) - or by acquiring claims on consumption at dates other - than :math:`t`. 
+- The household can save either by acquiring additional physical + capital (it trades one for one with time :math:`t` consumption) + or by acquiring claims on consumption at dates other + than :math:`t`. -- The household owns all physical capital and labor - and rents them to the firm. +- The household owns all physical capital and labor + and rents them to the firm. -- The household consumes, supplies labor, and invests in physical - capital. +- The household consumes, supplies labor, and invests in physical + capital. -- A profit-maximizing representative firm operates the production - technology. +- A profit-maximizing representative firm operates the production + technology. -- The firm rents labor and capital each period from the - representative household and sells its output each period to the - household. +- The firm rents labor and capital each period from the + representative household and sells its output each period to the + household. -- The representative household and the representative firm are both - **price takers** who believe that prices are not affected by their choices +- The representative household and the representative firm are both + **price takers** who believe that prices are not affected by their choices **Note:** We can think of there being a large number :math:`M` of identical representative consumers and :math:`M` @@ -194,8 +189,8 @@ identical representative firms. -Market Structure -================= +Market Structure +================ The representative household and the representative firm are both price takers. @@ -209,13 +204,13 @@ all other dates :math:`t=1, 2, \ldots, T`. Prices --------------- +------ -There are sequences of prices +There are sequences of prices :math:`\{w_t,\eta_t\}_{t=0}^T= \{\vec{w}, \vec{\eta} \}` where :math:`w_t` is a wage or rental rate for labor at time :math:`t` and -:math:`eta_t` is a rental rate for capital at time :math:`t`. +:math:`\eta_t` is a rental rate for capital at time :math:`t`. 
In addition there is are intertemporal prices that work as follows. @@ -227,37 +222,42 @@ named after the 1972 economics Nobel prize winners. Evidently, -.. math:: q^0_t=\frac{\text{number of time 0 goods}}{\text{number of time t goods}} +.. math:: + + q^0_t=\frac{\text{number of time 0 goods}}{\text{number of time t goods}} Because :math:`q^0_t` is a **relative price**, the units in terms of which prices are quoted are arbitrary -- we are free to normalize them. - - - Firm Problem -============== +============ At time :math:`t` a representative firm hires labor :math:`\tilde n_t` and capital :math:`\tilde k_t`. The firm's profits at time :math:`t` are -.. math:: F(\tilde k_t, \tilde n_t)-w_t \tilde n_t -\eta_t \tilde k_t +.. math:: + + F(\tilde k_t, \tilde n_t)-w_t \tilde n_t -\eta_t \tilde k_t where :math:`w_t` is a wage rate at :math:`t` and :math:`\eta_t` is the rental rate on capital at :math:`t`. As in the planned economy model -.. math:: F(\tilde k_t, \tilde n_t) = A \tilde k_t^\alpha \tilde n_t^{1-\alpha} +.. math:: + + F(\tilde k_t, \tilde n_t) = A \tilde k_t^\alpha \tilde n_t^{1-\alpha} Zero Profit Conditions ------------------------ +---------------------- Zero-profits condition for capital and labor are -.. math:: F_k(\tilde k_t, \tilde n_t) =\eta_t +.. math:: + + F_k(\tilde k_t, \tilde n_t) =\eta_t and @@ -274,7 +274,9 @@ Euler about linearly homogenous functions. The theorem applies to the Cobb-Douglas production function because it assumed displays constant returns to scale: -.. math:: \alpha F(\tilde k_t, \tilde n_t) = F(\alpha \tilde k_t, \alpha \tilde n_t) +.. math:: + + \alpha F(\tilde k_t, \tilde n_t) = F(\alpha \tilde k_t, \alpha \tilde n_t) for :math:`\alpha \in (0,1)`. @@ -284,22 +286,22 @@ above equation gives .. 
math:: - F(\tilde k_t,\tilde n_t) =_\text{chain rule} \frac{\partial F}{\partial \tilde k_t} - \tilde k_t + \frac{\partial F}{\partial \tilde n_t} \tilde n_t + F(\tilde k_t,\tilde n_t) =_\text{chain rule} \frac{\partial F}{\partial \tilde k_t} + \tilde k_t + \frac{\partial F}{\partial \tilde n_t} \tilde n_t Rewrite the firm's profits as .. math:: - \frac{\partial F}{\partial \tilde k_t} \tilde k_t + - \frac{\partial F}{\partial \tilde n_t} \tilde n_t-w_t \tilde n_t -\eta_t k_t + \frac{\partial F}{\partial \tilde k_t} \tilde k_t + + \frac{\partial F}{\partial \tilde n_t} \tilde n_t-w_t \tilde n_t -\eta_t k_t or .. math:: - \left(\frac{\partial F}{\partial \tilde k_t}-\eta_t\right) \tilde k_t + - \left(\frac{\partial F}{\partial \tilde n_t}-w_t\right) \tilde n_t + \left(\frac{\partial F}{\partial \tilde k_t}-\eta_t\right) \tilde k_t + + \left(\frac{\partial F}{\partial \tilde n_t}-w_t\right) \tilde n_t Because :math:`F` is homogeneous of degree :math:`1`, it follows that :math:`\frac{\partial F}{\partial \tilde k_t}` and @@ -326,19 +328,23 @@ It is convenient to define :math:`\vec{w} =\{w_0, \dots,w_T\}`\ and :math:`\vec{\eta}= \{\eta_0, \dots, \eta_T\}`. Household Problem -=================== +================= A representative household lives at :math:`t=0,1,\dots, T`. At :math:`t`, the household rents :math:`1` unit of labor and :math:`k_t` units of capital to a firm and receives income -.. math:: w_t 1+ \eta_t k_t +.. math:: + + w_t 1+ \eta_t k_t At :math:`t` the household allocates its income to the following purchases -.. math:: \left(c_t + (k_{t+1} -(1-\delta)k_t)\right) +.. math:: + + \left(c_t + (k_{t+1} -(1-\delta)k_t)\right) Here :math:`\left(k_{t+1} -(1-\delta)k_t\right)` is the household's net investment in physical capital and :math:`\delta \in (0,1)` is @@ -352,7 +358,9 @@ exceeds its purchases. A household's net excess demand for time :math:`t` consumption goods is the gap -.. 
math:: e_t \equiv \left(c_t + (k_{t+1} -(1-\delta)k_t)\right)-(w_t 1 + \eta_t k_t) +.. math:: + + e_t \equiv \left(c_t + (k_{t+1} -(1-\delta)k_t)\right)-(w_t 1 + \eta_t k_t) Let :math:`\vec{c} = \{c_0,\dots,c_T\}` and let :math:`\vec{k} = \{k_1,\dots,k_{T+1}\}`. @@ -364,50 +372,52 @@ that states that the present value of the household's net excess demands must be zero: .. math:: + \sum_{t=0}^T q^0_t e_t \leq 0 or -.. math:: \sum_{t=0}^T q^0_t \left(c_t + (k_{t+1} -(1-\delta)k_t)-(w_t 1 + \eta_t k_t) \right) \leq 0 - +.. math:: + \sum_{t=0}^T q^0_t \left(c_t + (k_{t+1} -(1-\delta)k_t)-(w_t 1 + \eta_t k_t) \right) \leq 0 -The household chooses an allocation to solve the constrained optimization problem: -.. math:: \begin{aligned}& \max_{\vec{c}, \vec{k} } \sum_{t=0}^T \beta^t u(c_t) \\ \text{subject to} \ \ & \sum_{t=0}^T q_t^0\left(c_t +\left(k_{t+1}-(1-\delta) k_t\right) -w_t -\eta_t k_t\right) \leq 0 \notag \end{aligned} +The household chooses an allocation to solve the constrained optimization problem: +.. math:: + \begin{aligned}& \max_{\vec{c}, \vec{k} } \sum_{t=0}^T \beta^t u(c_t) \\ \text{subject to} \ \ & \sum_{t=0}^T q_t^0\left(c_t +\left(k_{t+1}-(1-\delta) k_t\right) -w_t -\eta_t k_t\right) \leq 0 \notag \end{aligned} Definitions ------------- +----------- -- A **price system** is a sequence - :math:`\{q_t^0,\eta_t,w_t\}_{t=0}^T= \{\vec{q}, \vec{\eta}, \vec{w}\}`. +- A **price system** is a sequence + :math:`\{q_t^0,\eta_t,w_t\}_{t=0}^T= \{\vec{q}, \vec{\eta}, \vec{w}\}`. -- An **allocation** is a sequence - :math:`\{c_t,k_{t+1},n_t=1\}_{t=0}^T = \{\vec{c}, \vec{k}, \vec{n}\}`. +- An **allocation** is a sequence + :math:`\{c_t,k_{t+1},n_t=1\}_{t=0}^T = \{\vec{c}, \vec{k}, \vec{n}\}`. -- A **competitive equilibrium** is a price system and an allocation - for which +- A **competitive equilibrium** is a price system and an allocation + for which - - Given the price system, the allocation solves the household's - problem. 
+ - Given the price system, the allocation solves the household's + problem. - - Given the price system, the allocation solves the firm's - problem. + - Given the price system, the allocation solves the firm's + problem. Computing a Competitive Equilibrium -==================================== +=================================== We compute a competitive equilibrium by using a **guess and verify** approach. -- We **guess** equilibrium price sequences - :math:`\{\vec{q}, \vec{\eta}, \vec{w}\}`. +- We **guess** equilibrium price sequences + :math:`\{\vec{q}, \vec{\eta}, \vec{w}\}`. -- We then **verify** that at those prices, the household and - the firm choose the same allocation. +- We then **verify** that at those prices, the household and + the firm choose the same allocation. Guess for Price System ----------------------- @@ -415,7 +425,7 @@ Guess for Price System In this lecture :doc:`Cass-Koopmans Planning Model `, we computed an allocation :math:`\{\vec{C}, \vec{K}, \vec{N}\}` that solves the planning problem. -(This allocation will constitute the **Big** :math:`K` to be in the presence instance of the *Big** :math:`K` **, little** :math:`k` trick +(This allocation will constitute the **Big** :math:`K` to be in the present instance of the **Big** :math:`K` **, little** :math:`k` trick that we'll apply to a competitive equilibrium in the spirit of `this lecture `__ and `this lecture `__.) @@ -449,7 +459,9 @@ At these prices, let the capital chosen by the household be and let the allocation chosen by the firm be -.. math:: \tilde k^*_t(\vec{q}, \vec{w}, \vec{\eta}), \quad t \geq 0 +.. math:: + + \tilde k^*_t(\vec{q}, \vec{w}, \vec{\eta}), \quad t \geq 0 and so on. @@ -481,7 +493,7 @@ the planning problem: k^*_t = \tilde k^*_t=K_t, \tilde n_t=1, c^*_t=C_t Verification Procedure ------------------------ +---------------------- Our approach is to stare at first-order necessary conditions for the optimization problems of the household and the firm. 
-.. math:: \beta^t \mu_t [(1-\delta+f'(K_t)] = \beta^{t-1} \mu_{t-1}
+.. math::
+
+   \beta^t \mu_t [(1-\delta)+f'(K_t)] = \beta^{t-1} \mu_{t-1}
 
 or
 
-.. math:: \beta \mu_t [(1-\delta+f'(K_t)] = \mu_{t-1}
+.. math::
+
+   \beta \mu_t [(1-\delta)+f'(K_t)] = \mu_{t-1}
+.. math::
+
+   \frac{\partial F(\tilde K_t, 1)}{\partial \tilde n_t} = f(K_t)-f'(K_t)K_t=w_t
- We will plot :math:`q, w, \eta` below to show these equilibrium prices induce the same aggregate movements that we saw earlier in the planning problem. - To proceed, we bring in Python code that :doc:`Cass-Koopmans Planning Model ` used to solve the planning problem First let's define a ``jitclass`` that stores parameters and functions @@ -831,9 +861,9 @@ Now we calculate and plot for each :math:`T` for i, ax in enumerate(axs.flatten()): ax.plot(paths[i]) ax.set(title=titles[i], ylabel=ylabels[i], xlabel='t') - if titles[i] is 'Capital': + if titles[i] == 'Capital': ax.axhline(k_ss, lw=1, ls='--', c='k') - if titles[i] is 'Consumption': + if titles[i] == 'Consumption': ax.axhline(c_ss, lw=1, ls='--', c='k') plt.tight_layout() @@ -842,7 +872,6 @@ Now we calculate and plot for each :math:`T` Varying Curvature ^^^^^^^^^^^^^^^^^^ - Now we see how our results change if we keep :math:`T` constant, but allow the curvature parameter, :math:`\gamma` to vary, starting with :math:`K_0` below the steady state. @@ -869,9 +898,9 @@ We plot the results for :math:`T=150` for i, ax in enumerate(axs.flatten()): ax.plot(paths[i], label=f'$\gamma = {γ}$') ax.set(title=titles[i], ylabel=ylabels[i], xlabel='t') - if titles[i] is 'Capital': + if titles[i] == 'Capital': ax.axhline(k_ss, lw=1, ls='--', c='k') - if titles[i] is 'Consumption': + if titles[i] == 'Consumption': ax.axhline(c_ss, lw=1, ls='--', c='k') axs[0, 0].legend() @@ -887,12 +916,12 @@ resulting in slower adjustments to the steady state allocations. Vice-versa for lower :math:`\gamma`. -Yield Curves and Hicks-Arrow Prices -========================================== +Yield Curves and Hicks-Arrow Prices +=================================== We return to Hicks-Arrow prices and calculate how they are related to **yields** on loans of alternative maturities. -This will let us plot a **yield curve** that graphs yields on bonds of maturities :math:`j=1, 2, \ldots` against :math:j=1,2, \ldots`. 
+This will let us plot a **yield curve** that graphs yields on bonds of maturities :math:`j=1, 2, \ldots` against :math:`j=1,2, \ldots`. The formulas we want are: @@ -962,4 +991,4 @@ Now we plot when :math:`t_0=20` plot_yield_curves(pp, 20, 0.3, k_ss/3, T_arr) We aim to have more to say about the term structure of interest rates -in a planned lecture on the topic. +in a planned lecture on the topic. diff --git a/source/rst/coleman_policy_iter.rst b/source/rst/coleman_policy_iter.rst index 7e8b382..7304ad6 100644 --- a/source/rst/coleman_policy_iter.rst +++ b/source/rst/coleman_policy_iter.rst @@ -13,7 +13,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation @@ -54,7 +54,8 @@ Let's start with some imports: from interpolation import interp from quantecon.optimize import brentq - from numba import njit, jitclass, float64 + from numba import njit, float64 + from numba.experimental import jitclass The Euler Equation diff --git a/source/rst/complex_and_trig.rst b/source/rst/complex_and_trig.rst index 8a57deb..50908c6 100644 --- a/source/rst/complex_and_trig.rst +++ b/source/rst/complex_and_trig.rst @@ -305,7 +305,7 @@ condition: print(f'ω = {ω:1.3f}') # Solve for p - eq2 = Eq(x0 - 2 * p * cos(ω)) + eq2 = Eq(x0 - 2 * p * cos(ω), 0) p = nsolve(eq2, p, 0) p = np.float(p) print(f'p = {p:1.3f}') diff --git a/source/rst/egm_policy_iter.rst b/source/rst/egm_policy_iter.rst index 8e9430d..e2cbb52 100644 --- a/source/rst/egm_policy_iter.rst +++ b/source/rst/egm_policy_iter.rst @@ -13,7 +13,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation Overview @@ -39,7 +39,8 @@ Let's start with some standard imports: import numpy as np import quantecon as qe from interpolation import interp - from numba import jitclass, njit, float64 + from numba import njit, float64 + from numba.experimental import jitclass from quantecon.optimize import brentq import matplotlib.pyplot as plt %matplotlib inline diff --git a/source/rst/exchangeable.rst b/source/rst/exchangeable.rst index d0a4b8c..fd9f132 100644 --- a/source/rst/exchangeable.rst +++ b/source/rst/exchangeable.rst @@ -15,19 +15,19 @@ Exchangeability and Bayesian Updating Overview ========= -This lecture studies an example of learning +This lecture studies an example of learning via Bayes' Law. We touch on foundations of Bayesian statistical inference invented by Bruno DeFinetti :cite:`definetti`. -The relevance of DeFinetti's work for economists is presented forcefully +The relevance of DeFinetti's work for economists is presented forcefully in chapter 11 of :cite:`Kreps88` by David Kreps. -The example that we study in this lecture is a key component of :doc:`this lecture ` that augments the +The example that we study in this lecture is a key component of :doc:`this lecture ` that augments the :doc:`classic ` job search model of McCall -:cite:`McCall1970` by presenting an unemployed worker with a statistical inference problem. +:cite:`McCall1970` by presenting an unemployed worker with a statistical inference problem. -Here we create graphs that illustrate the role that a likelihood ratio +Here we create graphs that illustrate the role that a likelihood ratio plays in Bayes' Law. 
We'll use such graphs to provide insights into the mechanics driving outcomes in :doc:`this lecture ` about learning in an augmented McCall job @@ -40,18 +40,18 @@ that are - exchangeable -Understanding the distinction between these concepts is essential for appreciating how Bayesian updating +Understanding the distinction between these concepts is essential for appreciating how Bayesian updating works in our example. -You can read about exchangeability `here `__ +You can read about exchangeability `here `__. -Below, we'll often use +Below, we'll often use - :math:`W` to denote a random variable -- :math:`w` to denote a particular realization of a random variable :math:`W` +- :math:`w` to denote a particular realization of a random variable :math:`W` Let’s start with some imports: @@ -67,7 +67,7 @@ Let’s start with some imports: %matplotlib inline -Independently and Identically Distributed +Independently and Identically Distributed ========================================== We begin by looking at the notion of an **independently and identically distributed sequence** of random variables. @@ -77,74 +77,72 @@ An independently and identically distributed sequence is often abbreviated as II Two notions are involved, **independently** and **identically** distributed. A sequence :math:`W_0, W_1, \ldots` is **independently distributed** if the joint probability density -of the sequence is the **product** of the densities of the components of the sequence. +of the sequence is the **product** of the densities of the components of the sequence. The sequence :math:`W_0, W_1, \ldots` is **independently and identically distributed** if in addition the marginal -density of :math:`W_t` is the same for all :math:`t =0, 1, \ldots`. +density of :math:`W_t` is the same for all :math:`t =0, 1, \ldots`. 
For example, let :math:`p(W_0, W_1, \ldots)` be the **joint density** of the sequence and let :math:`p(W_t)` be the **marginal density** for a particular :math:`W_t` for all :math:`t =0, 1, \ldots`. -Then the joint density of the sequence :math:`W_0, W_1, \ldots` is IID if +Then the joint density of the sequence :math:`W_0, W_1, \ldots` is IID if -.. math:: p(W_0, W_1, \ldots) = p(W_0) p(W_1) \cdots +.. math:: p(W_0, W_1, \ldots) = p(W_0) p(W_1) \cdots so that the joint density is the product of a sequence of identical marginal densities. -IID Means Past Observations Don't Tell Us Anything About Future Observations +IID Means Past Observations Don't Tell Us Anything About Future Observations ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If a sequence is random variables is IID, past information provides no information about future realizations. -In this sense, there is **nothing to learn** about the future from the past. - - +In this sense, there is **nothing to learn** about the future from the past. To understand these statements, let the joint distribution of a sequence of random variables :math:`\{W_t\}_{t=0}^T` that is not necessarily IID, be -.. math:: p(W_T, W_{T-1}, \ldots, W_1, W_0) +.. math:: + + p(W_T, W_{T-1}, \ldots, W_1, W_0) Using the laws of probability, we can always factor such a joint density into a product of conditional densities: .. math:: - \begin{align} - p(W_T, W_{T-1}, \ldots, W_1, W_0) = & p(W_T | W_{t-1}, \ldots, W_0) p(W_{T-1} | W_{T-2}, \ldots, W_0) \cdots \cr - & p(W_1 | W_0) p(W_0) - \end{align} + \begin{aligned} + p(W_T, W_{T-1}, \ldots, W_1, W_0) = & p(W_T | W_{T-1}, \ldots, W_0) p(W_{T-1} | W_{T-2}, \ldots, W_0) \cdots \cr + & \quad \quad \cdots p(W_1 | W_0) p(W_0) + \end{aligned} -In general, +In general, -.. math:: p(W_t | W_{t-1}, \ldots, W_0) \neq p(W_t) +.. 
math:: p(W_t | W_{t-1}, \ldots, W_0) \neq p(W_t) which states that the **conditional density** on the left side does not equal the **marginal density** on the right side. -In the special IID case, +In the special IID case, .. math:: p(W_t | W_{t-1}, \ldots, W_0) = p(W_t) and partial history :math:`W_{t-1}, \ldots, W_0` contains no information about the probability of :math:`W_t`. -So in the IID case, there is **nothing to learn** about the densities of future random variables from past data. - -In the general case, there is something go learn from past data. +So in the IID case, there is **nothing to learn** about the densities of future random variables from past data. + +In the general case, there is something to learn from past data. -We turn next to an instance of this general case in which there is something to learn from past data. +We turn next to an instance of this general case. -Please keep your eye out for **what** there is to learn from past data. +Please keep your eye out for **what** there is to learn from past data. A Setting in Which Past Observations Are Informative ===================================================== -We now turn to a setting in which there **is** something to learn. - Let :math:`\{W_t\}_{t=0}^\infty` be a sequence of nonnegative scalar random variables with a joint probability distribution constructed as follows. @@ -162,50 +160,47 @@ So the data are permanently generated as independently and identically distribu :math:`G`. We could say that *objectively* the probability that the data are generated as draws from :math:`F` is either :math:`0` -or :math:`1`. +or :math:`1`. -We now drop into this setting a decision maker who knows :math:`F` and :math:`G` and that nature picked one +We now drop into this setting a decision maker who knows :math:`F` and :math:`G` and that nature picked one of them once and for all and then drew an IID sequence of draws from that distribution. 
-But our decision maker does not know which of the two distributions nature selected. +But our decision maker does not know which of the two distributions nature selected. -The decision maker summarizes his ignorance about this by picking a **subjective probability** +The decision maker summarizes his ignorance with a **subjective probability** :math:`\tilde \pi` and reasons as if nature had selected :math:`F` with probability :math:`\tilde \pi \in (0,1)` and :math:`G` with probability :math:`1 - \tilde \pi`. Thus, we assume that the decision maker - - **knows** both :math:`F` and :math:`G` +- **knows** both :math:`F` and :math:`G` - - **doesnt't know** which of these two distributions that nature has drawn +- **doesn't know** which of these two distributions that nature has drawn - - summarizing his ignorance by acting as if or **thinking** that nature chose distribution :math:`F` with probability :math:`\tilde \pi \in (0,1)` and distribution - :math:`G` with probability :math:`1 - \tilde \pi` +- summarizing his ignorance by acting as if or **thinking** that nature chose distribution :math:`F` with probability :math:`\tilde \pi \in (0,1)` and distribution + :math:`G` with probability :math:`1 - \tilde \pi` - - at date :math:`t \geq 0` has observed the partial history :math:`w_t, w_{t-1}, \ldots, w_0` of draws from the appropriate joint - density of the partial history +- at date :math:`t \geq 0` has observed the partial history :math:`w_t, w_{t-1}, \ldots, w_0` of draws from the appropriate joint + density of the partial history But what do we mean by the *appropriate joint distribution*? We'll discuss that next and in the process describe the concept of **exchangeability**. -Relationship Between IID and Exchangeable +Relationship Between IID and Exchangeable ========================================== - - - Conditional on nature selecting :math:`F`, the joint density of the sequence :math:`W_0, W_1, \ldots` is -.. math:: f(W_0) f(W_1) \cdots +.. 
math:: f(W_0) f(W_1) \cdots Conditional on nature selecting :math:`G`, the joint density of the sequence :math:`W_0, W_1, \ldots` is -.. math:: g(W_0) g(W_1) \cdots +.. math:: g(W_0) g(W_1) \cdots Notice that **conditional on nature having selected** :math:`F`, the sequence :math:`W_0, W_1, \ldots` is independently and @@ -223,7 +218,7 @@ evidently .. math:: :label: eq_definetti - h(W_0, W_1, \ldots ) \equiv \tilde \pi [f(W_0) f(W_1) \cdots ] + ( 1- \tilde \pi) [g(W_0) g(W_1) \cdots ] + h(W_0, W_1, \ldots ) \equiv \tilde \pi [f(W_0) f(W_1) \cdots ] + ( 1- \tilde \pi) [g(W_0) g(W_1) \cdots ] Under the unconditional distribution :math:`h(W_0, W_1, \ldots )`, the sequence :math:`W_0, W_1, \ldots` is **not** independently and @@ -235,28 +230,28 @@ To verify this claim, it is sufficient to notice, for example, that h(w_0, w_1) = \tilde \pi f(w_0)f (w_1) + (1 - \tilde \pi) g(w_0)g(w_1) \neq (\tilde \pi f(w_0) + (1-\tilde \pi) g(w_0))( - \tilde \pi f(w_1) + (1-\tilde \pi) g(w_1)) + \tilde \pi f(w_1) + (1-\tilde \pi) g(w_1)) Thus, the conditional distribution .. math:: h(w_1 | w_0) \equiv \frac{h(w_0, w_1)}{(\tilde \pi f(w_0) + (1-\tilde \pi) g(w_0))} - \neq ( \tilde \pi f(w_1) + (1-\tilde \pi) g(w_1)) + \neq ( \tilde \pi f(w_1) + (1-\tilde \pi) g(w_1)) This means that the realization :math:`w_0` contains information about :math:`w_1`. -So there is something to learn. +So there is something to learn. But what and how? -Exchangeability -================= +Exchangeability +================= While the sequence :math:`W_0, W_1, \ldots` is not IID, it can be verified that it is **exchangeable**, which means that -.. math:: h(w_0, w_1) = h(w_1, w_0) +.. math:: h(w_0, w_1) = h(w_1, w_0) and so on. @@ -265,7 +260,7 @@ More generally, a sequence of random variables is said to be **exchangeable** if for the sequence does not change when the positions in the sequence in which finitely many of the random variables appear are altered. 
-Equation :eq:`eq_definetti` represents our instance of an exchangeable joint density over a sequence of random +Equation :eq:`eq_definetti` represents our instance of an exchangeable joint density over a sequence of random variables as a **mixture** of two IID joint densities over a sequence of random variables. For a Bayesian statistician, the mixing parameter :math:`\tilde \pi \in (0,1)` has a special interpretation @@ -274,20 +269,20 @@ as a **prior probability** that nature selected probability distribution :math:` DeFinetti :cite:`definetti` established a related representation of an exchangeable process created by mixing sequences of IID Bernoulli random variables with parameters :math:`\theta` and mixing probability :math:`\pi(\theta)` for a density :math:`\pi(\theta)` that a Bayesian statistician would interpret as a prior over the unknown -Bernoulli paramter :math:`\theta`. +Bernoulli parameter :math:`\theta`. Bayes' Law ============ -We noted above that in our example model there is something to learn about about the future from past data drawn +We noted above that in our example model there is something to learn about about the future from past data drawn from our particular instance of a process that is exchangeable but not IID. But how can we learn? And about what? -The answer to the *about what* question is about :math:`\tilde pi`. +The answer to the *about what* question is about :math:`\tilde \pi`. The answer to the *how* question is to use Bayes' Law. @@ -300,24 +295,24 @@ Let's dive into Bayes' Law in this context. Let :math:`q` represent the distribution that nature actually draws from :math:`w` from and let -.. math:: \pi = \mathbb{P}\{q = f \} +.. math:: \pi = \mathbb{P}\{q = f \} -where we regard :math:`\pi` as the decision maker's **subjective probability** (also called a **personal probability**. +where we regard :math:`\pi` as the decision maker's **subjective probability** (also called a **personal probability**). 
Suppose that at :math:`t \geq 0`, the decision maker has observed a history :math:`w^t \equiv [w_t, w_{t-1}, \ldots, w_0]`. We let -.. math:: \pi_t = \mathbb{P}\{q = f | w^t \} +.. math:: \pi_t = \mathbb{P}\{q = f | w^t \} where we adopt the convention -.. math:: \pi_{-1} = \tilde \pi +.. math:: \pi_{-1} = \tilde \pi The distribution of :math:`w_{t+1}` conditional on :math:`w^t` is then -.. math:: \pi_t f + (1 - \pi_t) g . +.. math:: \pi_t f + (1 - \pi_t) g . Bayes’ rule for updating :math:`\pi_{t+1}` is @@ -339,7 +334,7 @@ tells us that \quad \text{and} \quad \mathbb{P}\{W = w\} = \sum_{\omega \in \{f, g\}} \mathbb{P}\{W = w \,|\, q = \omega\} \mathbb{P}\{q = \omega\} -More Details about Bayesian Updating +More Details about Bayesian Updating ===================================== Let's stare at and rearrange Bayes' Law as represented in equation :eq:`eq_Bayes102` with the aim of understanding @@ -355,7 +350,6 @@ It is convenient for us to rewrite the updating rule :eq:`eq_Bayes102` as .. math:: - \pi_{t+1} =\frac{\pi_{t}f\left(w_{t+1}\right)}{\pi_{t}f\left(w_{t+1}\right)+\left(1-\pi_{t}\right)g\left(w_{t+1}\right)} =\frac{\pi_{t}\frac{f\left(w_{t+1}\right)}{g\left(w_{t+1}\right)}}{\pi_{t}\frac{f\left(w_{t+1}\right)}{g\left(w_{t+1}\right)}+\left(1-\pi_{t}\right)} =\frac{\pi_{t}l\left(w_{t+1}\right)}{\pi_{t}l\left(w_{t+1}\right)+\left(1-\pi_{t}\right)} @@ -366,23 +360,23 @@ This implies that .. 
math:: :label: eq_Bayes103 - \frac{\pi_{t+1}}{\pi_{t}}=\frac{l\left(w_{t+1}\right)}{\pi_{t}l\left(w_{t+1}\right)+\left(1-\pi_{t}\right)}\begin{cases} >1 & + \frac{\pi_{t+1}}{\pi_{t}}=\frac{l\left(w_{t+1}\right)}{\pi_{t}l\left(w_{t+1}\right)+\left(1-\pi_{t}\right)}\begin{cases} >1 & \text{if }l\left(w_{t+1}\right)>1\\ \leq1 & \text{if }l\left(w_{t+1}\right)\leq1 \end{cases} -Notice how the likelihood ratio and the prior interact to determine whether an observation :math:`w_{t+1}` leads the decision maker -to increase or decrease the subjective probability he/she attaches to distribution :math:`F`. +Notice how the likelihood ratio and the prior interact to determine whether an observation :math:`w_{t+1}` leads the decision maker +to increase or decrease the subjective probability he/she attaches to distribution :math:`F`. -When the likelihood ratio :math:`l(w_{t+1})` exceeds one, the observation :math:`w_{t+1}` nudges the probability +When the likelihood ratio :math:`l(w_{t+1})` exceeds one, the observation :math:`w_{t+1}` nudges the probability :math:`\pi` put on distribution :math:`F` upward, and when the likelihood ratio :math:`l(w_{t+1})` is less that one, the observation :math:`w_{t+1}` nudges :math:`\pi` downward. -Representation :eq:`eq_Bayes103` is the foundation of the graphs that we'll use to display the dynamics of +Representation :eq:`eq_Bayes103` is the foundation of the graphs that we'll use to display the dynamics of :math:`\{\pi_t\}_{t=0}^\infty` that are induced by -Bayes' Law. +Bayes' Law. 
We’ll plot :math:`l\left(w\right)` as a way to enlighten us about how learning – i.e., Bayesian updating of the probability :math:`\pi` that @@ -486,29 +480,26 @@ We'll begin with the default values of various objects, then change them in a su learning_example() Please look at the three graphs above created for an instance in which :math:`f` is a uniform distribution on :math:`[0,1]` -(i.e., a Beta distribution with parameters :math:`F_a=1, F_b=1`, while :math:`g` is a Beta distribution with the default parameter values :math:`G_a=3, G_b=1.2`. - +(i.e., a Beta distribution with parameters :math:`F_a=1, F_b=1`), while :math:`g` is a Beta distribution with the default parameter values :math:`G_a=3, G_b=1.2`. +The graph on the left plots the likelihood ratio :math:`l(w)` on the coordinate axis against :math:`w` on the ordinate axis. -The graph in the left plots the likehood ratio :math:`l(w)` on the coordinate axis against :math:`w` on the coordinate axis. - -The middle graph plots both :math:`f(w)` and :math:`g(w)` against :math:`w`, with the horizontal dotted lines showing values +The middle graph plots both :math:`f(w)` and :math:`g(w)` against :math:`w`, with the horizontal dotted lines showing values of :math:`w` at which the likelihood ratio equals :math:`1`. -The graph on the right side plots arrows to the right that show when Bayes' Law makes :math:`\pi` increase and arrows -to the left that show when Bayes' Law make :math:`\pi` decrease. +The graph on the right plots arrows to the right that show when Bayes' Law makes :math:`\pi` increase and arrows +to the left that show when Bayes' Law make :math:`\pi` decrease. Notice how the length of the arrows, which show the magnitude of the force from Bayes' Law impelling :math:`\pi` to change, -depend on both the prior probability :math:`\pi` on the ordinate axis and the evidence in the form of the current draw of -:math:`w` on the coordinate axis. 
- +depends on both the prior probability :math:`\pi` on the ordinate axis and the evidence in the form of the current draw of +:math:`w` on the coordinate axis. The fractions in the colored areas of the middle graphs are probabilities under :math:`F` and :math:`G`, respectively, that realizations of :math:`w` fall -into the interval that updates the belief :math:`\pi` in a correct direction (i.e., toward :math:`0` when :math:`G` is the true +into the interval that updates the belief :math:`\pi` in a correct direction (i.e., toward :math:`0` when :math:`G` is the true distribution, and towards :math:`1` when :math:`F` is the true distribution). -For example, +For example, in the above example, under true distribution :math:`F`, :math:`\pi` will be updated toward :math:`0` if :math:`w` falls into the interval :math:`[0.524, 0.999]`, which occurs with probability :math:`1 - .524 = .476` under :math:`F`. But this would occur with probability @@ -519,32 +510,30 @@ in the orange region is the integral of :math:`g(w)` over this interval. Next we use our code to create graphs for another instance of our model. We keep :math:`F` the same as in the preceding instance, namely a uniform distribution, but now assume that :math:`G` -is a Beta distribution with parameters :math:`G_a=2, G_b=1.6`. +is a Beta distribution with parameters :math:`G_a=2, G_b=1.6`. .. code-block:: python3 learning_example(G_a=2, G_b=1.6) -Notice how the likelihood ratio, the middle graph, and the arrows compare with the previous instance of our example. +Notice how the likelihood ratio, the middle graph, and the arrows compare with the previous instance of our example. 
- - -Appendix +Appendix ========= Sample Paths of :math:`\pi_t` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Now we'll have some fun by plotting multiple realizations of sample paths of :math:`\pi_t` under two possible +Now we'll have some fun by plotting multiple realizations of sample paths of :math:`\pi_t` under two possible assumptions about nature's choice of distribution: -- that nature permanently draws from :math:`F` +- that nature permanently draws from :math:`F` -- that nature permanently draws from :math:`G` +- that nature permanently draws from :math:`G` -Outcomes depend on a peculiar property of likelihood ratio processes that are discussed in -`this lecture `__ +Outcomes depend on a peculiar property of likelihood ratio processes that are discussed in +`this lecture `__. To do this, we create some Python code. @@ -632,10 +621,10 @@ periods when the sequence is truly IID draws from :math:`G`. Again, we set the i # when nature selects G π_paths_G = simulate(a=3, b=1.2, T=T, N=1000) -In the above graph we observe that now most paths :math:`\pi_t \rightarrow 0`. +In the above graph we observe that now most paths :math:`\pi_t \rightarrow 0`. -Rates of convergence +Rates of convergence ^^^^^^^^^^^^^^^^^^^^^ We study rates of convergence of :math:`\pi_t` to :math:`1` when nature generates the data as IID draws from :math:`F` @@ -667,7 +656,6 @@ to the pertinent probability distribution: .. 
math:: - \begin{aligned} E\left[\frac{\pi_{t+1}}{\pi_{t}}\biggm|q=\omega, \pi_{t}\right] &=E\left[\frac{l\left(w_{t+1}\right)}{\pi_{t}l\left(w_{t+1}\right)+\left(1-\pi_{t}\right)}\biggm|q=\omega, \pi_{t}\right], \\ &=\int_{0}^{1}\frac{l\left(w_{t+1}\right)}{\pi_{t}l\left(w_{t+1}\right)+\left(1-\pi_{t}\right)}\omega\left(w_{t+1}\right)dw_{t+1} @@ -711,13 +699,11 @@ First, consider the case where :math:`F_a=F_b=1` and expected_ratio() -The above graphs shows that when :math:`F` generates the data, :math:`\pi_t` on average always heads north, while -when :math:`G` generates the data, :math:`\pi_t` heads south. - - +The above graphs show that when :math:`F` generates the data, :math:`\pi_t` on average always heads north, while +when :math:`G` generates the data, :math:`\pi_t` heads south. Next, we'll look at a degenerate case in which :math:`f` and :math:`g` are identical beta -distributions, and :math:`F_a=G_a=3, F_b=G_b=1.2`. +distributions, and :math:`F_a=G_a=3, F_b=G_b=1.2`. In a sense, here there is nothing to learn.
@@ -744,7 +730,7 @@ We'll dig deeper into some of the ideas used here in the following lectures: * :doc:`this lecture ` describes **likelihood ratio processes** and their role in frequentist and Bayesian statistical theories -* :doc:`this lecture ` returns to the subject of this lecture and studies - whether the Captain's hunch that the (frequentist) decision rule that the Navy had ordered - him to use can be expected to be better or worse than the rule sequential rule that Abraham - Wald designed \ No newline at end of file +* :doc:`this lecture ` returns to the subject of this lecture and studies + whether the Captain's hunch that the (frequentist) decision rule that the Navy had ordered + him to use can be expected to be better or worse than the sequential rule that Abraham + Wald designed \ No newline at end of file diff --git a/source/rst/finite_markov.rst b/source/rst/finite_markov.rst index 84e5b67..1b55fa4 100644 --- a/source/rst/finite_markov.rst +++ b/source/rst/finite_markov.rst @@ -15,7 +15,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ============= @@ -419,8 +419,6 @@ all ways this can happen and sum their probabilities. Rewriting this statement in terms of marginal and conditional probabilities gives -.. _mc_fdd: - .. math:: \psi_{t+1}(y) = \sum_{x \in S} P(x,y) \psi_t(x) @@ -430,8 +428,6 @@ There are :math:`n` such equations, one for each :math:`y \in S`. If we think of :math:`\psi_{t+1}` and :math:`\psi_t` as *row vectors* (as is traditional in this literature), these :math:`n` equations are summarized by the matrix expression -.. _mc_fddv: - ..
math:: :label: fin_mc_fr @@ -444,8 +440,6 @@ By repeating this :math:`m` times we move forward :math:`m` steps into the futur Hence, iterating on :eq:`fin_mc_fr`, the expression :math:`\psi_{t+m} = \psi_t P^m` is also valid --- here :math:`P^m` is the :math:`m`-th power of :math:`P`. -.. _mc_exfmar: - As a special case, we see that if :math:`\psi_0` is the initial distribution from which :math:`X_0` is drawn, then :math:`\psi_0 P^m` is the distribution of :math:`X_m`. @@ -683,7 +677,7 @@ Aperiodicity ---------------- -Loosely speaking, a Markov chain is called periodic if it cycles in a predictible way, and aperiodic otherwise. +Loosely speaking, a Markov chain is called periodic if it cycles in a predictable way, and aperiodic otherwise. Here's a trivial example with three states @@ -885,7 +879,7 @@ with the unit eigenvalue :math:`\lambda = 1`. A more stable and sophisticated algorithm is implemented in `QuantEcon.py `__. -This is the one we recommend you use: +This is the one we recommend you to use: .. code-block:: python3 @@ -1194,7 +1188,7 @@ A topic of interest for economics and many other disciplines is *ranking*. Let's now consider one of the most practical and important ranking problems --- the rank assigned to web pages by search engines. -(Although the problem is motivated from outside of economics, there is in fact a deep connection between search ranking systems and prices in certain competitive equilibria --- see :cite:`DLP2013`) +(Although the problem is motivated from outside of economics, there is in fact a deep connection between search ranking systems and prices in certain competitive equilibria --- see :cite:`DLP2013`.) To understand the issue, consider the set of results returned by a query to a web search engine. @@ -1400,23 +1394,23 @@ The values :math:`P(x_i, x_j)` are computed to approximate the AR(1) process --- 1. If :math:`j = 0`, then set -.. math:: + .. 
math:: - P(x_i, x_j) = P(x_i, x_0) = F(x_0-\rho x_i + s/2) + P(x_i, x_j) = P(x_i, x_0) = F(x_0-\rho x_i + s/2) 2. If :math:`j = n-1`, then set -.. math:: + .. math:: - P(x_i, x_j) = P(x_i, x_{n-1}) = 1 - F(x_{n-1} - \rho x_i - s/2) + P(x_i, x_j) = P(x_i, x_{n-1}) = 1 - F(x_{n-1} - \rho x_i - s/2) 3. Otherwise, set -.. math:: + .. math:: - P(x_i, x_j) = F(x_j - \rho x_i + s/2) - F(x_j - \rho x_i - s/2) + P(x_i, x_j) = F(x_j - \rho x_i + s/2) - F(x_j - \rho x_i - s/2) The exercise is to write a function ``approx_markov(rho, sigma_u, m=3, n=7)`` that returns diff --git a/source/rst/geom_series.rst b/source/rst/geom_series.rst index e288dc8..5b794e4 100644 --- a/source/rst/geom_series.rst +++ b/source/rst/geom_series.rst @@ -92,7 +92,7 @@ equation :math:`1 = 1`. Finite Geometric Series ----------------------- -The second series that interests us is the finite geomtric series +The second series that interests us is the finite geometric series .. math:: 1 + c + c^2 + c^3 + \cdots + c^T @@ -292,7 +292,7 @@ Static Version An elementary Keynesian model of national income determination consists -of three equations that describe aggegate demand for :math:`y` and its +of three equations that describe aggregate demand for :math:`y` and its components. The first equation is a national income identity asserting that @@ -360,7 +360,7 @@ We modify our consumption function to assume the form so that :math:`b` is the marginal propensity to consume (now) out of last period's income. -We begin wtih an initial condition stating that +We begin with an initial condition stating that .. math:: y_{-1} = 0 diff --git a/source/rst/harrison_kreps.rst b/source/rst/harrison_kreps.rst index 7f7a12b..1a7ddc0 100644 --- a/source/rst/harrison_kreps.rst +++ b/source/rst/harrison_kreps.rst @@ -5,7 +5,7 @@ .. highlight:: python3 ************************************* -Asset Pricing with Incomplete Markets +Heterogeneous Beliefs and Bubbles ************************************* .. 
index:: @@ -13,12 +13,12 @@ Asset Pricing with Incomplete Markets .. contents:: :depth: 2 -In addition to what's in Anaconda, this lecture will need the following libraries: +In addition to what's in Anaconda, this lecture uses following libraries: .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ========= @@ -71,14 +71,16 @@ Structure of the Model ====================== -The model simplifies by ignoring alterations in the distribution of wealth -among investors having different beliefs about the fundamentals that determine +The model simplifies things by ignoring alterations in the distribution of wealth +among investors who have hard-wired different beliefs about the fundamentals that determine asset payouts. There is a fixed number :math:`A` of shares of an asset. Each share entitles its owner to a stream of dividends :math:`\{d_t\}` governed by a Markov chain defined on a state space :math:`S \in \{0, 1\}`. +Thus, the stock is traded **ex dividend**. + The dividend obeys .. math:: @@ -122,6 +124,9 @@ Investors of type :math:`b` think the transition matrix is \frac{1}{4} & \frac{3}{4} \end{bmatrix} +Thus, in state :math:`0`, a type :math:`a` investor is more optimistic about next period's dividend than is investor :math:`b`. + +But in state :math:`1`, a type :math:`a` investor is more pessimistic about next period's dividend than is investor :math:`b`. The stationary (i.e., invariant) distributions of these two matrices can be calculated as follows: @@ -141,10 +146,11 @@ The stationary (i.e., invariant) distributions of these two matrices can be calc -The stationary distribution of :math:`P_a` is approximately :math:`\pi_A = \begin{bmatrix} .57 & .43 \end{bmatrix}`. +The stationary distribution of :math:`P_a` is approximately :math:`\pi_a = \begin{bmatrix} .57 & .43 \end{bmatrix}`. -The stationary distribution of :math:`P_b` is approximately :math:`\pi_B = \begin{bmatrix} .43 & .57 \end{bmatrix}`. 
+The stationary distribution of :math:`P_b` is approximately :math:`\pi_b = \begin{bmatrix} .43 & .57 \end{bmatrix}`. +Thus, a type :math:`a` investor is more pessimistic on average. Ownership Rights ---------------- @@ -155,7 +161,7 @@ Both types of investors are risk-neutral and both have the same fixed discount f In our numerical example, we’ll set :math:`\beta = .75`, just as Harrison and Kreps did. -We’ll eventually study the consequences of two different assumptions about the number of shares :math:`A` relative to the resources that our two types of investors can invest in the stock. +We’ll eventually study the consequences of two alternative assumptions about the number of shares :math:`A` relative to the resources that our two types of investors can invest in the stock. #. Both types of investors have enough resources (either wealth or the capacity to borrow) so that they can purchase the entire available stock of the asset [#f1]_. @@ -171,11 +177,11 @@ Short Sales Prohibited No short sales are allowed. -This matters because it limits pessimists from expressing their opinions. +This matters because it limits how pessimists can express their opinion. -* They can express their views by selling their shares. +* They **can** express themselves by selling their shares. -* They cannot express their pessimism more loudly by artificially "manufacturing shares" -- that is, they cannot borrow shares from more optimistic investors and sell them immediately. +* They **cannot** express themselves more loudly by artificially "manufacturing shares" -- that is, they cannot borrow shares from more optimistic investors and then immediately sell them. Optimism and Pessimism @@ -189,33 +195,7 @@ Remember that state :math:`1` is the high dividend state. * In state :math:`1`, a type :math:`b` agent is more optimistic about next period's dividend.
-However, the stationary distributions :math:`\pi_A = \begin{bmatrix} .57 & .43 \end{bmatrix}` and :math:`\pi_B = \begin{bmatrix} .43 & .57 \end{bmatrix}` tell us that a type :math:`B` person is more optimistic about the dividend process in the long run than is a type A person. - -Transition matrices for the temporarily optimistic and pessimistic investors are constructed as follows. - -Temporarily optimistic investors (i.e., the investor with the most optimistic -beliefs in each state) believe the transition matrix - -.. math:: - - P_o = - \begin{bmatrix} - \frac{1}{2} & \frac{1}{2} \\ - \frac{1}{4} & \frac{3}{4} - \end{bmatrix} - - -Temporarily pessimistic believe the transition matrix - -.. math:: - - P_p = - \begin{bmatrix} - \frac{1}{2} & \frac{1}{2} \\ - \frac{1}{4} & \frac{3}{4} - \end{bmatrix} - -We'll return to these matrices and their significance in the exercise. +However, the stationary distributions :math:`\pi_a = \begin{bmatrix} .57 & .43 \end{bmatrix}` and :math:`\pi_b = \begin{bmatrix} .43 & .57 \end{bmatrix}` tell us that a type :math:`B` person is more optimistic about the dividend process in the long run than is a type :math:`A` person. Information -------------- @@ -233,9 +213,9 @@ Solving the Model Now let's turn to solving the model. -This amounts to determining equilibrium prices under the different possible specifications of beliefs and constraints listed above. +We'll determine equilibrium prices under a particular specification of beliefs and constraints on trading selected from one of the specifications described above. -In particular, we compare equilibrium price functions under the following alternative +We shall compare equilibrium price functions under the following alternative assumptions about beliefs: #. There is only one type of agent, either :math:`a` or :math:`b`. 
@@ -250,9 +230,9 @@ Summary Table --------------- The following table gives a summary of the findings obtained in the remainder of the lecture -(you will be asked to recreate the table in an exercise). +(in an exercise you will be asked to recreate the table and also reinterpret parts of it). -It records implications of Harrison and Kreps's specifications of :math:`P_a, P_b, \beta`. +The table reports implications of Harrison and Kreps's specifications of :math:`P_a, P_b, \beta`. .. raw:: html @@ -290,6 +270,12 @@ Here We'll explain these values and how they are calculated one row at a time. +The row corresponding to :math:`p_o` applies when both types of investor have enough resources to purchase the entire stock of the asset and strict short sales constraints prevail so that temporarily optimistic investors always price the asset. + +The row corresponding to :math:`p_p` would apply if neither type of investor has enough resources to purchase the entire stock of the asset and both types must hold the asset. + +The row corresponding to :math:`p_p` would also apply if both types have enough resources to buy the entire stock of the asset but short sales are also possible so that temporarily pessimistic investors price the asset. + Single Belief Prices -------------------- @@ -373,6 +359,9 @@ In this case, the marginal investor who prices the asset is the more optimistic for :math:`s=0,1`. +In the above equation, the :math:`max` on the right side is evidently over two prospective values of next period's payout +from owning the asset. + The marginal investor who prices the asset in state :math:`s` is of type :math:`a` if .. math:: @@ -412,19 +401,19 @@ Equation :eq:`hakr2` is a functional equation that, like a Bellman equation, can for :math:`s=0,1`. -The third row of the table reports equilibrium prices that solve the functional equation when :math:`\beta = .75`.
+The third row of the table labeled :math:`p_o` reports equilibrium prices that solve the functional equation when :math:`\beta = .75`. Here the type that is optimistic about :math:`s_{t+1}` prices the asset in state :math:`s_t`. -It is instructive to compare these prices with the equilibrium prices for the homogeneous belief economies that solve under beliefs :math:`P_a` and :math:`P_b`. +It is instructive to compare these prices with the equilibrium prices for the homogeneous belief economies that solve under beliefs :math:`P_a` and :math:`P_b` reported in the rows labeled :math:`p_a` and :math:`p_b`, respectively. -Equilibrium prices :math:`\bar p` in the heterogeneous beliefs economy exceed what any prospective investor regards as the fundamental value of the asset in each possible state. +Equilibrium prices :math:`p_o` in the heterogeneous beliefs economy evidently exceed what any prospective investor regards as the fundamental value of the asset in each possible state. Nevertheless, the economy recurrently visits a state that makes each investor want to purchase the asset for more than he believes its future dividends are worth. -The reason is that he expects to have the option to sell the asset later to another investor who will value the asset more highly than he will. +The reason that an investor is willing to pay more than what he believes is warranted by fundamental value of the prospective dividend stream is that he expects to have the option to sell the asset later to another investor who will value the asset more highly than he will. * Investors of type :math:`a` are willing to pay the following price for the asset @@ -516,11 +505,11 @@ and the marginal investor who prices the asset is always the one that values it Now the marginal investor is always the (temporarily) pessimistic type. -Notice from the sixth row of that the pessimistic price :math:`\underline p` is lower than the homogeneous belief prices :math:`p_a` and :math:`p_b` in both states.
+Notice from the sixth row of the table that the pessimistic price :math:`p_p` is lower than the homogeneous belief prices :math:`p_a` and :math:`p_b` in both states. When pessimistic investors price the asset according to :eq:`HarrKrep4`, optimistic investors think that the asset is underpriced. -If they could, optimistic investors would willingly borrow at the one-period gross interest rate :math:`\beta^{-1}` to purchase more of the asset. +If they could, optimistic investors would willingly borrow at a one-period risk-free gross interest rate :math:`\beta^{-1}` to purchase more of the asset. Implicit constraints on leverage prohibit them from doing so. @@ -559,7 +548,7 @@ Here's code to solve for :math:`\check p` using iteration Further Interpretation ------------------------- -:cite:`Scheinkman2014` interprets the Harrison-Kreps model as a model of a bubble --- a situation in which an asset price exceeds what every investor thinks is merited by the asset's underlying dividend stream. +:cite:`Scheinkman2014` interprets the Harrison-Kreps model as a model of a bubble --- a situation in which an asset price exceeds what every investor thinks is merited by his or her beliefs about the value of the asset's underlying dividend stream. Scheinkman stresses these features of the Harrison-Kreps model: @@ -573,7 +562,7 @@ Scheinkman takes this as a strength of the model because he observes high volume * If the *supply* of the asset is increased sufficiently either physically (more "houses" are built) or artificially (ways are invented to short sell "houses"), bubbles end when the supply has grown enough to outstrip optimistic investors’ resources for purchasing the asset. -* If optimistic investors finance purchases by borrowing, tightening leverage constraints can extinguish a bubble. +* If optimistic investors finance their purchases by borrowing, tightening leverage constraints can extinguish a bubble.
Scheinkman extracts insights about the effects of financial regulations on bubbles. @@ -585,7 +574,7 @@ Exercises Exercise 1 ----------- -Recreate the summary table using the functions we have built above. +This exercise invites you to recreate the summary table using the functions we have built above. +-----------------------+------+------+ | :math:`s_t` | 0 | 1 | @@ -604,7 +593,38 @@ Recreate the summary table using the functions we have built above. +-----------------------+------+------+ -You will first need to define the transition matrices and dividend payoff vector. +You will want first to define the transition matrices and dividend payoff vector. + +In addition, below we'll add an interpretation of the row corresponding to :math:`p_o` by +inventing two additional types of agents, one of whom is **permanently optimistic**, the other who +is **permanently pessimistic**. + + +We construct subjective transition probability matrices for our permanently optimistic and permanently pessimistic investors as follows. + +The permanently optimistic investors(i.e., the investor with the most optimistic +beliefs in each state) believes the transition matrix + +.. math:: + + P_o = + \begin{bmatrix} + \frac{1}{2} & \frac{1}{2} \\ + \frac{1}{4} & \frac{3}{4} + \end{bmatrix} + + +The permanently pessimistic investor believes the transition matrix + +.. math:: + + P_p = + \begin{bmatrix} + \frac{2}{3} & \frac{1}{3} \\ + \frac{2}{3} & \frac{1}{3} + \end{bmatrix} + +We'll use these transition matrices when we present our solution of exercise 1 below. Solutions @@ -664,7 +684,7 @@ heterogeneous beliefs. Notice that the equilibrium price with heterogeneous beliefs is equal to the price under single beliefs -with optimistic investors - this is due to the marginal investor being the temporarily optimistic type. 
+with **permanently optimistic** investors - this is due to the marginal investor in the heterogeneous beliefs equilibrium always being the type who is temporarily optimistic. .. rubric:: Footnotes diff --git a/source/rst/heavy_tails.rst b/source/rst/heavy_tails.rst index 4eab041..3797911 100644 --- a/source/rst/heavy_tails.rst +++ b/source/rst/heavy_tails.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install --upgrade yfinance @@ -194,7 +194,7 @@ where :math:`\mu := \mathbb E X_i = \int x F(x)` is the common mean of the sampl The condition :math:`\mathbb E | X_i | = \int |x| F(x) < \infty` holds in most cases but can fail if the distribution :math:`F` is very heavy tailed. -For example, it fails for the Cauchy distribution +For example, it fails for the Cauchy distribution. Let's have a look at the behavior of the sample mean in this case, and see whether or not the LLN is still valid. @@ -417,7 +417,7 @@ Exercise 4 Replicate the rank-size plot figure :ref:`presented above `. -If you like you can use the function ``qe.rank_size_plot`` from the ``quantecon`` library to generate the plots. +If you like you can use the function ``qe.rank_size`` from the ``quantecon`` library to generate the plots. Use ``np.random.seed(13)`` to set the seed. 
@@ -597,7 +597,12 @@ Now we plot the data: for data, label, ax in zip(data_list, labels, axes): - qe.rank_size_plot(data, ax, label=label) + rank_data, size_data = qe.rank_size(data) + + ax.loglog(rank_data, size_data, 'o', markersize=3.0, alpha=0.5, label=label) + ax.set_xlabel("log rank") + ax.set_ylabel("log size") + ax.legend() fig.subplots_adjust(hspace=0.4) @@ -632,7 +637,7 @@ the equations \quad \text{and} \quad 2^{1/\alpha} = \exp(\mu) -which we solve for :math:`\mu` and :math:`\sigma` given :math:`\alpha = 1.05` +which we solve for :math:`\mu` and :math:`\sigma` given :math:`\alpha = 1.05`. Here is code that generates the two samples, produces the violin plot and prints the mean and standard deviation of the two samples. diff --git a/source/rst/ifp.rst b/source/rst/ifp.rst index 04e887e..cf54dde 100644 --- a/source/rst/ifp.rst +++ b/source/rst/ifp.rst @@ -13,7 +13,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation @@ -36,7 +36,7 @@ model ` and yet differs in important ways. For example, the choice problem for the agent includes an additive income term that leads to an occasionally binding constraint. -Moreover, in this and the following lectures, we will inject more realisitic +Moreover, in this and the following lectures, we will inject more realistic features such as correlated shocks. 
To solve the model we will use Euler equation based time iteration, which proved @@ -52,7 +52,8 @@ We'll need the following imports: import numpy as np from quantecon.optimize import brent_max, brentq from interpolation import interp - from numba import njit, float64, jitclass + from numba import njit, float64 + from numba.experimental import jitclass import matplotlib.pyplot as plt %matplotlib inline from quantecon import MarkovChain @@ -199,7 +200,7 @@ strict inequality :math:`u' (c_t) > \beta R \, \mathbb{E}_t u'(c_{t+1})` can occur because :math:`c_t` cannot increase sufficiently to attain equality. (The lower boundary case :math:`c_t = 0` never arises at the optimum because -:math:`u'(0) = \infty`) +:math:`u'(0) = \infty`.) With some thought, one can show that :eq:`ee00` and :eq:`ee01` are equivalent to @@ -424,8 +425,7 @@ Next we provide a function to compute the difference .. math:: :label: euler_diff_eq - u'(c) - - \max \left\{ + u'(c) - \max \left\{ \beta R \, \mathbb E_z (u' \circ \sigma) \, [R (a - c) + \hat Y, \, \hat Z] \, , \; @@ -663,7 +663,7 @@ shocks. Your task is to investigate how this measure of aggregate capital varies with the interest rate. -Following tradition, put the price (i.e., interest rate) is on the vertical axis. +Following tradition, put the price (i.e., interest rate) on the vertical axis. On the horizontal axis put aggregate capital, computed as the mean of the stationary distribution given the interest rate. diff --git a/source/rst/ifp_advanced.rst b/source/rst/ifp_advanced.rst index 28153f7..cdfa131 100644 --- a/source/rst/ifp_advanced.rst +++ b/source/rst/ifp_advanced.rst @@ -13,7 +13,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation @@ -47,7 +47,8 @@ We require the following imports: import numpy as np from quantecon.optimize import brent_max, brentq from interpolation import interp - from numba import njit, float64, jitclass + from numba import njit, float64 + from numba.experimental import jitclass import matplotlib.pyplot as plt %matplotlib inline from quantecon import MarkovChain @@ -248,7 +249,7 @@ It can be shown that We now have a clear path to successfully approximating the optimal policy: choose some :math:`\sigma \in \mathscr C` and then iterate with :math:`K` until -convergence (as measured by the distance :math:`\rho`) +convergence (as measured by the distance :math:`\rho`). @@ -334,7 +335,7 @@ radius of the matrix :math:`L` defined by L(z, \hat z) := P(z, \hat z) \int R(\hat z, x) \phi(x) dx -This indentity is proved in :cite:`ma2020income`, where :math:`\phi` is the +This identity is proved in :cite:`ma2020income`, where :math:`\phi` is the density of the innovation :math:`\zeta_t` to returns on assets. (Remember that :math:`\mathsf Z` is a finite set, so this expression defines a matrix.) @@ -641,7 +642,7 @@ For example, we will pass in the solutions ``a_star, σ_star`` along with ``ifp``, even though it would be more natural to just pass in ``ifp`` and then solve inside the function. -The reason we do this is because ``solve_model_time_iter`` is not +The reason we do this is that ``solve_model_time_iter`` is not JIT-compiled. 
diff --git a/source/rst/index_tools_and_techniques.rst b/source/rst/index_tools_and_techniques.rst index 05dd4d6..cd50303 100644 --- a/source/rst/index_tools_and_techniques.rst +++ b/source/rst/index_tools_and_techniques.rst @@ -24,3 +24,5 @@ tools and techniques complex_and_trig lln_clt heavy_tails + multivariate_normal + time_series_with_matrices diff --git a/source/rst/inventory_dynamics.rst b/source/rst/inventory_dynamics.rst index 1b2e2bd..0ab0946 100644 --- a/source/rst/inventory_dynamics.rst +++ b/source/rst/inventory_dynamics.rst @@ -21,7 +21,7 @@ follow so-called s-S inventory dynamics. Such firms 1. wait until inventory falls below some level :math:`s` and then -2. order sufficent quantities to bring their inventory back up to capacity :math:`S`. +2. order sufficient quantities to bring their inventory back up to capacity :math:`S`. These kinds of policies are common in practice and also optimal in certain circumstances. @@ -39,7 +39,8 @@ Let's start with some imports import matplotlib.pyplot as plt %matplotlib inline - from numba import njit, jitclass, float64, prange + from numba import njit, float64, prange + from numba.experimental import jitclass Sample Paths @@ -170,7 +171,7 @@ fixed :math:`T`. We will do this by generating many draws of :math:`X_T` given initial condition :math:`X_0`. -With these draws of :math:`X_T` we can build up a picture of its distribution :math:`\psi_T` +With these draws of :math:`X_T` we can build up a picture of its distribution :math:`\psi_T`. Here's one visualization, with :math:`T=50`. diff --git a/source/rst/jv.rst b/source/rst/jv.rst index bfe3abe..a858fb4 100644 --- a/source/rst/jv.rst +++ b/source/rst/jv.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation Overview @@ -224,7 +224,7 @@ We will set up a class ``JVWorker`` that holds the parameters of the model descr The function ``operator_factory`` takes an instance of this class and returns a -jitted version of the Bellman operator ``T``, ie. +jitted version of the Bellman operator ``T``, i.e. .. math:: diff --git a/source/rst/kalman.rst b/source/rst/kalman.rst index 6732ac2..fde55cd 100644 --- a/source/rst/kalman.rst +++ b/source/rst/kalman.rst @@ -16,9 +16,9 @@ A First Look at the Kalman Filter In addition to what's in Anaconda, this lecture will need the following libraries: .. code-block:: ipython - :class: hide-output + :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -71,8 +71,7 @@ One way to summarize our knowledge is a point prediction :math:`\hat x` * But what if the President wants to know the probability that the missile is currently over the Sea of Japan? * Then it is better to summarize our initial beliefs with a bivariate probability density :math:`p` - - * :math:`\int_E p(x)dx` indicates the probability that we attach to the missile being in region :math:`E`. + * :math:`\int_E p(x)dx` indicates the probability that we attach to the missile being in region :math:`E`. The density :math:`p` is called our *prior* for the random variable :math:`x`. @@ -111,12 +110,9 @@ where :math:`\hat x` is the mean of the distribution and :math:`\Sigma` is a This density :math:`p(x)` is shown below as a contour map, with the center of the red ellipse being equal to :math:`\hat x`. - - .. 
code-block:: python3 :class: collapse - # Set up the Gaussian prior density p Σ = [[0.4, 0.3], [0.3, 0.45]] Σ = np.matrix(Σ) @@ -195,8 +191,6 @@ This density :math:`p(x)` is shown below as a contour map, with the center of th plt.show() - - The Filtering Step ------------------ @@ -207,8 +201,6 @@ The good news is that the missile has been located by our sensors, which report The next figure shows the original prior :math:`p(x)` and the new reported location :math:`y` - - .. code-block:: python3 fig, ax = plt.subplots(figsize=(10, 8)) @@ -517,7 +509,7 @@ Conditions under which a fixed point exists and the sequence :math:`\{\Sigma_t\} A sufficient (but not necessary) condition is that all the eigenvalues :math:`\lambda_i` of :math:`A` satisfy :math:`|\lambda_i| < 1` (cf. e.g., :cite:`AndersonMoore2005`, p. 77). -(This strong condition assures that the unconditional distribution of :math:`x_t` converges as :math:`t \rightarrow + \infty`) +(This strong condition assures that the unconditional distribution of :math:`x_t` converges as :math:`t \rightarrow + \infty`.) In this case, for any initial choice of :math:`\Sigma_0` that is both non-negative and symmetric, the sequence :math:`\{\Sigma_t\}` in :eq:`kalman_sdy` converges to a non-negative symmetric matrix :math:`\Sigma` that solves :eq:`kalman_dare`. @@ -531,7 +523,7 @@ Implementation -The class ``Kalman`` from the `QuantEcon.py`_ package implements the Kalman filter +The class ``Kalman`` from the `QuantEcon.py `_ package implements the Kalman filter @@ -539,9 +531,9 @@ The class ``Kalman`` from the `QuantEcon.py`_ package implements the Kalman filt * Instance data consists of: - * the moments :math:`(\hat x_t, \Sigma_t)` of the current prior. + * the moments :math:`(\hat x_t, \Sigma_t)` of the current prior. - * An instance of the `LinearStateSpace `_ class from `QuantEcon.py `_. + * An instance of the `LinearStateSpace `_ class from `QuantEcon.py `_. 
The latter represents a linear state space model of the form @@ -570,13 +562,13 @@ To connect this with the notation of this lecture we set * Methods pertinent for this lecture are: - * ``prior_to_filtered``, which updates :math:`(\hat x_t, \Sigma_t)` to :math:`(\hat x_t^F, \Sigma_t^F)` + * ``prior_to_filtered``, which updates :math:`(\hat x_t, \Sigma_t)` to :math:`(\hat x_t^F, \Sigma_t^F)` - * ``filtered_to_forecast``, which updates the filtering distribution to the predictive distribution -- which becomes the new prior :math:`(\hat x_{t+1}, \Sigma_{t+1})` + * ``filtered_to_forecast``, which updates the filtering distribution to the predictive distribution -- which becomes the new prior :math:`(\hat x_{t+1}, \Sigma_{t+1})` - * ``update``, which combines the last two methods + * ``update``, which combines the last two methods - * a ``stationary_values``, which computes the solution to :eq:`kalman_dare` and the corresponding (stationary) Kalman gain + * a ``stationary_values``, which computes the solution to :eq:`kalman_dare` and the corresponding (stationary) Kalman gain You can view the program `on GitHub `__. diff --git a/source/rst/kesten_processes.rst b/source/rst/kesten_processes.rst index be57353..496dbb7 100644 --- a/source/rst/kesten_processes.rst +++ b/source/rst/kesten_processes.rst @@ -16,7 +16,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install --upgrade yfinance @@ -528,7 +528,7 @@ In this setting, firm dynamics can be expressed as Here -* the state variable :math:`s_t` is represents productivity (which is a proxy +* the state variable :math:`s_t` represents productivity (which is a proxy for output and hence firm size), * the IID sequence :math:`\{ e_t \}` is thought of as a productivity draw for a new entrant and @@ -737,8 +737,11 @@ Now we produce the rank-size plot: fig, ax = plt.subplots() - qe.rank_size_plot(data, ax, c=0.01) - + rank_data, size_data = qe.rank_size(data, c=0.01) + ax.loglog(rank_data, size_data, 'o', markersize=3.0, alpha=0.5) + ax.set_xlabel("log rank") + ax.set_ylabel("log size") + plt.show() The plot produces a straight line, consistent with a Pareto tail. diff --git a/source/rst/lake_model.rst b/source/rst/lake_model.rst index 2fa84d8..aa14a04 100644 --- a/source/rst/lake_model.rst +++ b/source/rst/lake_model.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -68,8 +68,8 @@ Let's start with some imports: Prerequisites ------------- -Before working through what follows, we recommend you read the :doc:`lecture -on finite Markov chains `. +Before working through what follows, we recommend you read the +:doc:`lecture on finite Markov chains `. You will also need some basic :doc:`linear algebra ` and probability. @@ -415,7 +415,7 @@ there exists an :math:`\bar x` such that This equation tells us that a steady state level :math:`\bar x` is an eigenvector of :math:`\hat A` associated with a unit eigenvalue. -We also have :math:`x_t \to \bar x` as :math:`t \to \infty` provided that the remaining eigenvalue of :math:`\hat A` has modulus less that 1. 
+We also have :math:`x_t \to \bar x` as :math:`t \to \infty` provided that the remaining eigenvalue of :math:`\hat A` has modulus less than 1. This is the case for our default parameters: diff --git a/source/rst/likelihood_bayes.rst b/source/rst/likelihood_bayes.rst index e2d9408..a5fcaab 100644 --- a/source/rst/likelihood_bayes.rst +++ b/source/rst/likelihood_bayes.rst @@ -38,7 +38,7 @@ We'll study how, at least in our setting, a Bayesian eventually learns the prob rests on the asymptotic behavior of likelihood ratio processes studied in :doc:`this lecture `. This lecture provides technical results that underly outcomes to be studied in :doc:`this lecture ` -and :doc:`this lecture ` and :doc:`this lecture ` +and :doc:`this lecture ` and :doc:`this lecture `. The Setting @@ -50,15 +50,14 @@ We begin by reviewing the setting in :doc:`this lecture ` and :doc:`this lecture ` and :doc:`this lecture ` +results described in :doc:`this lecture ` and :doc:`this lecture ` and :doc:`this lecture `. diff --git a/source/rst/likelihood_ratio_process.rst b/source/rst/likelihood_ratio_process.rst index 906f17d..b80c319 100644 --- a/source/rst/likelihood_ratio_process.rst +++ b/source/rst/likelihood_ratio_process.rst @@ -18,23 +18,25 @@ Likelihood Ratio Processes from numba import vectorize, njit from math import gamma %matplotlib inline + from scipy.integrate import quad Overview ========= + This lecture describes likelihood ratio processes and some of their uses. -We'll use the simple statistical setting also used in :doc:`this lecture `. +We'll use a setting described in :doc:`this lecture `. 
-Among the things that we'll learn about are +Among things that we'll learn are - * A peculiar property of likelihood ratio processes +* A peculiar property of likelihood ratio processes - * How a likelihood ratio process is the key ingredient in frequentist hypothesis testing +* How a likelihood ratio process is a key ingredient in frequentist hypothesis testing - * How a **receiver operator characteristic curve** summarizes information about a false alarm probability and power in frequentist hypothesis testing +* How a **receiver operator characteristic curve** summarizes information about a false alarm probability and power in frequentist hypothesis testing - * How during World War II the United States Navy devised a decision rule that Captain Garret L. Schyler challenged and asked Milton Friedman to justify to him, a topic to be studied in :doc:`this lecture ` +* How during World War II the United States Navy devised a decision rule that Captain Garret L. Schyler challenged and asked Milton Friedman to justify to him, a topic to be studied in :doc:`this lecture ` Likelihood Ratio Process ======================== @@ -107,8 +109,8 @@ Pearson :cite:`Neyman_Pearson`. To help us appreciate how things work, the following Python code evaluates :math:`f` and :math:`g` as two different beta distributions, then computes and simulates an associated likelihood -ratio process by generating a sequence :math:`w^t` from *some* -probability distribution, for example, a sequence of IID draws from :math:`g`. +ratio process by generating a sequence :math:`w^t` from one of the two +probability distributions, for example, a sequence of IID draws from :math:`g`. .. code-block:: python3
l_arr_g = simulate(G_a, G_b, N=50000) l_seq_g = np.cumprod(l_arr_g, axis=1) -The following Python code approximates unconditional means -:math:`E_{0}\left[L\left(w^{t}\right)\right]` by averaging across sample +It would be useful to use simulations to verify that unconditional means +:math:`E_{0}\left[L\left(w^{t}\right)\right]` equal unity by averaging across sample paths. -Please notice that while sample averages hover around their population means of :math:`1`, there is quite a bit -of variability, a consequence of the *fat tail* of the distribution of :math:`L\left(w^{t}\right)`. +But it would be too challenging for us to do that here simply by applying a standard Monte Carlo simulation approach. + +The reason is that the distribution of :math:`L\left(w^{t}\right)` is extremely skewed for large values of :math:`t`. + +Because the probability density in the right tail is close to :math:`0`, it just takes too much computer time to sample enough points from the right tail. + +Instead, the following code just illustrates that the unconditional means of :math:`l(w_t)` are :math:`1`. + +While sample averages hover around their population means of :math:`1`, there is evidently quite a bit +of variability. .. code-block:: python3 @@ -310,6 +320,7 @@ fast probability mass diverges to :math:`+\infty`. + Likelihood Ratio Test ====================== @@ -321,7 +332,7 @@ Denote :math:`q` as the data generating process, so that :math:`q=f \text{ or } g`. Upon observing a sample :math:`\{W_i\}_{i=1}^t`, we want to decide -which one is the data generating process by performing a (frequentist) +whether nature is drawing from :math:`g` or from :math:`f` by performing a (frequentist) hypothesis test. We specify @@ -340,26 +351,24 @@ where :math:`c` is a given discrimination threshold, to be chosen in a way we'l This test is *best* in the sense that it is a **uniformly most powerful** test.
To understand what this means, we have to define probabilities of two important events that -allow us to characterize a test associated with given +allow us to characterize a test associated with a given threshold :math:`c`. -The two probabities are: +The two probabilities are: - Probability of detection (= power = 1 minus probability - of Type II error): - -.. math:: + of Type II error): + .. math:: - 1-\beta \equiv \Pr\left\{ L\left(w^{t}\right)`__ @@ -371,7 +380,7 @@ Another way to say the same thing is that among all possible tests, a likelihoo maximizes **power** for a given **significance level**. -To have made a confident inference, we want a small probability of +To have made a good inference, we want a small probability of false alarm and a large probability of detection. With sample size :math:`t` fixed, we can change our two probabilities by @@ -413,7 +422,8 @@ moves toward :math:`-\infty` when :math:`g` is the data generating process, ; while log\ :math:`(L(w^t))` goes to :math:`\infty` when data are generated by :math:`f`. -This diverse behavior is what makes it possible to distinguish +That disparate behavior of log\ :math:`(L(w^t))` under :math:`f` and :math:`q` +is what makes it possible to distinguish :math:`q=f` from :math:`q=g`. .. code-block:: python3 @@ -469,9 +479,9 @@ If for a fixed :math:`t` we now free up and move :math:`c`, we will sweep out th of detection as a function of the probability of false alarm. This produces what is called a `receiver operating characteristic -curve `__ for a given discrimination threshold :math:`c`. +curve `__. -Below, we plot receiver operating characteristic curves for a given discrimination threshold :math:`c` but different +Below, we plot receiver operating characteristic curves for different sample sizes :math:`t`. .. 
code-block:: python3 @@ -499,10 +509,10 @@ Notice that as :math:`t` increases, we are assured a larger probability of detection and a smaller probability of false alarm associated with a given discrimination threshold :math:`c`. -As :math:`t \rightarrow + \infty`, we approach the the perfect detection -curve that is indicated by a right angle hinging on the green dot. +As :math:`t \rightarrow + \infty`, we approach the perfect detection +curve that is indicated by a right angle hinging on the blue dot. -For a given sample size :math:`t`, a value discrimination threshold :math:`c` determines a point on the receiver operating +For a given sample size :math:`t`, the discrimination threshold :math:`c` determines a point on the receiver operating characteristic curve. It is up to the test designer to trade off probabilities of @@ -542,10 +552,165 @@ target probability of detection, for example, :math:`0.9`, as depicted in the fo The United States Navy evidently used a procedure like this to select a sample size :math:`t` for doing quality control tests during World War II. -A Navy Captain who had been ordered to perform tests of this kind had second thoughts about it that he +A Navy Captain who had been ordered to perform tests of this kind had doubts about it that he presented to Milton Friedman, as we describe in :doc:`this lecture `. + + + +Kullback–Leibler divergence +============================ + +Now let’s consider a case in which neither :math:`g` nor :math:`f` +generates the data. + +Instead, a third distribution :math:`h` does. + +Let’s watch how the cumulated likelihood ratios :math:`f/g` behave +when :math:`h` governs the data. + +A key tool here is called **Kullback–Leibler divergence**. + +It is also called **relative entropy**. + +It measures how one probability distribution differs from another.
+ +In our application, we want to measure how :math:`f` or :math:`g` +diverges from :math:`h` + +The two Kullback–Leibler divergences pertinent for us are :math:`K_f` +and :math:`K_g` defined as + +.. math:: + + + \begin{aligned} + K_{f} &=E_{h}\left[\log\left(\frac{f\left(w\right)}{h\left(w\right)}\right)\frac{f\left(w\right)}{h\left(w\right)}\right] \\ + &=\int\log\left(\frac{f\left(w\right)}{h\left(w\right)}\right)\frac{f\left(w\right)}{h\left(w\right)}h\left(w\right)dw \\ + &=\int\log\left(\frac{f\left(w\right)}{h\left(w\right)}\right)f\left(w\right)dw + \end{aligned} + +.. math:: + + + \begin{aligned} + K_{g} &=E_{h}\left[\log\left(\frac{g\left(w\right)}{h\left(w\right)}\right)\frac{g\left(w\right)}{h\left(w\right)}\right] \\ + &=\int\log\left(\frac{g\left(w\right)}{h\left(w\right)}\right)\frac{g\left(w\right)}{h\left(w\right)}h\left(w\right)dw \\ + &=\int\log\left(\frac{g\left(w\right)}{h\left(w\right)}\right)g\left(w\right)dw + \end{aligned} + +When :math:`K_g < K_f`, :math:`g` is closer to :math:`h` than :math:`f` +is. + +- In that case we’ll find that :math:`L\left(w^t\right) \rightarrow 0`. + +When :math:`K_g > K_f`, :math:`f` is closer to :math:`h` than :math:`g` +is. + +- In that case we’ll find that + :math:`L\left(w^t\right) \rightarrow + \infty` + +We’ll now experiment with an :math:`h` that is also a beta distribution + +We’ll start by setting parameters :math:`H_a` and :math:`H_b` so that +:math:`h` is closer to :math:`g` + +.. code-block:: python3 + + H_a, H_b = 3.5, 1.8 + + h = njit(lambda x: p(x, H_a, H_b)) + +.. code-block:: python3 + + x_range = np.linspace(0, 1, 100) + plt.plot(x_range, f(x_range), label='f') + plt.plot(x_range, g(x_range), label='g') + plt.plot(x_range, h(x_range), label='h') + + plt.legend() + plt.show() + +Let’s compute the Kullback–Leibler discrepancies by quadrature +integration. + +.. code-block:: python3 + + def KL_integrand(w, q, h): + + m = q(w) / h(w) + + return np.log(m) * q(w) + +..
code-block:: python3 + + def compute_KL(h, f, g): + + Kf, _ = quad(KL_integrand, 0, 1, args=(f, h)) + Kg, _ = quad(KL_integrand, 0, 1, args=(g, h)) + + return Kf, Kg + +.. code-block:: python3 + + Kf, Kg = compute_KL(h, f, g) + Kf, Kg + +We have :math:`K_g < K_f`. + +Next, we can verify our conjecture about :math:`L\left(w^t\right)` by +simulation. + +.. code-block:: python3 + + l_arr_h = simulate(H_a, H_b) + l_seq_h = np.cumprod(l_arr_h, axis=1) + +The figure below plots over time the fraction of paths +:math:`L\left(w^t\right)` that fall in the interval :math:`[0,0.01]`. + +Notice that it converges to 1 as expected when :math:`g` is closer to +:math:`h` than :math:`f` is. + +.. code-block:: python3 + + N, T = l_arr_h.shape + plt.plot(range(T), np.sum(l_seq_h <= 0.01, axis=0) / N) + +We can also try an :math:`h` that is closer to :math:`f` than is +:math:`g` so that now :math:`K_g` is larger than :math:`K_f`. + + +.. code-block:: python3 + + H_a, H_b = 1.2, 1.2 + h = njit(lambda x: p(x, H_a, H_b)) + + +.. code-block:: python3 + + Kf, Kg = compute_KL(h, f, g) + Kf, Kg + + +.. code-block:: python3 + + l_arr_h = simulate(H_a, H_b) + l_seq_h = np.cumprod(l_arr_h, axis=1) + +Now probability mass of :math:`L\left(w^t\right)` falling above +:math:`10000` diverges to :math:`+\infty`. + +.. code-block:: python3 + + N, T = l_arr_h.shape + plt.plot(range(T), np.sum(l_seq_h > 10000, axis=0) / N) + + + + + Sequels ======== diff --git a/source/rst/linear_algebra.rst b/source/rst/linear_algebra.rst index 2f6a1e3..dbcca91 100644 --- a/source/rst/linear_algebra.rst +++ b/source/rst/linear_algebra.rst @@ -204,7 +204,7 @@ Scalar multiplication is illustrated in the next figure plt.show() In Python, a vector can be represented as a list or tuple, such as ``x = (2, 4, 6)``, but is more commonly -represented as a `NumPy array `__. +represented as a `NumPy array `__. 
One advantage of NumPy arrays is that scalar multiplication and addition have very natural syntax @@ -652,7 +652,7 @@ You can create them manually from tuples of tuples (or lists of lists) as follow The ``shape`` attribute is a tuple giving the number of rows and columns --- -see `here `__ +see `here `__ for more discussion. To get the transpose of ``A``, use ``A.transpose()`` or, more simply, ``A.T``. @@ -660,7 +660,7 @@ To get the transpose of ``A``, use ``A.transpose()`` or, more simply, ``A.T``. There are many convenient functions for creating common matrices (matrices of zeros, -ones, etc.) --- see `here `__. +ones, etc.) --- see `here `__. @@ -683,7 +683,7 @@ To multiply matrices we use the ``@`` symbol. In particular, ``A @ B`` is matrix multiplication, whereas ``A * B`` is element-by-element multiplication. -See `here `__ for more discussion. +See `here `__ for more discussion. @@ -1414,8 +1414,7 @@ The associated Lagrangian is: L = -y'Py - u'Qu + \lambda' \lbrack Ax + Bu - y \rbrack -1. -^^. +**Step 1.** Differentiating Lagrangian equation w.r.t y and setting its derivative equal to zero yields @@ -1432,8 +1431,7 @@ Accordingly, the first-order condition for maximizing L w.r.t. y implies \lambda = -2 Py \: -2. -^^. +**Step 2.** Differentiating Lagrangian equation w.r.t. u and setting its derivative equal to zero yields @@ -1459,7 +1457,7 @@ equation gives (Q + B'PB)u + B'PAx = 0 -which is the first-order condition for maximizing L w.r.t. u. +which is the first-order condition for maximizing :math:`L` w.r.t. :math:`u`. Thus, the optimal choice of u must satisfy @@ -1470,8 +1468,7 @@ Thus, the optimal choice of u must satisfy which follows from the definition of the first-order conditions for Lagrangian equation. -3. -^^. +**Step 3.** Rewriting our problem by substituting the constraint into the objective function, we get @@ -1546,7 +1543,7 @@ Therefore, the solution to the optimization problem .. 
[#fn_mdt] Although there is a specialized matrix data type defined in NumPy, it's more standard to work with ordinary NumPy arrays. - See `this discussion `__. + See `this discussion `__. .. [#cfn] Suppose that :math:`\|S \| < 1`. Take any nonzero vector :math:`x`, and let :math:`r := \|x\|`. We have :math:`\| Sx \| = r \| S (x/r) \| \leq r \| S \| < r = \| x\|`. Hence every point is pulled towards the origin. diff --git a/source/rst/linear_models.rst b/source/rst/linear_models.rst index 7f96fc3..d15c07e 100644 --- a/source/rst/linear_models.rst +++ b/source/rst/linear_models.rst @@ -22,7 +22,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -41,24 +41,23 @@ Its many applications include: * predicting a geometric sum of future values of a variable like - * non-financial income + * non-financial income - * dividends on a stock + * dividends on a stock - * the money supply - - * a government deficit or surplus, etc. + * the money supply + * a government deficit or surplus, etc. * key ingredient of useful models - * Friedman's permanent income model of consumption smoothing. + * Friedman's permanent income model of consumption smoothing. - * Barro's model of smoothing total tax collections. + * Barro's model of smoothing total tax collections. - * Rational expectations version of Cagan's model of hyperinflation. + * Rational expectations version of Cagan's model of hyperinflation. - * Sargent and Wallace's "unpleasant monetarist arithmetic," etc. + * Sargent and Wallace's "unpleasant monetarist arithmetic," etc. Let's start with some imports: @@ -1491,7 +1490,7 @@ The code implements a class for handling linear state space models (simulations, One Python construct you might not be familiar with is the use of a generator function in the method ``moment_sequence()``. 
-Go back and `read the relevant documentation `__ if you've forgotten how generator functions work. +Go back and `read the relevant documentation `__ if you've forgotten how generator functions work. diff --git a/source/rst/lln_clt.rst b/source/rst/lln_clt.rst index 15e2491..8a020d5 100644 --- a/source/rst/lln_clt.rst +++ b/source/rst/lln_clt.rst @@ -419,7 +419,7 @@ To this end, we now perform the following simulation Here's some code that does exactly this for the exponential distribution :math:`F(x) = 1 - e^{- \lambda x}`. -(Please experiment with other choices of :math:`F`, but remember that, to conform with the conditions of the CLT, the distribution must have a finite second moment) +(Please experiment with other choices of :math:`F`, but remember that, to conform with the conditions of the CLT, the distribution must have a finite second moment.) .. _sim_one: @@ -482,7 +482,7 @@ random variable, the distribution of :math:`Y_n` will smooth out into a bell-sha The next figure shows this process for :math:`X_i \sim f`, where :math:`f` was specified as the convex combination of three different beta densities. -(Taking a convex combination is an easy way to produce an irregular shape for :math:`f`) +(Taking a convex combination is an easy way to produce an irregular shape for :math:`f`.) In the figure, the closest density is that of :math:`Y_1`, while the furthest is that of :math:`Y_5` @@ -715,7 +715,7 @@ If :math:`g \colon \mathbb R \to \mathbb R` is differentiable at :math:`\mu` and This theorem is used frequently in statistics to obtain the asymptotic distribution of estimators --- many of which can be expressed as functions of sample means. -(These kinds of results are often said to use the "delta method") +(These kinds of results are often said to use the "delta method".) The proof is based on a Taylor expansion of :math:`g` around the point :math:`\mu`. 
@@ -816,7 +816,7 @@ Given the distribution of :math:`\mathbf Z`, we conclude that where :math:`\chi^2(k)` is the chi-squared distribution with :math:`k` degrees of freedom. -(Recall that :math:`k` is the dimension of :math:`\mathbf X_i`, the underlying random vectors) +(Recall that :math:`k` is the dimension of :math:`\mathbf X_i`, the underlying random vectors.) Your second exercise is to illustrate the convergence in :eq:`lln_ctc` with a simulation. diff --git a/source/rst/lq_inventories.rst b/source/rst/lq_inventories.rst index 4432d49..cd30b48 100644 --- a/source/rst/lq_inventories.rst +++ b/source/rst/lq_inventories.rst @@ -17,13 +17,14 @@ In addition to what's in Anaconda, this lecture employs the following library: .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ========= -This lecture can be viewed as an application of the :doc:`quantecon lecture`. +This lecture can be viewed as an application of this :doc:`quantecon lecture` about linear quadratic control +theory. It formulates a discounted dynamic program for a firm that chooses a production schedule to balance @@ -35,19 +36,19 @@ chooses a production schedule to balance In the tradition of a classic book by Holt, Modigliani, Muth, and Simon :cite:`Holt_Modigliani_Muth_Simon`, we simplify the firm’s problem by formulating it as a linear quadratic discounted -dynamic programming problem of the type studied in this :doc:`quantecon`. +dynamic programming problem of the type studied in this :doc:`quantecon lecture`. Because its costs of production are increasing and quadratic in -production, the firm wants to smooth production across time provided +production, the firm holds inventories as a buffer stock in order to smooth production across time, provided that holding inventories is not too costly. 
-But the firm also prefers to sell out of existing inventories, a +But the firm also wants to make its sales out of existing inventories, a preference that we represent by a cost that is quadratic in the difference between sales in a period and the firm’s beginning of period inventories. -We compute examples designed to indicate how the firm optimally chooses -to smooth production and manage inventories while keeping inventories +We compute examples designed to indicate how the firm optimally +smooths production while keeping inventories close to sales. To introduce components of the model, let @@ -62,7 +63,7 @@ To introduce components of the model, let :math:`d_1>0, d_2 >0`, be a cost-of-holding-inventories function, consisting of two components: - - a cost :math:`d_1 t` of carrying inventories, and + - a cost :math:`d_1 I_t` of carrying inventories, and - a cost :math:`d_2 (S_t - I_t)^2` of having inventories deviate from sales @@ -75,7 +76,7 @@ To introduce components of the model, let be the present value of the firm’s profits at time :math:`0` - :math:`I_{t+1} = I_t + Q_t - S_t` be the law of motion of inventories -- :math:`z_{t+1} = A_{22} z_t + C_2 \epsilon_{t+1}` be the law +- :math:`z_{t+1} = A_{22} z_t + C_2 \epsilon_{t+1}` be a law of motion for an exogenous state vector :math:`z_t` that contains time :math:`t` information useful for predicting the demand shock :math:`v_t` @@ -118,20 +119,19 @@ appears in the firm’s one-period profit function) We can express the firm’s profit as a function of states and controls as -.. math:: \pi_t = - (x_t' R x_t + u_t' Q u_t + 2 u_t' H x_t ) +.. math:: \pi_t = - (x_t' R x_t + u_t' Q u_t + 2 u_t' N x_t ) -To form the matrices :math:`R, Q, H`, we note that the firm’s profits at +To form the matrices :math:`R, Q, N` in an LQ dynamic programming problem, we note that the firm’s profits at time :math:`t` function can be expressed .. 
math:: - - \begin{equation} - \begin{split} + \begin{aligned} \pi_{t} =&p_{t}S_{t}-c\left(Q_{t}\right)-d\left(I_{t},S_{t}\right) \\ =&\left(a_{0}-a_{1}S_{t}+v_{t}\right)S_{t}-c_{1}Q_{t}-c_{2}Q_{t}^{2}-d_{1}I_{t}-d_{2}\left(S_{t}-I_{t}\right)^{2} \\ =&a_{0}S_{t}-a_{1}S_{t}^{2}+Gz_{t}S_{t}-c_{1}Q_{t}-c_{2}Q_{t}^{2}-d_{1}I_{t}-d_{2}S_{t}^{2}-d_{2}I_{t}^{2}+2d_{2}S_{t}I_{t} \\ - =&-\left(\underset{x_{t}^{\prime}Rx_{t}}{\underbrace{d_{1}I_{t}+d_{2}I_{t}^{2}}}\underset{u_{t}^{\prime}Qu_{t}}{\underbrace{+a_{1}S_{t}^{2}+d_{2}S_{t}^{2}+c_{2}Q_{t}^{2}}}\underset{2u_{t}^{\prime}Hx_{t}}{\underbrace{-a_{0}S_{t}-Gz_{t}S_{t}+c_{1}Q_{t}-2d_{2}S_{t}I_{t}}}\right) \\ + =&-\left(\underset{x_{t}^{\prime}Rx_{t}}{\underbrace{d_{1}I_{t}+d_{2}I_{t}^{2}}}\underset{u_{t}^{\prime}Qu_{t}}{\underbrace{+a_{1}S_{t}^{2}+d_{2}S_{t}^{2}+c_{2}Q_{t}^{2}}} + \underset{2u_{t}^{\prime}N x_{t}}{\underbrace{-a_{0}S_{t}-Gz_{t}S_{t}+c_{1}Q_{t}-2d_{2}S_{t}I_{t}}}\right) \\ =&-\left(\left[\begin{array}{cc} I_{t} & z_{t}^{\prime}\end{array}\right]\underset{\equiv R}{\underbrace{\left[\begin{array}{cc} d_{2} & \frac{d_{1}}{2}S_{c}\\ @@ -154,13 +154,12 @@ time :math:`t` function can be expressed I_{t}\\ z_{t} \end{array}\right]\right) - \end{split} - \end{equation} + \end{aligned} where :math:`S_{c}=\left[1,0\right]`. **Remark on notation:** The notation for cross product term in the -QuantEcon library is :math:`N` instead of :math:`H`. +QuantEcon library is :math:`N`. The firms’ optimum decision rule takes the form @@ -170,6 +169,15 @@ and the evolution of the state under the optimal decision rule is .. math:: x_{t+1} = (A - BF ) x_t + C \epsilon_{t+1} + +The firm chooses a decision rule for :math:`u_t` that maximizes + +.. math:: E_0 \sum_{t=0}^\infty \beta^t \pi_t + +subject to a given :math:`x_0`. + +This is a stochastic discounted LQ dynamic program. + Here is code for computing an optimal decision rule and for analyzing its consequences. @@ -315,7 +323,7 @@ its consequences. 
Notice that the above code sets parameters at the following default values -- discount factor β=0.96, +- discount factor :math:`\beta=0.96`, - inverse demand function: :math:`a0=10, a1=1` @@ -469,7 +477,7 @@ We introduce this :math:`I_t` **is hardwired to zero** specification in order to shed light on the role that inventories play by comparing outcomes with those under our two other versions of the problem. -The bottom right panel displays an production path for the original +The bottom right panel displays a production path for the original problem that we are interested in (the blue line) as well with an optimal production path for the model in which inventories are not useful (the green path) and also for the model in which, although diff --git a/source/rst/lqcontrol.rst b/source/rst/lqcontrol.rst index 391333e..d8a6bde 100644 --- a/source/rst/lqcontrol.rst +++ b/source/rst/lqcontrol.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -145,7 +145,7 @@ Another alteration that's useful to introduce (we'll see why soon) is to change the control variable from consumption to the deviation of consumption from some "ideal" quantity :math:`\bar c`. -(Most parameterizations will be such that :math:`\bar c` is large relative to the amount of consumption that is attainable in each period, and hence the household wants to increase consumption) +(Most parameterizations will be such that :math:`\bar c` is large relative to the amount of consumption that is attainable in each period, and hence the household wants to increase consumption.) For this reason, we now take our control to be :math:`u_t := c_t - \bar c`. @@ -288,7 +288,7 @@ and :math:`Q` to be identity matrices so that current loss is Thus, for both the state and the control, loss is measured as squared distance from the origin. 
(In fact, the general case :eq:`lq_pref_flow` can also be understood in this -way, but with :math:`R` and :math:`Q` identifying other -- non-Euclidean -- notions of "distance" from the zero vector). +way, but with :math:`R` and :math:`Q` identifying other -- non-Euclidean -- notions of "distance" from the zero vector.) Intuitively, we can often think of the state :math:`x_t` as representing deviation from a target, such as @@ -542,7 +542,7 @@ and d_{T-1} := \beta \mathop{\mathrm{trace}}(C' P_T C) -(The algebra is a good exercise --- we'll leave it up to you) +(The algebra is a good exercise --- we'll leave it up to you.) If we continue working backwards in this manner, it soon becomes clear that :math:`J_t (x) = x' P_t x + d_t` as claimed, where :math:`\{P_t\}` and :math:`\{d_t\}` satisfy the recursions @@ -603,25 +603,25 @@ are wrapped in a class called ``LQ``, which includes * Instance data: - * The required parameters :math:`Q, R, A, B` and optional parameters `C, β, T, R_f, N` specifying a given LQ model + * The required parameters :math:`Q, R, A, B` and optional parameters :math:`C, \beta, T, R_f, N` specifying a given LQ model - * set :math:`T` and :math:`R_f` to ``None`` in the infinite horizon case + * set :math:`T` and :math:`R_f` to ``None`` in the infinite horizon case - * set ``C = None`` (or zero) in the deterministic case + * set ``C = None`` (or zero) in the deterministic case - * the value function and policy data + * the value function and policy data - * :math:`d_t, P_t, F_t` in the finite horizon case + * :math:`d_t, P_t, F_t` in the finite horizon case - * :math:`d, P, F` in the infinite horizon case + * :math:`d, P, F` in the infinite horizon case * Methods: - * ``update_values`` --- shifts :math:`d_t, P_t, F_t` to their :math:`t-1` values via :eq:`lq_pr`, :eq:`lq_dd` and :eq:`lq_oc` + * ``update_values`` --- shifts :math:`d_t, P_t, F_t` to their :math:`t-1` values via :eq:`lq_pr`, :eq:`lq_dd` and :eq:`lq_oc` - * ``stationary_values`` --- 
computes :math:`P, d, F` in the infinite horizon case + * ``stationary_values`` --- computes :math:`P, d, F` in the infinite horizon case - * ``compute_sequence`` ---- simulates the dynamics of :math:`x_t, u_t, w_t` given :math:`x_0` and assuming standard normal shocks + * ``compute_sequence`` ---- simulates the dynamics of :math:`x_t, u_t, w_t` given :math:`x_0` and assuming standard normal shocks .. _lq_mfpa: @@ -636,7 +636,7 @@ Data contradicted the constancy of the marginal propensity to consume. In response, Milton Friedman, Franco Modigliani and others built models based on a consumer's preference for an intertemporally smooth consumption stream. -(See, for example, :cite:`Friedman1956` or :cite:`ModiglianiBrumberg1954`) +(See, for example, :cite:`Friedman1956` or :cite:`ModiglianiBrumberg1954`.) One property of those models is that households purchase and sell financial assets to make consumption streams smoother than income streams. @@ -657,7 +657,7 @@ subject to the sequence of budget constraints :math:`a_{t+1} = (1 + r) a_t - c_t Here :math:`q` is a large positive constant, the role of which is to induce the consumer to target zero debt at the end of her life. -(Without such a constraint, the optimal choice is to choose :math:`c_t = \bar c` in each period, letting assets adjust accordingly) +(Without such a constraint, the optimal choice is to choose :math:`c_t = \bar c` in each period, letting assets adjust accordingly.) As before we set :math:`y_t = \sigma w_{t+1} + \mu` and :math:`u_t := c_t - \bar c`, after which the constraint can be written as in :eq:`lq_lomwc`. @@ -765,7 +765,7 @@ As anticipated by the discussion on consumption smoothing, the time path of consumption is much smoother than that for income. (But note that consumption becomes more irregular towards the end of life, -when the zero final asset requirement impinges more on consumption choices). +when the zero final asset requirement impinges more on consumption choices.) 
The second panel in the figure shows that the time path of assets :math:`a_t` is closely correlated with cumulative unanticipated income, where the latter is defined as @@ -778,7 +778,7 @@ closely correlated with cumulative unanticipated income, where the latter is def A key message is that unanticipated windfall gains are saved rather than consumed, while unanticipated negative shocks are met by reducing assets. -(Again, this relationship breaks down towards the end of life due to the zero final asset requirement) +(Again, this relationship breaks down towards the end of life due to the zero final asset requirement.) These results are relatively robust to changes in parameters. @@ -1028,7 +1028,7 @@ subject to :math:`a_{t+1} = (1 + r) a_t - c_t + y_t, \ t \geq 0`. For income we now take :math:`y_t = p(t) + \sigma w_{t+1}` where :math:`p(t) := m_0 + m_1 t + m_2 t^2`. -(In :ref:`the next section ` we employ some tricks to implement a more sophisticated model) +(In :ref:`the next section ` we employ some tricks to implement a more sophisticated model.) The coefficients :math:`m_0, m_1, m_2` are chosen such that :math:`p(0)=0, p(T/2) = \mu,` and :math:`p(T)=0`. @@ -1189,7 +1189,7 @@ However, we can still use our LQ methods here by suitably linking two-component These two LQ problems describe the consumer's behavior during her working life (``lq_working``) and retirement (``lq_retired``). (This is possible because, in the two separate periods of life, the respective income processes -[polynomial trend and constant] each fit the LQ framework) +[polynomial trend and constant] each fit the LQ framework.) 
The basic idea is that although the whole problem is not a single time-invariant LQ problem, it is still a dynamic programming problem, and hence we can use appropriate Bellman equations at @@ -1313,7 +1313,7 @@ Let's now replace :math:`\pi_t` in :eq:`lq_object_mp` with :math:`\hat \pi_t := This makes no difference to the solution, since :math:`a_1 \bar q_t^2` does not depend on the controls. -(In fact, we are just adding a constant term to :eq:`\lq_object_mp`, and optimizers are not affected by constant terms) +(In fact, we are just adding a constant term to :eq:`\lq_object_mp`, and optimizers are not affected by constant terms.) The reason for making this substitution is that, as you will be able to verify, :math:`\hat \pi_t` reduces to the simple quadratic diff --git a/source/rst/markov_asset.rst b/source/rst/markov_asset.rst index 8c82ede..b751eb5 100644 --- a/source/rst/markov_asset.rst +++ b/source/rst/markov_asset.rst @@ -27,7 +27,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview @@ -142,7 +142,7 @@ The way anticipated future payoffs are evaluated can now depend on various rando One example of this idea is that assets that tend to have good payoffs in bad states of the world might be regarded as more valuable. -This is because they pay well when the funds are more urgently needed. +This is because they pay well when funds are more urgently wanted. We give examples of how the stochastic discount factor has been modeled below. @@ -173,6 +173,10 @@ It is useful to regard equation :eq:`lteeqs102` as a generalization of equatio * In equation :eq:`rnapex`, the covariance term :math:`{\rm cov}_t (m_{t+1}, d_{t+1}+ p_{t+1})` is zero because :math:`m_{t+1} = \beta`. +* In equation :eq:`rnapex`, :math:`{\mathbb E}_t m_{t+1}` can be interpreted as the reciprocal of the one-period risk-free gross interest rate. 
+* When :math:`m_{t+1}` covaries more negatively with the payout :math:`p_{t+1} + d_{t+1}`, the price of the asset is lower.
@@ -476,7 +482,7 @@ The anticipation of high future dividend growth leads to a high price-dividend r -Asset Prices under Risk Aversion +Risk Aversion and Asset Prices ================================ @@ -484,7 +490,7 @@ Now let's turn to the case where agents are risk averse. We'll price several distinct assets, including -* The price of an endowment stream +* An endowment stream * A consol (a type of bond issued by the UK government in the 19th century) @@ -517,7 +523,7 @@ where :math:`u` is a concave utility function and :math:`c_t` is time :math:`t` (A derivation of this expression is given in a `later lecture `__) -Assume the existence of an endowment that follows :eq:`mass_fmce`. +Assume the existence of an endowment that follows growth process :eq:`mass_fmce`. The asset being priced is a claim on the endowment process. @@ -819,7 +825,9 @@ The above is implemented in the function `consol_price`. Pricing an Option to Purchase the Consol ---------------------------------------- -Let's now price options of varying maturity that give the right to purchase a consol at a price :math:`p_S`. +Let's now price options of varying maturities. + +We'll study an option that gives the owner the right to purchase a consol at a price :math:`p_S`. An Infinite Horizon Call Option @@ -827,21 +835,21 @@ An Infinite Horizon Call Option We want to price an infinite horizon option to purchase a consol at a price :math:`p_S`. -The option entitles the owner at the beginning of a period either to +The option entitles the owner at the beginning of a period either -#. purchase the bond at price :math:`p_S` now, or +#. to purchase the bond at price :math:`p_S` now, or -#. Not to exercise the option now but to retain the right to exercise it later +#. not to exercise the option to purchase the asset now but to retain the right to exercise it later Thus, the owner either *exercises* the option now or chooses *not to exercise* and wait until next period. 
This is termed an infinite-horizon *call option* with *strike price* :math:`p_S`. -The owner of the option is entitled to purchase the consol at the price :math:`p_S` at the beginning of any period, after the coupon has been paid to the previous owner of the bond. +The owner of the option is entitled to purchase the consol at price :math:`p_S` at the beginning of any period, after the coupon has been paid to the previous owner of the bond. The fundamentals of the economy are identical with the one above, including the stochastic discount factor and the process for consumption. -Let :math:`w(X_t, p_S)` be the value of the option when the time :math:`t` growth state is known to be :math:`X_t` but *before* the owner has decided whether or not to exercise the option +Let :math:`w(X_t, p_S)` be the value of the option when the time :math:`t` growth state is known to be :math:`X_t` but *before* the owner has decided whether to exercise the option at time :math:`t` (i.e., today). Recalling that :math:`p(X_t)` is the value of the consol when the initial growth state is :math:`X_t`, the value of the option satisfies @@ -888,7 +896,7 @@ into vector :math:`Tw` via = \max \{ \beta M w,\; p - p_S {\mathbb 1} \} -Start at some initial :math:`w` and iterate to convergence with :math:`T`. +Start at some initial :math:`w` and iterate with :math:`T` to convergence . We can find the solution with the following function `call_option` @@ -966,12 +974,12 @@ Here's a plot of :math:`w` compared to the consol price when :math:`P_S = 40` -In large states, the value of the option is close to zero. +In high values of the Markov growth state, the value of the option is close to zero. -This is despite the fact the Markov chain is irreducible and low states --- -where the consol prices are high --- will eventually be visited. +This is despite the facts that the Markov chain is irreducible and that low states --- +where the consol prices are high --- will be visited recurrently. 
-The reason is that :math:`\beta=0.9`, so the future is discounted relatively rapidly. +The reason for low valuations in high Markov growth states is that :math:`\beta=0.9`, so future payoffs are discounted substantially. diff --git a/source/rst/markov_perf.rst b/source/rst/markov_perf.rst index 0d43cac..4a1aef4 100644 --- a/source/rst/markov_perf.rst +++ b/source/rst/markov_perf.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -146,7 +146,7 @@ The adjective "Markov" denotes that the equilibrium decision rules depend only o "Perfect" means complete, in the sense that the equilibrium is constructed by backward induction and hence builds in optimizing behavior for each firm at all possible future states. - * These include many states that will not be reached when we iterate forward on the pair of equilibrium strategies :math:`f_i` starting from a given initial state. +* These include many states that will not be reached when we iterate forward on the pair of equilibrium strategies :math:`f_i` starting from a given initial state. diff --git a/source/rst/mccall_correlated.rst b/source/rst/mccall_correlated.rst index 81ced9b..2fd6844 100644 --- a/source/rst/mccall_correlated.rst +++ b/source/rst/mccall_correlated.rst @@ -17,7 +17,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation Overview @@ -44,7 +44,8 @@ We will use the following imports: import quantecon as qe from interpolation import interp from numpy.random import randn - from numba import njit, jitclass, prange, float64 + from numba import njit, prange, float64 + from numba.experimental import jitclass The Model @@ -329,7 +330,7 @@ reservation wage: js = JobSearch(c=c) f_star = compute_fixed_point(js, verbose=False) res_wage_function = np.exp(f_star * (1 - js.β)) - ax.plot(js.z_grid, res_wage_function, label=f"$\\bar w$ at $c = {c}$") + ax.plot(js.z_grid, res_wage_function, label=rf"$\bar w$ at $c = {c}$") ax.set(xlabel="$z$", ylabel="wage") ax.legend() @@ -455,7 +456,7 @@ Here is one solution. fig, ax = plt.subplots() ax.plot(beta_vals, durations) - ax.set_xlabel("$\\beta$") + ax.set_xlabel(r"$\beta$") ax.set_ylabel("mean unemployment duration") plt.show() diff --git a/source/rst/mccall_fitted_vfi.rst b/source/rst/mccall_fitted_vfi.rst index b65a790..65c5587 100644 --- a/source/rst/mccall_fitted_vfi.rst +++ b/source/rst/mccall_fitted_vfi.rst @@ -15,7 +15,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation @@ -53,7 +53,8 @@ We will use the following imports: import quantecon as qe from interpolation import interp from numpy.random import randn - from numba import njit, jitclass, prange, float64, int32 + from numba import njit, prange, float64, int32 + from numba.experimental import jitclass @@ -340,7 +341,7 @@ Exercise 1 Use the code above to explore what happens to the reservation wage when the wage parameter :math:`\mu` changes. 
-Use the default parameters and :math:`\mu` in ``mu_vals = np.linspace(0.0, 2.0, 15)`` +Use the default parameters and :math:`\mu` in ``mu_vals = np.linspace(0.0, 2.0, 15)``. Is the impact on the reservation wage as you expected? @@ -360,7 +361,7 @@ support. Use ``s_vals = np.linspace(1.0, 2.0, 15)`` and ``m = 2.0``. -State how you expect the reservation wage vary with :math:`s`. +State how you expect the reservation wage to vary with :math:`s`. Now compute it. Is this as you expected? diff --git a/source/rst/mccall_model.rst b/source/rst/mccall_model.rst index 5e4c514..6be9709 100644 --- a/source/rst/mccall_model.rst +++ b/source/rst/mccall_model.rst @@ -24,7 +24,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -51,7 +51,8 @@ Let's start with some imports: .. code-block:: ipython import numpy as np - from numba import jit, jitclass, float64 + from numba import jit, float64 + from numba.experimental import jitclass import matplotlib.pyplot as plt %matplotlib inline import quantecon as qe @@ -306,7 +307,7 @@ Step 4: if the deviation is larger than some fixed tolerance, set :math:`v = v'` Step 5: return :math:`v`. -Let :math:`\{ v_k \}` denote the sequence genererated by this algorithm. +Let :math:`\{ v_k \}` denote the sequence generated by this algorithm. This sequence converges to the solution to :eq:`odu_pv2` as :math:`k \to \infty`, which is the value function :math:`v^*`. @@ -333,7 +334,7 @@ itself via (A new vector :math:`Tv` is obtained from given vector :math:`v` by evaluating -the r.h.s. at each :math:`i`) +the r.h.s. at each :math:`i`.) The element :math:`v_k` in the sequence :math:`\{v_k\}` of successive approximations corresponds to :math:`T^k v`. 
diff --git a/source/rst/mccall_model_with_separation.rst b/source/rst/mccall_model_with_separation.rst index e0ddd01..91ddc85 100644 --- a/source/rst/mccall_model_with_separation.rst +++ b/source/rst/mccall_model_with_separation.rst @@ -19,7 +19,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon @@ -49,7 +49,8 @@ We'll need the following imports import matplotlib.pyplot as plt %matplotlib inline - from numba import njit, jitclass, float64 + from numba import njit, float64 + from numba.experimental import jitclass from quantecon.distributions import BetaBinomial The Model @@ -120,7 +121,7 @@ If he rejects, then he receives unemployment compensation :math:`c`. The process then repeats. -(Note: we do not allow for job search while employed---this topic is taken up in a :doc:`later lecture `) +(Note: we do not allow for job search while employed---this topic is taken up in a :doc:`later lecture `.) 
diff --git a/source/rst/mle.rst b/source/rst/mle.rst index 0cc6a71..fc581e7 100644 --- a/source/rst/mle.rst +++ b/source/rst/mle.rst @@ -949,10 +949,10 @@ The Hessian of the Probit model is \frac {\partial^2 \log \mathcal{L}} {\partial \boldsymbol{\beta} \partial \boldsymbol{\beta}'} = - - \sum_{i=1}^n \phi (\mathbf{x}_i' \boldsymbol{\beta}) + -\sum_{i=1}^n \phi (\mathbf{x}_i' \boldsymbol{\beta}) \Big[ y_i \frac{ \phi (\mathbf{x}_i' \boldsymbol{\beta}) + \mathbf{x}_i' \boldsymbol{\beta} \Phi (\mathbf{x}_i' \boldsymbol{\beta}) } { [\Phi (\mathbf{x}_i' \boldsymbol{\beta})]^2 } + - (1 - y_i) \frac{ \phi_i (\mathbf{x}_i' \boldsymbol{\beta}) - \mathbf{x}_i' \boldsymbol{\beta} (1 - \Phi (\mathbf{x}_i' \boldsymbol{\beta})) } { [1 - \Phi (\mathbf{x}_i' \boldsymbol{\beta})]^2 } + (1 - y_i) \frac{ \phi (\mathbf{x}_i' \boldsymbol{\beta}) - \mathbf{x}_i' \boldsymbol{\beta} (1 - \Phi (\mathbf{x}_i' \boldsymbol{\beta})) } { [1 - \Phi (\mathbf{x}_i' \boldsymbol{\beta})]^2 } \Big] \mathbf{x}_i \mathbf{x}_i' diff --git a/source/rst/multi_hyper.rst b/source/rst/multi_hyper.rst index acc34a8..45176d3 100644 --- a/source/rst/multi_hyper.rst +++ b/source/rst/multi_hyper.rst @@ -64,12 +64,13 @@ Things have to add up so :math:`\sum_{i=1}^c k_i = n`. Under the hypothesis that the selection process judges proposals on their quality and that quality is independent of continent of the author's continent of residence, the administrator views the outcome of the selection procedure as a random vector .. math:: + X = \begin{pmatrix} k_1 \cr k_2 \cr \vdots \cr k_c \end{pmatrix}. To evaluate whether the selection procedure is **color blind** the administrator wants to study whether the particular realization of :math:`X` drawn can plausibly be said to be a random draw from the probability distribution that is implied by the **color blind** hypothesis. -The appropriate probability distribution is the one described `here `__ +The appropriate probability distribution is the one described `here `__. 
Let's now instantiate the administrator's problem, while continuing to use the colored balls metaphor. @@ -87,6 +88,7 @@ So :math:`n = 15`. The administrator wants to know the probability distribution of outcomes .. math:: + X = \begin{pmatrix} k_1 \cr k_2 \cr \vdots \cr k_4 \end{pmatrix}. In particular, he wants to know whether a particular @@ -145,14 +147,12 @@ The multivariate hypergeometric distribution has the following properties: .. math:: - {\displaystyle \operatorname {E} (X_{i})=n{\frac {K_{i}}{N}}} **Variances and covariances**: .. math:: - {\displaystyle \operatorname {Var} (X_{i})=n{\frac {N-n}{N-1}}\;{\frac {K_{i}}{N}}\left(1-{\frac {K_{i}}{N}}\right)} .. math:: @@ -271,7 +271,6 @@ two of each color are chosen is .. math:: - P(2{\text{ black}},2{\text{ white}},2{\text{ red}})={{{5 \choose 2}{10 \choose 2}{15 \choose 2}} \over {30 \choose 6}}=0.079575596816976 .. code-block:: python3 @@ -456,9 +455,9 @@ Note the substantial differences between hypergeometric distribution and the app The off-diagonal graphs plot the empirical joint distribution of :math:`k_i` and :math:`k_j` for each pair :math:`(i, j)`. - The darker the blue, the more data points are contained in the corresponding cell. (Note that :math:`k_i` is on the x-axis and :math:`k_j` is on the y-axis). +The darker the blue, the more data points are contained in the corresponding cell. (Note that :math:`k_i` is on the x-axis and :math:`k_j` is on the y-axis). - The contour maps plot the bivariate Gaussian density function of :math:`\left(k_i, k_j\right)` with the population mean and covariance given by slices of :math:`\mu` and :math:`\Sigma` that we computed above. +The contour maps plot the bivariate Gaussian density function of :math:`\left(k_i, k_j\right)` with the population mean and covariance given by slices of :math:`\mu` and :math:`\Sigma` that we computed above. 
Let's also test the normality for each :math:`k_i` using ``scipy.stats.normaltest`` that implements D’Agostino and Pearson's test that combines skew and kurtosis to form an omnibus test of normality. diff --git a/source/rst/multivariate_normal.rst b/source/rst/multivariate_normal.rst new file mode 100644 index 0000000..7fbbd81 --- /dev/null +++ b/source/rst/multivariate_normal.rst @@ -0,0 +1,2125 @@ +.. _multivariate_normal_v11: + +.. include:: /_static/includes/header.raw + +.. highlight:: python3 + +*********************************** +Multivariate Normal Distribution +*********************************** + +.. contents:: :depth: 2 + + + + + +Overview +========= + + +This lecture describes a workhorse in probability theory, statistics, and economics, namely, +the **multivariate normal distribution**. + +In this lecture, you will learn formulas for + +* the joint distribution of a random vector :math:`x` of length :math:`N` + +* marginal distributions for all subvectors of :math:`x` + +* conditional distributions for subvectors of :math:`x` conditional on other subvectors of :math:`x` + +We will use the multivariate normal distribution to formulate some classic models: + +* a **factor analytic model** of an intelligence quotient, i.e., IQ + +* a **factor analytic model** of two independent inherent abilities, mathematical and verbal. + +* a more general factor analytic model + +* PCA as an approximation to a factor analytic model + +* time series generated by linear stochastic difference equations + +* optimal linear filtering theory + + +The Multivariate Normal Distribution +==================================== + + + +This lecture defines a Python class ``MultivariateNormal`` to be used +to generate **marginal** and **conditional** distributions associated +with a multivariate normal distribution. 
+ +For a multivariate normal distribution it is very convenient that + +- conditional expectations equal linear least squares projections + +- conditional distributions are characterized by multivariate linear + regressions + +We apply our Python class to some classic examples. + +We will use the following imports: + +.. code-block:: ipython + + import numpy as np + from numba import njit + import statsmodels.api as sm + import matplotlib.pyplot as plt + %matplotlib inline + +Assume that an :math:`N \times 1` random vector :math:`z` has a +multivariate normal probability density. + +This means that the probability density takes the form + +.. math:: + + + f\left(z;\mu,\Sigma\right)=\left(2\pi\right)^{-\left(\frac{N}{2}\right)}\det\left(\Sigma\right)^{-\frac{1}{2}}\exp\left(-.5\left(z-\mu\right)^{\prime}\Sigma^{-1}\left(z-\mu\right)\right) + +where :math:`\mu=Ez` is the mean of the random vector :math:`z` and +:math:`\Sigma=E\left(z-\mu\right)\left(z-\mu\right)^\prime` is the +covariance matrix of :math:`z`. + +.. code-block:: python3 + + @njit + def f(z, μ, Σ): + """ + The density function of multivariate normal distribution. + + Parameters + --------------- + z: ndarray(float, dim=2) + random vector, N by 1 + μ: ndarray(float, dim=1 or 2) + the mean of z, N by 1 + Σ: ndarray(float, dim=2) + the covarianece matrix of z, N by 1 + """ + + z = np.atleast_2d(z) + μ = np.atleast_2d(μ) + Σ = np.atleast_2d(Σ) + + N = z.size + + temp1 = np.linalg.det(Σ) ** (-1/2) + temp2 = np.exp(-.5 * (z - μ).T @ np.linalg.inv(Σ) @ (z - μ)) + + return (2 * np.pi) ** (-N/2) * temp1 * temp2 + +For some integer :math:`k\in \{2,\dots, N-1\}`, partition +:math:`z` as +:math:`z=\left[\begin{array}{c} z_{1}\\ z_{2} \end{array}\right]`, where +:math:`z_1` is an :math:`\left(N-k\right)\times1` vector and :math:`z_2` +is a :math:`k\times1` vector. + +Let + +.. 
math:: + + + \mu=\left[\begin{array}{c} + \mu_{1}\\ + \mu_{2} + \end{array}\right],\quad\Sigma=\left[\begin{array}{cc} + \Sigma_{11} & \Sigma_{12}\\ + \Sigma_{21} & \Sigma_{22} + \end{array}\right] + +be corresponding partitions of :math:`\mu` and :math:`\Sigma`. + +The **marginal** distribution of :math:`z_1` is + +- multivariate normal with mean :math:`\mu_1` and covariance matrix + :math:`\Sigma_{11}`. + +The **marginal** distribution of :math:`z_2` is + +- multivariate normal with mean :math:`\mu_2` and covariance matrix + :math:`\Sigma_{22}`. + +The distribution of :math:`z_1` **conditional** on :math:`z_2` is + +- multivariate normal with mean + +.. math:: + + + \hat{\mu}_1 = \mu_1 + \beta \left(z_2 -\mu_2\right) + +and covariance matrix + +.. math:: + + + \hat{\Sigma}_{11}=\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{21}=\Sigma_{11}-\beta\Sigma_{22}\beta^{\prime} + +where + +.. math:: + + + \beta = \Sigma_{12}\Sigma_{22}^{-1} + +is an :math:`\left(N-k\right) \times k` matrix of **population +regression coefficients** of :math:`z_1 - \mu_1` on :math:`z_2 - \mu_2`. + +The following class constructs a multivariate normal distribution +instance with two methods. + +- a method ``partition`` computes :math:`\beta`, taking :math:`k` as an + input + +- a method ``cond_dist`` computes either the distribution of + :math:`z_1` conditional on :math:`z_2` or the distribution of + :math:`z_2` conditional on :math:`z_1` + +.. code-block:: python3 + + class MultivariateNormal: + """ + Class of multivariate normal distribution. 
+ + Parameters + ---------- + μ: ndarray(float, dim=1) + the mean of z, N by 1 + Σ: ndarray(float, dim=2) + the covarianece matrix of z, N by 1 + + Arguments + --------- + μ, Σ: + see parameters + μs: list(ndarray(float, dim=1)) + list of mean vectors μ1 and μ2 in order + Σs: list(list(ndarray(float, dim=2))) + 2 dimensional list of covariance matrices + Σ11, Σ12, Σ21, Σ22 in order + βs: list(ndarray(float, dim=1)) + list of regression coefficients β1 and β2 in order + """ + + def __init__(self, μ, Σ): + "initialization" + self.μ = np.array(μ) + self.Σ = np.atleast_2d(Σ) + + def partition(self, k): + """ + Given k, partition the random vector z into a size k vector z1 + and a size N-k vector z2. Partition the mean vector μ into + μ1 and μ2, and the covariance matrix Σ into Σ11, Σ12, Σ21, Σ22 + correspondingly. Compute the regression coefficients β1 and β2 + using the partitioned arrays. + """ + μ = self.μ + Σ = self.Σ + + self.μs = [μ[:k], μ[k:]] + self.Σs = [[Σ[:k, :k], Σ[:k, k:]], + [Σ[k:, :k], Σ[k:, k:]]] + + self.βs = [self.Σs[0][1] @ np.linalg.inv(self.Σs[1][1]), + self.Σs[1][0] @ np.linalg.inv(self.Σs[0][0])] + + def cond_dist(self, ind, z): + """ + Compute the conditional distribution of z1 given z2, or reversely. + Argument ind determines whether we compute the conditional + distribution of z1 (ind=0) or z2 (ind=1). + + Returns + --------- + μ_hat: ndarray(float, ndim=1) + The conditional mean of z1 or z2. + Σ_hat: ndarray(float, ndim=2) + The conditional covariance matrix of z1 or z2. + """ + β = self.βs[ind] + μs = self.μs + Σs = self.Σs + + μ_hat = μs[ind] + β @ (z - μs[1-ind]) + Σ_hat = Σs[ind][ind] - β @ Σs[1-ind][1-ind] @ β.T + + return μ_hat, Σ_hat + +Let’s put this code to work on a suite of examples. + +We begin with a simple bivariate example; after that we’ll turn to a +trivariate example. + +We’ll compute population moments of some conditional distributions using +our ``MultivariateNormal`` class. 
+ +Then for fun we’ll compute sample analogs of the associated population +regressions by generating simulations and then computing linear least +squares regressions. + +We’ll compare those linear least squares regressions for the simulated +data to their population counterparts. + +Bivariate Example +================= + +We start with a bivariate normal distribution pinned down by + +.. math:: + + + \mu=\left[\begin{array}{c} + 0\\ + 0 + \end{array}\right],\quad\Sigma=\left[\begin{array}{cc} + 1 & .2\\ + .2 & 1 + \end{array}\right] + +.. code-block:: python3 + + μ = np.array([0., 0.]) + Σ = np.array([[1., .2], [.2 ,1.]]) + + # construction of the multivariate normal instance + multi_normal = MultivariateNormal(μ, Σ) + +.. code-block:: python3 + + k = 1 # choose partition + + # partition and compute regression coefficients + multi_normal.partition(k) + multi_normal.βs[0] + +Let’s compute the mean and variance of the distribution of :math:`z_1` +conditional on :math:`z_2=5`. + +.. code-block:: python3 + + # compute the cond. dist. of z1 + ind = 0 + z2 = np.array([5.]) # given z2 + + μ1_hat, Σ1_hat = multi_normal.cond_dist(ind, z2) + print('μ1_hat, Σ1_hat = ', μ1_hat, Σ1_hat) + +Let’s compare the preceding population mean and variance with outcomes +from drawing a large sample and then regressing :math:`z_1 - \mu_1` on +:math:`z_2 - \mu_2`. + +We know that + +.. math:: + + + E z_1 | z_2 = \left(\mu_1 - \beta \mu_2 \right) + \beta z_2 + +which can be arranged to + +.. math:: + + + z_1 - \mu_1 = \beta \left( z_2 - \mu_2 \right) + \epsilon, + +We anticipate that for larger and larger sample sizes, estimated OLS +coefficients will converge to :math:`\beta` and the estimated variance +of :math:`\epsilon` will converge to :math:`\hat{\Sigma}_1`. + +.. 
code-block:: python3 + + n = 1_000_000 # sample size + + # simulate multivariate normal random vectors + data = np.random.multivariate_normal(μ, Σ, size=n) + z1_data = data[:, 0] + z2_data = data[:, 1] + + # OLS regression + μ1, μ2 = multi_normal.μs + results = sm.OLS(z1_data - μ1, z2_data - μ2).fit() + +Let’s compare the preceding population :math:`\beta` with the OLS sample +estimate on :math:`z_2 - \mu_2` + +.. code-block:: python3 + + multi_normal.βs[0], results.params + +Let’s compare our population :math:`\hat{\Sigma}_1` with the +degrees-of-freedom adjusted estimate of the variance of :math:`\epsilon` + +.. code-block:: python3 + + Σ1_hat, results.resid @ results.resid.T / (n - 1) + +Lastly, let’s compute the estimate of :math:`\hat{E z_1 | z_2}` and +compare it with :math:`\hat{\mu}_1` + +.. code-block:: python3 + + μ1_hat, results.predict(z2 - μ2) + μ1 + +Thus, in each case, for our very large sample size, the sample analogues +closely approximate their population counterparts. + +These close approximations are foretold by a version of a Law of Large +Numbers. + +Trivariate Example +================== + +Let’s apply our code to a trivariate example. + +We’ll specify the mean vector and the covariance matrix as follows. + +.. code-block:: python3 + + μ = np.random.random(3) + C = np.random.random((3, 3)) + Σ = C @ C.T # positive semi-definite + + multi_normal = MultivariateNormal(μ, Σ) + +.. code-block:: python3 + + μ, Σ + +.. code-block:: python3 + + k = 1 + multi_normal.partition(k) + +Let’s compute the distribution of :math:`z_1` conditional on +:math:`z_{2}=\left[\begin{array}{c} 2\\ 5 \end{array}\right]`. + +.. code-block:: python3 + + ind = 0 + z2 = np.array([2., 5.]) + + μ1_hat, Σ1_hat = multi_normal.cond_dist(ind, z2) + +.. code-block:: python3 + + n = 1_000_000 + data = np.random.multivariate_normal(μ, Σ, size=n) + z1_data = data[:, :k] + z2_data = data[:, k:] + +.. 
code-block:: python3 + + μ1, μ2 = multi_normal.μs + results = sm.OLS(z1_data - μ1, z2_data - μ2).fit() + +As above, we compare population and sample regression coefficients, the +conditional covariance matrix, and the conditional mean vector in that +order. + +.. code-block:: python3 + + multi_normal.βs[0], results.params + +.. code-block:: python3 + + Σ1_hat, results.resid @ results.resid.T / (n - 1) + +.. code-block:: python3 + + μ1_hat, results.predict(z2 - μ2) + μ1 + +Once again, sample analogues do a good job of approximating their +populations counterparts. + +One Dimensional Intelligence (IQ) +================================= + +Let’s move closer to a real-life example, namely, inferring a +one-dimensional measure of intelligence called IQ from a list of test +scores. + +The :math:`i`\ th test score :math:`y_i` equals the sum of an unknown +scalar IQ :math:`\theta` and a random variable :math:`w_{i}`. + +.. math:: + + + y_{i} = \theta + \sigma_y w_i, \quad i=1,\dots, n + +The distribution of IQ’s for a cross-section of people is a normal +random variable described by + +.. math:: + + + \theta = \mu_{\theta} + \sigma_{\theta} w_{n+1}. + +We assume the noise in the test scores is IID and not correlated with +IQ. + +In particular, we assume :math:`\{w_i\}_{i=1}^{n+1}` are i.i.d. standard +normal: + +.. math:: + + + \boldsymbol{w}= + \left[\begin{array}{c} + w_{1}\\ + w_{2}\\ + \vdots\\ + w_{n}\\ + w_{n+1} + \end{array}\right]\sim N\left(0,I_{n+1}\right) + +The following system describes the random vector :math:`X` that +interests us: + +.. 
math:: + + + X=\left[\begin{array}{c} + y_{1}\\ + y_{2}\\ + \vdots\\ + y_{n}\\ + \theta + \end{array}\right]=\left[\begin{array}{c} + \mu_{\theta}\\ + \mu_{\theta}\\ + \vdots\\ + \mu_{\theta}\\ + \mu_{\theta} + \end{array}\right]+\left[\begin{array}{ccccc} + \sigma_{y} & 0 & \cdots & 0 & \sigma_{\theta}\\ + 0 & \sigma_{y} & \cdots & 0 & \sigma_{\theta}\\ + \vdots & \vdots & \ddots & \vdots & \vdots\\ + 0 & 0 & \cdots & \sigma_{y} & \sigma_{\theta}\\ + 0 & 0 & \cdots & 0 & \sigma_{\theta} + \end{array}\right]\left[\begin{array}{c} + w_{1}\\ + w_{2}\\ + \vdots\\ + w_{n}\\ + w_{n+1} + \end{array}\right], + +or equivalently, + +.. math:: + + + X=\mu_{\theta}\boldsymbol{1}_{n+1}+D\boldsymbol{w} + +where :math:`X = \begin{bmatrix} y \cr \theta \end{bmatrix}`, +:math:`\boldsymbol{1}_{n+1}` is a vector of :math:`1`\ s of size +:math:`n+1`, and :math:`D` is an :math:`n+1` by :math:`n+1` matrix. + +Let’s define a Python function that constructs the mean :math:`\mu` and +covariance matrix :math:`\Sigma` of the random vector :math:`X` that we +know is governed by a multivariate normal distribution. + +As arguments, the function takes the number of tests :math:`n`, the mean +:math:`\mu_{\theta}` and the standard deviation :math:`\sigma_\theta` of +the IQ distribution, and the standard deviation of the randomness in +test scores :math:`\sigma_{y}`. + +.. code-block:: python3 + + def construct_moments_IQ(n, μθ, σθ, σy): + + μ_IQ = np.ones(n+1) * μθ + + D_IQ = np.zeros((n+1, n+1)) + D_IQ[range(n), range(n)] = σy + D_IQ[:, n] = σθ + + Σ_IQ = D_IQ @ D_IQ.T + + return μ_IQ, Σ_IQ, D_IQ + +Now let’s consider a specific instance of this model. + +Assume we have recorded :math:`50` test scores and we know that +:math:`\mu_{\theta}=100`, :math:`\sigma_{\theta}=10`, and +:math:`\sigma_{y}=10`. + +We can compute the mean vector and covariance matrix of :math:`x` easily +with our ``construct_moments_IQ`` function as follows. + +.. code-block:: python3 + + n = 50 + μθ, σθ, σy = 100., 10., 10. 
+ + μ_IQ, Σ_IQ, D_IQ = construct_moments_IQ(n, μθ, σθ, σy) + μ_IQ, Σ_IQ, D_IQ + +We can now use our ``MultivariateNormal`` class to construct an +instance, then partition the mean vector and covariance matrix as we +wish. + +We choose ``k=n`` so that :math:`z_{1} = y` and :math:`z_{2} = \theta`. + +.. code-block:: python3 + + multi_normal_IQ = MultivariateNormal(μ_IQ, Σ_IQ) + + k = n + multi_normal_IQ.partition(k) + +Using the generator ``multivariate_normal``, we can make one draw of the +random vector from our distribution and then compute the distribution of +:math:`\theta` conditional on our test scores. + +Let’s do that and then print out some pertinent quantities. + +.. code-block:: python3 + + x = np.random.multivariate_normal(μ_IQ, Σ_IQ) + y = x[:-1] # test scores + θ = x[-1] # IQ + +.. code-block:: python3 + + # the true value + θ + +The method ``cond_dist`` takes test scores as input and returns the +conditional normal distribution of the IQ :math:`\theta`. + +Note that now :math:`\theta` is what we denoted as :math:`z_{2}` in the +general case so we need to set ``ind=1``. + +.. code-block:: python3 + + ind = 1 + multi_normal_IQ.cond_dist(ind, y) + +The first number is the conditional mean :math:`\hat{\mu}_{\theta}` and +the second is the conditional variance :math:`\hat{\Sigma}_{\theta}`. + +How do the additional test scores affect our inferences? + +To shed light on this, we compute a sequence of conditional +distributions of :math:`\theta` by varying the number of test scores in +the conditioning set from :math:`1` to :math:`n`. + +We’ll make a pretty graph showing how our judgment of the person’s IQ +change as more test results come in. + +.. 
code-block:: python3 + + # array for containing moments + μθ_hat_arr = np.empty(n) + Σθ_hat_arr = np.empty(n) + + # loop over number of test scores + for i in range(1, n+1): + # construction of multivariate normal distribution instance + μ_IQ_i, Σ_IQ_i, D_IQ_i = construct_moments_IQ(i, μθ, σθ, σy) + multi_normal_IQ_i = MultivariateNormal(μ_IQ_i, Σ_IQ_i) + + # partition and compute conditional distribution + multi_normal_IQ_i.partition(i) + scores_i = y[:i] + μθ_hat_i, Σθ_hat_i = multi_normal_IQ_i.cond_dist(1, scores_i) + + # store the results + μθ_hat_arr[i-1] = μθ_hat_i[0] + Σθ_hat_arr[i-1] = Σθ_hat_i[0, 0] + + # transform variance to standard deviation + σθ_hat_arr = np.sqrt(Σθ_hat_arr) + +.. code-block:: python3 + + μθ_hat_lower = μθ_hat_arr - 1.96 * σθ_hat_arr + μθ_hat_higher = μθ_hat_arr + 1.96 * σθ_hat_arr + + plt.hlines(θ, 1, n+1, ls='--', label='true $θ$') + plt.plot(range(1, n+1), μθ_hat_arr, color='b', label='$\hat{μ}_{θ}$') + plt.plot(range(1, n+1), μθ_hat_lower, color='b', ls='--') + plt.plot(range(1, n+1), μθ_hat_higher, color='b', ls='--') + plt.fill_between(range(1, n+1), μθ_hat_lower, μθ_hat_higher, + color='b', alpha=0.2, label='95%') + + plt.xlabel('number of test scores') + plt.ylabel('$\hat{θ}$') + plt.legend() + + plt.show() + +The solid blue line in the plot above shows :math:`\hat{\mu}_{\theta}` +as a function of the number of test scores that we have recorded and +conditioned on. + +The blue area shows the span that comes from adding or deducing +:math:`1.96 \hat{\sigma}_{\theta}` from :math:`\hat{\mu}_{\theta}`. + +Therefore, :math:`95\%` of the probability mass of the conditional +distribution falls in this range. + +The value of the random :math:`\theta` that we drew is shown by the +black dotted line. + +As more and more test scores come in, our estimate of the person’s +:math:`\theta` become more and more reliable. 
+ +By staring at the changes in the conditional distributions, we see that +adding more test scores makes :math:`\hat{\theta}` settle down and +approach :math:`\theta`. + +Thus, each :math:`y_{i}` adds information about :math:`\theta`. + +If we drove the number of tests :math:`n \rightarrow + \infty`, the +conditional standard deviation :math:`\hat{\sigma}_{\theta}` would +converge to :math:`0` at the rate :math:`\frac{1}{n^{.5}}`. + +Another representation +====================== + +By using a different representation, let’s look at things from a +different perspective. + +We can represent the random vector :math:`X` defined above as + +.. math:: + + + X = \mu_{\theta} \boldsymbol{1}_{n+1} + C \epsilon, \quad \epsilon \sim N\left(0, I\right) + +where :math:`C` is a lower triangular **Cholesky factor** of +:math:`\Sigma` so that + +.. math:: \Sigma \equiv DD^{\prime} = C C^\prime + +and + +.. math:: E \epsilon \epsilon' = I . + +It follows that + +.. math:: \epsilon \sim N(0, I) . + +Let :math:`G=C^{-1}`; :math:`G` is also lower triangular. + +We can compute :math:`\epsilon` from the formula + +.. math:: + + + \epsilon = G \left( X - \mu_{\theta} \boldsymbol{1}_{n+1} \right) + +This formula confirms that the orthonormal vector :math:`\epsilon` +contains the same information as the non-orthogonal vector +:math:`\left( X - \mu_{\theta} \boldsymbol{1}_{n+1} \right)`. + +We can say that :math:`\epsilon` is an orthogonal basis for +:math:`\left( X - \mu_{\theta} \boldsymbol{1}_{n+1} \right)`. + +Let :math:`c_{i}` be the :math:`i`\ th element in the last row of +:math:`C`. + +Then we can write + +.. math:: + :label: mnv_1 + + + \theta = \mu_{\theta} + c_1 \epsilon_1 + c_2 \epsilon_2 + \dots + c_n \epsilon_n + c_{n+1} \epsilon_{n+1} + + + +The mutual orthogonality of the :math:`\epsilon_i`\ ’s provides us with an +informative way to interpret them in light of equation :eq:`mnv_1`. 
+ +Thus, relative to what is known from tests :math:`i=1, \ldots, n-1`, +:math:`c_i \epsilon_i` is the amount of **new information** about +:math:`\theta` brought by the test number :math:`i`. + +Here **new information** means **surprise** or what could not be +predicted from earlier information. + +Formula :eq:`mnv_1` also provides us with an enlightening way to express +conditional means and conditional variances that we computed earlier. + +In particular, + +.. math:: + + + E\left[\theta \mid y_1, \dots, y_k\right] = \mu_{\theta} + c_1 \epsilon_1 + \dots + c_k \epsilon_k + +and + +.. math:: + + + Var\left(\theta \mid y_1, \dots, y_k\right) = c^2_{k+1} + c^2_{k+2} + \dots + c^2_{n+1}. + +.. code-block:: python3 + + C = np.linalg.cholesky(Σ_IQ) + G = np.linalg.inv(C) + + ε = G @ (x - μθ) + +.. code-block:: python3 + + cε = C[n, :] * ε + + # compute the sequence of μθ and Σθ conditional on y1, y2, ..., yk + μθ_hat_arr_C = np.array([np.sum(cε[:k+1]) for k in range(n)]) + μθ + Σθ_hat_arr_C = np.array([np.sum(C[n, i+1:n+1] ** 2) for i in range(n)]) + +To confirm that these formulas give the same answers that we computed +earlier, we can compare the means and variances of :math:`\theta` +conditional on :math:`\{y_i\}_{i=1}^k` with what we obtained above using +the formulas implemented in the class ``MultivariateNormal`` built on +our original representation of conditional distributions for +multivariate normal distributions. + +.. code-block:: python3 + + # conditional mean + np.max(np.abs(μθ_hat_arr - μθ_hat_arr_C)) < 1e-10 + +.. code-block:: python3 + + # conditional variance + np.max(np.abs(Σθ_hat_arr - Σθ_hat_arr_C)) < 1e-10 + +Magic of the Cholesky factorization +=================================== + +Evidently, the Cholesky factorization is automatically computing the +population **regression coefficients** and associated statistics +that are produced by our ``MultivariateNormal`` class. + +The Cholesky factorization is computing things **recursively**. 
+ +Indeed, in formula :eq:`mnv_1`, + +- the random variable :math:`c_i \epsilon_i` is information about + :math:`\theta` that is not contained by the information in + :math:`\epsilon_1, \epsilon_2, \ldots, \epsilon_{i-1}` + +- the coefficient :math:`c_i` is the simple population regression + coefficient of :math:`\theta - \mu_\theta` on :math:`\epsilon_i` + +Math and Verbal Components of Intelligence +=========================================== + +We can alter the preceding example to be more realistic. + +There is ample evidence that IQ is not a scalar. + +Some people are good in math skills but poor in language skills. + +Other people are good in language skills but poor in math skills. + +So now we shall assume that there are two dimensions of IQ, +:math:`\theta` and :math:`\eta`. + +These determine average performances in math and language tests, +respectively. + +We observe math scores :math:`\{y_i\}_{i=1}^{n}` and language scores +:math:`\{y_i\}_{i=n+1}^{2n}`. + +When :math:`n=2`, we assume that outcomes are draws from a multivariate +normal distribution with representation + +.. math:: + + + X=\left[\begin{array}{c} + y_{1}\\ + y_{2}\\ + y_{3}\\ + y_{4}\\ + \theta\\ + \eta + \end{array}\right]=\left[\begin{array}{c} + \mu_{\theta}\\ + \mu_{\theta}\\ + \mu_{\eta}\\ + \mu_{\eta}\\ + \mu_{\theta}\\ + \mu_{\eta} + \end{array}\right]+\left[\begin{array}{cccccc} + \sigma_{y} & 0 & 0 & 0 & \sigma_{\theta} & 0\\ + 0 & \sigma_{y} & 0 & 0 & \sigma_{\theta} & 0\\ + 0 & 0 & \sigma_{y} & 0 & 0 & \sigma_{\eta}\\ + 0 & 0 & 0 & \sigma_{y} & 0 & \sigma_{\eta}\\ + 0 & 0 & 0 & 0 & \sigma_{\theta} & 0\\ + 0 & 0 & 0 & 0 & 0 & \sigma_{\eta} + \end{array}\right]\left[\begin{array}{c} + w_{1}\\ + w_{2}\\ + w_{3}\\ + w_{4}\\ + w_{5}\\ + w_{6} + \end{array}\right] + +where +:math:`w \begin{bmatrix} w_1 \cr w_2 \cr \vdots \cr w_6 \end{bmatrix}` +is a standard normal random vector. 
+ +We construct a Python function ``construct_moments_IQ2d`` to construct +the mean vector and covariance matrix of the joint normal distribution. + +.. code-block:: python3 + + def construct_moments_IQ2d(n, μθ, σθ, μη, ση, σy): + + μ_IQ2d = np.empty(2*(n+1)) + μ_IQ2d[:n] = μθ + μ_IQ2d[2*n] = μθ + μ_IQ2d[n:2*n] = μη + μ_IQ2d[2*n+1] = μη + + + D_IQ2d = np.zeros((2*(n+1), 2*(n+1))) + D_IQ2d[range(2*n), range(2*n)] = σy + D_IQ2d[:n, 2*n] = σθ + D_IQ2d[2*n, 2*n] = σθ + D_IQ2d[n:2*n, 2*n+1] = ση + D_IQ2d[2*n+1, 2*n+1] = ση + + Σ_IQ2d = D_IQ2d @ D_IQ2d.T + + return μ_IQ2d, Σ_IQ2d, D_IQ2d + +Let’s put the function to work. + +.. code-block:: python3 + + n = 2 + # mean and variance of θ, η, and y + μθ, σθ, μη, ση, σy = 100., 10., 100., 10, 10 + + μ_IQ2d, Σ_IQ2d, D_IQ2d = construct_moments_IQ2d(n, μθ, σθ, μη, ση, σy) + μ_IQ2d, Σ_IQ2d, D_IQ2d + +.. code-block:: python3 + + # take one draw + x = np.random.multivariate_normal(μ_IQ2d, Σ_IQ2d) + y1 = x[:n] + y2 = x[n:2*n] + θ = x[2*n] + η = x[2*n+1] + + # the true values + θ, η + +We first compute the joint normal distribution of +:math:`\left(\theta, \eta\right)`. + +.. code-block:: python3 + + multi_normal_IQ2d = MultivariateNormal(μ_IQ2d, Σ_IQ2d) + + k = 2*n # the length of data vector + multi_normal_IQ2d.partition(k) + + multi_normal_IQ2d.cond_dist(1, [*y1, *y2]) + +Now let’s compute distributions of :math:`\theta` and :math:`\mu` +separately conditional on various subsets of test scores. + +It will be fun to compare outcomes with the help of an auxiliary function +``cond_dist_IQ2d`` that we now construct. + +.. code-block:: python3 + + def cond_dist_IQ2d(μ, Σ, data): + + n = len(μ) + + multi_normal = MultivariateNormal(μ, Σ) + multi_normal.partition(n-1) + μ_hat, Σ_hat = multi_normal.cond_dist(1, data) + + return μ_hat, Σ_hat + +Let’s see how things work for an example. + +.. 
code-block:: python3 + + for indices, IQ, conditions in [([*range(2*n), 2*n], 'θ', 'y1, y2, y3, y4'), + ([*range(n), 2*n], 'θ', 'y1, y2'), + ([*range(n, 2*n), 2*n], 'θ', 'y3, y4'), + ([*range(2*n), 2*n+1], 'η', 'y1, y2, y3, y4'), + ([*range(n), 2*n+1], 'η', 'y1, y2'), + ([*range(n, 2*n), 2*n+1], 'η', 'y3, y4')]: + + μ_hat, Σ_hat = cond_dist_IQ2d(μ_IQ2d[indices], Σ_IQ2d[indices][:, indices], x[indices[:-1]]) + print(f'The mean and variance of {IQ} conditional on {conditions: <15} are ' + + f'{μ_hat[0]:1.2f} and {Σ_hat[0, 0]:1.2f} respectively') + +Evidently, math tests provide no information about :math:`\eta` and +language tests provide no information about :math:`\theta`. + +Univariate Time Series Analysis +================================ + +We can use the multivariate normal distribution and a little matrix +algebra to present foundations of univariate linear time series +analysis. + +Let :math:`x_t, y_t, v_t, w_{t+1}` each be scalars for :math:`t \geq 0`. + +Consider the following model: + +.. math:: + + \begin{aligned} + x_0 & \sim N\left(0, \sigma_0^2\right) \\ + x_{t+1} & = a x_{t} + b w_{t+1}, \quad w_{t+1} \sim N\left(0, 1\right), t \geq 0 \\ + y_{t} & = c x_{t} + d v_{t}, \quad v_{t} \sim N\left(0, 1\right), t \geq 0 + \end{aligned} + +We can compute the moments of :math:`x_{t}` + +1. :math:`E x_{t+1}^2 = a^2 E x_{t}^2 + b^2, t \geq 0`, where + :math:`E x_{0}^2 = \sigma_{0}^2` +2. :math:`E x_{t+j} x_{t} = a^{j} E x_{t}^2, \forall t \ \forall j` + +Given some :math:`T`, we can formulate the sequence +:math:`\{x_{t}\}_{t=0}^T` as a random vector + +.. math:: + + + X=\left[\begin{array}{c} + x_{0}\\ + x_{1}\\ + \vdots\\ + x_{T} + \end{array}\right] + +and the covariance matrix :math:`\Sigma_{x}` can be constructed using +the moments we have computed above. + +Similarly, we can define + +.. 
math:: + + + Y=\left[\begin{array}{c} + y_{0}\\ + y_{1}\\ + \vdots\\ + y_{T} + \end{array}\right], \quad + v=\left[\begin{array}{c} + v_{0}\\ + v_{1}\\ + \vdots\\ + v_{T} + \end{array}\right] + +and therefore + +.. math:: + + + Y = C X + D V + +where :math:`C` and :math:`D` are both diagonal matrices with constant +:math:`c` and :math:`d` as diagonal respectively. + +Consequently, the covariance matrix of :math:`Y` is + +.. math:: + + + \Sigma_{y} = E Y Y^{\prime} = C \Sigma_{x} C^{\prime} + D D^{\prime} + +By stacking :math:`X` and :math:`Y`, we can write + +.. math:: + + + Z=\left[\begin{array}{c} + X\\ + Y + \end{array}\right] + +and + +.. math:: + + + \Sigma_{z} = EZZ^{\prime}=\left[\begin{array}{cc} + \Sigma_{x} & \Sigma_{x}C^{\prime}\\ + C\Sigma_{x} & \Sigma_{y} + \end{array}\right] + +Thus, the stacked sequences :math:`\{x_{t}\}_{t=0}^T` and +:math:`\{y_{t}\}_{t=0}^T` jointly follow the multivariate normal +distribution :math:`N\left(0, \Sigma_{z}\right)`. + +.. code-block:: python3 + + # as an example, consider the case where T = 3 + T = 3 + +.. code-block:: python3 + + # variance of the initial distribution x_0 + σ0 = 1. + + # parameters of the equation system + a = .9 + b = 1. + c = 1.0 + d = .05 + +.. code-block:: python3 + + # construct the covariance matrix of X + Σx = np.empty((T+1, T+1)) + + Σx[0, 0] = σ0 ** 2 + for i in range(T): + Σx[i, i+1:] = Σx[i, i] * a ** np.arange(1, T+1-i) + Σx[i+1:, i] = Σx[i, i+1:] + + Σx[i+1, i+1] = a ** 2 * Σx[i, i] + b ** 2 + +.. code-block:: python3 + + Σx + +.. code-block:: python3 + + # construct the covariance matrix of Y + C = np.eye(T+1) * c + D = np.eye(T+1) * d + + Σy = C @ Σx @ C.T + D @ D.T + +.. code-block:: python3 + + # construct the covariance matrix of Z + Σz = np.empty((2*(T+1), 2*(T+1))) + + Σz[:T+1, :T+1] = Σx + Σz[:T+1, T+1:] = Σx @ C.T + Σz[T+1:, :T+1] = C @ Σx + Σz[T+1:, T+1:] = Σy + +.. code-block:: python3 + + Σz + +.. 
code-block:: python3 + + # construct the mean vector of Z + μz = np.zeros(2*(T+1)) + +The following Python code lets us sample random vectors :math:`X` and +:math:`Y`. + +This is going to be very useful for doing the conditioning to be used in +the fun exercises below. + +.. code-block:: python3 + + z = np.random.multivariate_normal(μz, Σz) + + x = z[:T+1] + y = z[T+1:] + +Smoothing Example +------------------ + +This is an instance of a classic ``smoothing`` calculation whose purpose +is to compute :math:`E X \mid Y`. + +An interpretation of this example is + +- :math:`X` is a random sequence of hidden Markov state variables + :math:`x_t` + +- :math:`Y` is a sequence of observed signals :math:`y_t` bearing + information about the hidden state + +.. code-block:: python3 + + # construct a MultivariateNormal instance + multi_normal_ex1 = MultivariateNormal(μz, Σz) + x = z[:T+1] + y = z[T+1:] + +.. code-block:: python3 + + # partition Z into X and Y + multi_normal_ex1.partition(T+1) + +.. code-block:: python3 + + # compute the conditional mean and covariance matrix of X given Y=y + + print("X = ", x) + print("Y = ", y) + print(" E [ X | Y] = ", ) + + multi_normal_ex1.cond_dist(0, y) + +Filtering Exercise +-------------------- + +Compute :math:`E\left[x_{t} \mid y_{t-1}, y_{t-2}, \dots, y_{0}\right]`. + +To do so, we need to first construct the mean vector and the covariance +matrix of the subvector +:math:`\left[x_{t}, y_{0}, \dots, y_{t-2}, y_{t-1}\right]`. + +For example, let’s say that we want the conditional distribution of +:math:`x_{3}`. + +.. code-block:: python3 + + t = 3 + +.. code-block:: python3 + + # mean of the subvector + sub_μz = np.zeros(t+1) + + # covariance matrix of the subvector + sub_Σz = np.empty((t+1, t+1)) + + sub_Σz[0, 0] = Σz[t, t] # x_t + sub_Σz[0, 1:] = Σz[t, T+1:T+t+1] + sub_Σz[1:, 0] = Σz[T+1:T+t+1, t] + sub_Σz[1:, 1:] = Σz[T+1:T+t+1, T+1:T+t+1] + +.. code-block:: python3 + + sub_Σz + +.. 
code-block:: python3 + + multi_normal_ex2 = MultivariateNormal(sub_μz, sub_Σz) + multi_normal_ex2.partition(1) + +.. code-block:: python3 + + sub_y = y[:t] + + multi_normal_ex2.cond_dist(0, sub_y) + +Prediction Exercise +-------------------- + +Compute :math:`E\left[y_{t} \mid y_{t-j}, \dots, y_{0} \right]`. + +As what we did in exercise 2, we will construct the mean vector and +covariance matrix of the subvector +:math:`\left[y_{t}, y_{0}, \dots, y_{t-j-1}, y_{t-j} \right]`. + +For example, we take a case in which :math:`t=3` and :math:`j=2`. + +.. code-block:: python3 + + t = 3 + j = 2 + +.. code-block:: python3 + + sub_μz = np.zeros(t-j+2) + sub_Σz = np.empty((t-j+2, t-j+2)) + + sub_Σz[0, 0] = Σz[T+t+1, T+t+1] + sub_Σz[0, 1:] = Σz[T+t+1, T+1:T+t-j+2] + sub_Σz[1:, 0] = Σz[T+1:T+t-j+2, T+t+1] + sub_Σz[1:, 1:] = Σz[T+1:T+t-j+2, T+1:T+t-j+2] + +.. code-block:: python3 + + sub_Σz + +.. code-block:: python3 + + multi_normal_ex3 = MultivariateNormal(sub_μz, sub_Σz) + multi_normal_ex3.partition(1) + +.. code-block:: python3 + + sub_y = y[:t-j+1] + + multi_normal_ex3.cond_dist(0, sub_y) + +Constructing a Wold Representation +------------------------------------- + +Now we’ll apply Cholesky decomposition to decompose +:math:`\Sigma_{y}=H H^{\prime}` and form + +.. math:: + + + \epsilon = H^{-1} Y. + +Then we can represent :math:`y_{t}` as + +.. math:: + + + y_{t} = h_{t,t} \epsilon_{t} + h_{t,t-1} \epsilon_{t-1} + \dots + h_{t,0} \epsilon_{0}. + +.. code-block:: python3 + + H = np.linalg.cholesky(Σy) + + H + +.. code-block:: python3 + + ε = np.linalg.inv(H) @ y + + ε + +.. code-block:: python3 + + y + +This example is an instance of what is known as a **Wold representation** in time series analysis. + +Classic Factor Analysis Model +====================================================== + +The factor analysis model widely used in psychology and other fields can +be represented as + +.. math:: + + + Y = \Lambda f + U + +where + +1. 
:math:`Y` is :math:`n \times 1` random vector, + :math:`E U U^{\prime} = D` is a diagonal matrix, +2. :math:`\Lambda` is :math:`n \times k` coefficient matrix, +3. :math:`f` is :math:`k \times 1` random vector, + :math:`E f f^{\prime} = I`, +4. :math:`U` is :math:`n \times 1` random vector, and :math:`U \perp f`. +5. It is presumed that :math:`k` is small relative to :math:`n`; often + :math:`k` is only :math:`1` or :math:`2`, as in our IQ examples. + +This implies that + +.. math:: + + \begin{aligned} + \Sigma_y = E Y Y^{\prime} = \Lambda \Lambda^{\prime} + D \\ + E Y f^{\prime} = \Lambda \\ + E f Y^{\prime} = \Lambda^{\prime} + \end{aligned} + +Thus, the covariance matrix :math:`\Sigma_Y` is the sum of a diagonal +matrix :math:`D` and a positive semi-definite matrix +:math:`\Lambda \Lambda^{\prime}` of rank :math:`k`. + +This means that all covariances among the :math:`n` components of the +:math:`Y` vector are intermediated by their common dependencies on the +:math:`k` factors. + +Form + +.. math:: + + + Z=\left(\begin{array}{c} + f\\ + Y + \end{array}\right) + +the covariance matrix of the expanded random vector :math:`Z` can be +computed as + +.. math:: + + + \Sigma_{z} = EZZ^{\prime}=\left(\begin{array}{cc} + I & \Lambda^{\prime}\\ + \Lambda & \Lambda\Lambda^{\prime}+D + \end{array}\right) + +In the following, we first construct the mean vector and the covariance +matrix for the case where :math:`N=10` and :math:`k=2`. + +.. code-block:: python3 + + N = 10 + k = 2 + +We set the coefficient matrix :math:`\Lambda` and the covariance matrix +of :math:`U` to be + +.. 
math:: + + + \Lambda=\left(\begin{array}{cc} + 1 & 0\\ + \vdots & \vdots\\ + 1 & 0\\ + 0 & 1\\ + \vdots & \vdots\\ + 0 & 1 + \end{array}\right),\quad D=\left(\begin{array}{cccc} + \sigma_{u}^{2} & 0 & \cdots & 0\\ + 0 & \sigma_{u}^{2} & \cdots & 0\\ + \vdots & \vdots & \vdots & \vdots\\ + 0 & 0 & \cdots & \sigma_{u}^{2} + \end{array}\right) + +where the first half of the first column of :math:`\Lambda` is filled +with :math:`1`\ s and :math:`0`\ s for the rest half, and symmetrically +for the second column. :math:`D` is a diagonal matrix with parameter +:math:`\sigma_{u}^{2}` on the diagonal. + +.. code-block:: python3 + + Λ = np.zeros((N, k)) + Λ[:N//2, 0] = 1 + Λ[N//2:, 1] = 1 + + σu = .5 + D = np.eye(N) * σu ** 2 + +.. code-block:: python3 + + # compute Σy + Σy = Λ @ Λ.T + D + +We can now construct the mean vector and the covariance matrix for +:math:`Z`. + +.. code-block:: python3 + + μz = np.zeros(k+N) + + Σz = np.empty((k+N, k+N)) + + Σz[:k, :k] = np.eye(k) + Σz[:k, k:] = Λ.T + Σz[k:, :k] = Λ + Σz[k:, k:] = Σy + +.. code-block:: python3 + + z = np.random.multivariate_normal(μz, Σz) + + f = z[:k] + y = z[k:] + +.. code-block:: python3 + + multi_normal_factor = MultivariateNormal(μz, Σz) + multi_normal_factor.partition(k) + +Let’s compute the conditional distribution of the hidden factor +:math:`f` on the observations :math:`Y`, namely, :math:`f \mid Y=y`. + +.. code-block:: python3 + + multi_normal_factor.cond_dist(0, y) + +We can verify that the conditional mean +:math:`E \left[f \mid Y=y\right] = B Y` where +:math:`B = \Lambda^{\prime} \Sigma_{y}^{-1}`. + +.. code-block:: python3 + + B = Λ.T @ np.linalg.inv(Σy) + + B @ y + +Similarly, we can compute the conditional distribution :math:`Y \mid f`. + +.. code-block:: python3 + + multi_normal_factor.cond_dist(1, f) + +It can be verified that the mean is +:math:`\Lambda I^{-1} f = \Lambda f`. + +.. 
code-block:: python3 + + Λ @ f + +PCA as Approximation to Factor Analytic Model +============================================== + + +For fun, let’s apply a Principal Components Analysis (PCA) decomposition +to a covariance matrix :math:`\Sigma_y` that in fact is governed by our factor-analytic +model. + +Technically, this means that the PCA model is misspecified. (Can you +explain why?) + +Nevertheless, this exercise will let us study how well the first two +principal components from a PCA can approximate the conditional +expectations :math:`E f_i | Y` for our two factors :math:`f_i`, +:math:`i=1,2` for the factor analytic model that we have assumed truly +governs the data on :math:`Y` we have generated. + +So we compute the PCA decomposition + +.. math:: + + + \Sigma_{y} = P \tilde{\Lambda} P^{\prime} + +where :math:`\tilde{\Lambda}` is a diagonal matrix. + +We have + +.. math:: + + + Y = P \epsilon + +and + +.. math:: \epsilon = P^\prime Y + +Note that we will arrange the eigenvectors in :math:`P` in the +*descending* order of eigenvalues. + +.. code-block:: python3 + + 𝜆_tilde, P = np.linalg.eigh(Σy) + + # arrange the eigenvectors by eigenvalues + ind = sorted(range(N), key=lambda x: 𝜆_tilde[x], reverse=True) + + P = P[:, ind] + 𝜆_tilde = 𝜆_tilde[ind] + Λ_tilde = np.diag(𝜆_tilde) + + print('𝜆_tilde =', 𝜆_tilde) + +.. code-block:: python3 + + # verify the orthogonality of eigenvectors + np.abs(P @ P.T - np.eye(N)).max() + +.. code-block:: python3 + + # verify the eigenvalue decomposition is correct + P @ Λ_tilde @ P.T + +.. code-block:: python3 + + ε = P.T @ y + + print("ε = ", ε) + +.. 
code-block:: python3 + + # print the values of the two factors + + print('f = ', f) + +Below we’ll plot several things + +- the :math:`N` values of :math:`y` + +- the :math:`N` values of the principal components :math:`\epsilon` + +- the value of the first factor :math:`f_1` plotted only for the first + :math:`N/2` observations of :math:`y` for which it receives a + non-zero loading in :math:`\Lambda` + +- the value of the second factor :math:`f_2` plotted only for the final + :math:`N/2` observations for which it receives a non-zero loading in + :math:`\Lambda` + +.. code-block:: python3 + + plt.scatter(range(N), y, label='y') + plt.scatter(range(N), ε, label='$\epsilon$') + plt.hlines(f[0], 0, N//2-1, ls='--', label='$f_{1}$') + plt.hlines(f[1], N//2, N-1, ls='-.', label='$f_{2}$') + plt.legend() + + plt.show() + +Consequently, the first two :math:`\epsilon_{j}` correspond to the +largest two eigenvalues. + +Let’s look at them, after which we’ll look at :math:`E f | y = B y` + +.. code-block:: python3 + + ε[:2] + +.. code-block:: python3 + + # compare with Ef|y + B @ y + +The fraction of variance in :math:`y_{t}` explained by the first two +principal components can be computed as below. + +.. code-block:: python3 + + 𝜆_tilde[:2].sum() / 𝜆_tilde.sum() + +Compute + +.. math:: + + + \hat{Y} = P_{j} \epsilon_{j} + P_{k} \epsilon_{k} + +where :math:`P_{j}` and :math:`P_{k}` correspond to the largest two +eigenvalues. + +.. code-block:: python3 + + y_hat = P[:, :2] @ ε[:2] + +In this example, it turns out that the projection :math:`\hat{Y}` of +:math:`Y` on the first two principal components does a good job of +approximating :math:`Ef \mid y`. + +We confirm this in the following plot of :math:`f`, +:math:`E y \mid f`, :math:`E f \mid y`, and :math:`\hat{y}` on the +coordinate axis versus :math:`y` on the ordinate axis. + +.. 
code-block:: python3 + + plt.scatter(range(N), Λ @ f, label='$Ey|f$') + plt.scatter(range(N), y_hat, label='$\hat{y}$') + plt.hlines(f[0], 0, N//2-1, ls='--', label='$f_{1}$') + plt.hlines(f[1], N//2, N-1, ls='-.', label='$f_{2}$') + + Efy = B @ y + plt.hlines(Efy[0], 0, N//2-1, ls='--', color='b', label='$Ef_{1}|y$') + plt.hlines(Efy[1], N//2, N-1, ls='-.', color='b', label='$Ef_{2}|y$') + plt.legend() + + plt.show() + +The covariance matrix of :math:`\hat{Y}` can be computed by first +constructing the covariance matrix of :math:`\epsilon` and then use the +upper left block for :math:`\epsilon_{1}` and :math:`\epsilon_{2}`. + +.. code-block:: python3 + + Σεjk = (P.T @ Σy @ P)[:2, :2] + + Pjk = P[:, :2] + + Σy_hat = Pjk @ Σεjk @ Pjk.T + print('Σy_hat = \n', Σy_hat) + +Stochastic Difference Equation +================================ + +Consider the stochastic second-order linear difference equation + +.. math:: + + + y_{t} = \alpha_{0} + \alpha_{1} y_{t-1} + \alpha_{2} y_{t-2} + u_{t} + +where :math:`u_{t} \sim N \left(0, \sigma_{u}^{2}\right)` and + +.. math:: + + + \left[\begin{array}{c} + y_{-1}\\ + y_{0} + \end{array}\right]\sim N\left(\mu_{\tilde{y}},\Sigma_{\tilde{y}}\right) + +It can be written as a stacked system + +.. 
math:: + + + \underset{\equiv A}{\underbrace{\left[\begin{array}{cccccccc} + 1 & 0 & 0 & 0 & \cdots & 0 & 0 & 0\\ + -\alpha_{1} & 1 & 0 & 0 & \cdots & 0 & 0 & 0\\ + -\alpha_{2} & -\alpha_{1} & 1 & 0 & \cdots & 0 & 0 & 0\\ + 0 & -\alpha_{2} & -\alpha_{1} & 1 & \cdots & 0 & 0 & 0\\ + \vdots & \vdots & \vdots & \vdots & \cdots & \vdots & \vdots & \vdots\\ + 0 & 0 & 0 & 0 & \cdots & -\alpha_{2} & -\alpha_{1} & 1 + \end{array}\right]}}\left[\begin{array}{c} + y_{1}\\ + y_{2}\\ + y_{3}\\ + y_{4}\\ + \vdots\\ + y_{T} + \end{array}\right]=\underset{\equiv b}{\underbrace{\left[\begin{array}{c} + \alpha_{0}+\alpha_{1}y_{0}+\alpha_{2}y_{-1}\\ + \alpha_{0}+\alpha_{2}y_{0}\\ + \alpha_{0}\\ + \alpha_{0}\\ + \vdots\\ + \alpha_{0} + \end{array}\right]}} + +We can compute :math:`y` by solving the system + +.. math:: + + + y = A^{-1} \left(b + u\right) + +We have + +.. math:: + + \begin{aligned} + \mu_{y} = A^{-1} \mu_{b} \\ + \Sigma_{y} &= A^{-1} E \left[\left(b - \mu_{b} + u \right) \left(b - \mu_{b} + u \right)^{\prime}\right] \left(A^{-1}\right)^{\prime} \\ + &= A^{-1} \left(\Sigma_{b} + \Sigma_{u} \right) \left(A^{-1}\right)^{\prime} + \end{aligned} + +where + +.. math:: + + + \mu_{b}=\left[\begin{array}{c} + \alpha_{0}+\alpha_{1}\mu_{y_{0}}+\alpha_{2}\mu_{y_{-1}}\\ + \alpha_{0}+\alpha_{2}\mu_{y_{0}}\\ + \alpha_{0}\\ + \vdots\\ + \alpha_{0} + \end{array}\right] + +.. math:: + + + \Sigma_{b}=\left[\begin{array}{cc} + C\Sigma_{\tilde{y}}C^{\prime} & \boldsymbol{0}_{N-2\times N-2}\\ + \boldsymbol{0}_{N-2\times2} & \boldsymbol{0}_{N-2\times N-2} + \end{array}\right],\quad C=\left[\begin{array}{cc} + \alpha_{2} & \alpha_{1}\\ + 0 & \alpha_{2} + \end{array}\right] + +.. math:: + + + \Sigma_{u}=\left[\begin{array}{cccc} + \sigma_{u}^{2} & 0 & \cdots & 0\\ + 0 & \sigma_{u}^{2} & \cdots & 0\\ + \vdots & \vdots & \vdots & \vdots\\ + 0 & 0 & \cdots & \sigma_{u}^{2} + \end{array}\right] + +.. 
code-block:: python3 + + # set parameters + T = 80 + T = 160 + # coefficients of the second order difference equation + 𝛼0 = 10 + 𝛼1 = 1.53 + 𝛼2 = -.9 + + # variance of u + σu = 1. + σu = 10. + + # distribution of y_{-1} and y_{0} + μy_tilde = np.array([1., 0.5]) + Σy_tilde = np.array([[2., 1.], [1., 0.5]]) + +.. code-block:: python3 + + # construct A and A^{\prime} + A = np.zeros((T, T)) + + for i in range(T): + A[i, i] = 1 + + if i-1 >= 0: + A[i, i-1] = -𝛼1 + + if i-2 >= 0: + A[i, i-2] = -𝛼2 + + A_inv = np.linalg.inv(A) + +.. code-block:: python3 + + # compute the mean vectors of b and y + μb = np.ones(T) * 𝛼0 + μb[0] += 𝛼1 * μy_tilde[1] + 𝛼2 * μy_tilde[0] + μb[1] += 𝛼2 * μy_tilde[1] + + μy = A_inv @ μb + +.. code-block:: python3 + + # compute the covariance matrices of b and y + Σu = np.eye(T) * σu ** 2 + + Σb = np.zeros((T, T)) + + C = np.array([[𝛼2, 𝛼1], [0, 𝛼2]]) + Σb[:2, :2] = C @ Σy_tilde @ C.T + + Σy = A_inv @ (Σb + Σu) @ A_inv.T + +Application to Stock Price Model +================================= + +Let + +.. math:: + + + p_{t} = \sum_{j=0}^{T-t} \beta^{j} y_{t+j} + +Form + +.. math:: + + + \underset{\equiv p}{\underbrace{\left[\begin{array}{c} + p_{1}\\ + p_{2}\\ + p_{3}\\ + \vdots\\ + p_{T} + \end{array}\right]}}=\underset{\equiv B}{\underbrace{\left[\begin{array}{ccccc} + 1 & \beta & \beta^{2} & \cdots & \beta^{T-1}\\ + 0 & 1 & \beta & \cdots & \beta^{T-2}\\ + 0 & 0 & 1 & \cdots & \beta^{T-3}\\ + \vdots & \vdots & \vdots & \vdots & \vdots\\ + 0 & 0 & 0 & \cdots & 1 + \end{array}\right]}}\left[\begin{array}{c} + y_{1}\\ + y_{2}\\ + y_{3}\\ + \vdots\\ + y_{T} + \end{array}\right] + +we have + +.. math:: + + \begin{aligned} + \mu_{p} = B \mu_{y} \\ + \Sigma_{p} = B \Sigma_{y} B^{\prime} + \end{aligned} + +.. code-block:: python3 + + β = .96 + +.. code-block:: python3 + + # construct B + B = np.zeros((T, T)) + + for i in range(T): + B[i, i:] = β ** np.arange(0, T-i) + +Denote + +.. 
math:: + + + z=\left[\begin{array}{c} + y\\ + p + \end{array}\right]=\underset{\equiv D}{\underbrace{\left[\begin{array}{c} + I\\ + B + \end{array}\right]}} y + +Thus, :math:`\{y_t\}_{t=1}^{T}` and :math:`\{p_t\}_{t=1}^{T}` jointly +follow the multivariate normal distribution +:math:`N \left(\mu_{z}, \Sigma_{z}\right)`, where + +.. math:: + + + \mu_{z}=D\mu_{y} + +.. math:: + + + \Sigma_{z}=D\Sigma_{y}D^{\prime} + +.. code-block:: python3 + + D = np.vstack([np.eye(T), B]) + +.. code-block:: python3 + + μz = D @ μy + Σz = D @ Σy @ D.T + +We can simulate paths of :math:`y_{t}` and :math:`p_{t}` and compute the +conditional mean :math:`E \left[p_{t} \mid y_{t-1}, y_{t}\right]` using +the ``MultivariateNormal`` class. + +.. code-block:: python3 + + z = np.random.multivariate_normal(μz, Σz) + y, p = z[:T], z[T:] + +.. code-block:: python3 + + cond_Ep = np.empty(T-1) + + sub_μ = np.empty(3) + sub_Σ = np.empty((3, 3)) + for t in range(2, T+1): + sub_μ[:] = μz[[t-2, t-1, T-1+t]] + sub_Σ[:, :] = Σz[[t-2, t-1, T-1+t], :][:, [t-2, t-1, T-1+t]] + + multi_normal = MultivariateNormal(sub_μ, sub_Σ) + multi_normal.partition(2) + + cond_Ep[t-2] = multi_normal.cond_dist(1, y[t-2:t])[0][0] + +.. code-block:: python3 + + plt.plot(range(1, T), y[1:], label='$y_{t}$') + plt.plot(range(1, T), y[:-1], label='$y_{t-1}$') + plt.plot(range(1, T), p[1:], label='$p_{t}$') + plt.plot(range(1, T), cond_Ep, label='$Ep_{t}|y_{t}, y_{t-1}$') + + plt.xlabel('t') + plt.legend(loc=1) + plt.show() + +In the above graph, the green line is what the price of the stock would +be if people had perfect foresight about the path of dividends while the +green line is the conditional expectation :math:`E p_t | y_t, y_{t-1}`, which is what the price would +be if people did not have perfect foresight but were optimally +predicting future dividends on the basis of the information +:math:`y_t, y_{t-1}` at time :math:`t`. 
+ + +Filtering Foundations +====================== + +Assume that :math:`x_0` is an :math:`n \times 1` random vector and that +:math:`y_0` is a :math:`p \times 1` random vector determined by the +*observation equation* + +.. math:: y_0 = G x_0 + v_0 , \quad x_0 \sim {\mathcal N}(\hat x_0, \Sigma_0), \quad v_0 \sim {\mathcal N}(0, R) + +where :math:`v_0` is orthogonal to :math:`x_0`, :math:`G` is a +:math:`p \times n` matrix, and :math:`R` is a :math:`p \times p` +positive definite matrix. + +We consider the problem of someone who *observes* :math:`y_0`, who does +not observe :math:`x_0`, who knows :math:`\hat x_0, \Sigma_0, G, R` – +and therefore knows the joint probability distribution of the vector +:math:`\begin{bmatrix} x_0 \cr y_0 \end{bmatrix}` – and who wants to +infer :math:`x_0` from :math:`y_0` in light of what he knows about that +joint probability distribution. + +Therefore, the person wants to construct the probability distribution of +:math:`x_0` conditional on the random vector :math:`y_0`. + +The joint distribution of +:math:`\begin{bmatrix} x_0 \cr y_0 \end{bmatrix}` is multivariate normal +:math:`{\mathcal N}(\mu, \Sigma)` with + +.. math:: + + \mu = \begin{bmatrix} \hat x_0 \cr G \hat x_0 \end{bmatrix} , \quad + \Sigma = \begin{bmatrix} \Sigma_0 & \Sigma_0 G' \cr + G \Sigma_0 & G \Sigma_0 G' + R \end{bmatrix} + +By applying an appropriate instance of the above formulas for the mean vector :math:`\hat \mu_1` and covariance matrix +:math:`\hat \Sigma_{11}` of :math:`z_1` conditional on :math:`z_2`, we find that the probability distribution of +:math:`x_0` conditional on :math:`y_0` is +:math:`{\mathcal N}(\tilde x_0, \tilde \Sigma_0)` where + +.. 
math:: + + \begin{aligned} \beta_0 & = \Sigma_0 G' (G \Sigma_0 G' + R)^{-1} \cr + \tilde x_0 & = \hat x_0 + \beta_0 ( y_0 - G \hat x_0) \cr + \tilde \Sigma_0 & = \Sigma_0 - \Sigma_0 G' (G \Sigma_0 G' + R)^{-1} G \Sigma_0 + \end{aligned} + + +Step toward dynamics +---------------------- + +Now suppose that we are in a time series setting and that we have the +one-step state transition equation + +.. math:: x_1 = A x_0 + C w_1 , \quad w_1 \sim {\mathcal N}(0, I ) + +where :math:`A` is an :math:`n \times n` matrix and :math:`C` is an +:math:`n \times m` matrix. + +It follows that the probability distribution of :math:`x_1` conditional +on :math:`y_0` is + +.. math:: x_1 | y_0 \sim {\mathcal N}(A \tilde x_0 , A \tilde \Sigma_0 A' + C C' ) + +Define + +.. math:: + + \begin{aligned} \hat x_1 & = A \tilde x_0 \cr + \Sigma_1 & = A \tilde \Sigma_0 A' + C C' + \end{aligned} + +Dynamic version +------------------ + +Suppose now that for :math:`t \geq 0`, +:math:`\{x_{t+1}, y_t\}_{t=0}^\infty` are governed by the equations + +.. math:: + + \begin{aligned} + x_{t+1} & = A x_t + C w_{t+1} \cr + y_t & = G x_t + v_t + \end{aligned} + +where as before :math:`x_0 \sim {\mathcal N}(\hat x_0, \Sigma_0)`, +:math:`w_{t+1}` is the :math:`t+1`\ th component of an i.i.d. stochastic +process distributed as :math:`w_{t+1} \sim {\mathcal N}(0, I)`, and +:math:`v_t` is the :math:`t`\ th component of an i.i.d. process +distributed as :math:`v_t \sim {\mathcal N}(0, R)` and the +:math:`\{w_{t+1}\}_{t=0}^\infty` and :math:`\{v_t\}_{t=0}^\infty` +processes are orthogonal at all pairs of dates. + +The logic and +formulas that we applied above imply that the probability distribution +of :math:`x_t` conditional on +:math:`y_0, y_1, \ldots , y_{t-1} = y^{t-1}` is + +.. 
math:: x_t | y^{t-1} \sim {\mathcal N}(A \tilde x_{t-1} , A \tilde \Sigma_{t-1} A' + C C' ) + +where :math:`\{\tilde x_t, \tilde \Sigma_t\}_{t=1}^\infty` can be +computed by iterating on the following equations starting from +:math:`t=1` and initial conditions for +:math:`\tilde x_0, \tilde \Sigma_0` computed as we have above: + +.. math:: + + \begin{aligned} \Sigma_t & = A \tilde \Sigma_{t-1} A' + C C' \cr + \hat x_t & = A \tilde x_{t-1} \cr + \beta_t & = \Sigma_t G' (G \Sigma_t G' + R)^{-1} \cr + \tilde x_t & = \hat x_t + \beta_t ( y_t - G \hat x_t) \cr + \tilde \Sigma_t & = \Sigma_t - \Sigma_t G' (G \Sigma_t G' + R)^{-1} G \Sigma_t + \end{aligned} + +We can use the Python class *MultivariateNormal* to construct examples. + +Here is an example for a single period problem at time :math:`0` + +.. code-block:: python3 + + G = np.array([[1., 3.]]) + R = np.array([[1.]]) + + x0_hat = np.array([0., 1.]) + Σ0 = np.array([[1., .5], [.5, 2.]]) + + μ = np.hstack([x0_hat, G @ x0_hat]) + Σ = np.block([[Σ0, Σ0 @ G.T], [G @ Σ0, G @ Σ0 @ G.T + R]]) + +.. code-block:: python3 + + # construction of the multivariate normal instance + multi_normal = MultivariateNormal(μ, Σ) + +.. code-block:: python3 + + multi_normal.partition(2) + +.. code-block:: python3 + + # the observation of y + y0 = 2.3 + + # conditional distribution of x0 + μ1_hat, Σ11 = multi_normal.cond_dist(0, y0) + μ1_hat, Σ11 + +.. code-block:: python3 + + A = np.array([[0.5, 0.2], [-0.1, 0.3]]) + C = np.array([[2.], [1.]]) + + # conditional distribution of x1 + x1_cond = A @ μ1_hat + Σ1_cond = C @ C.T + A @ Σ11 @ A.T + x1_cond, Σ1_cond + +Code for Iterating +------------------- + +Here is code for solving a dynamic filtering problem by iterating on our +equations, followed by an example. + +.. 
code-block:: python3 + + def iterate(x0_hat, Σ0, A, C, G, R, y_seq): + + p, n = G.shape + + T = len(y_seq) + x_hat_seq = np.empty((T+1, n)) + Σ_hat_seq = np.empty((T+1, n, n)) + + x_hat_seq[0] = x0_hat + Σ_hat_seq[0] = Σ0 + + for t in range(T): + xt_hat = x_hat_seq[t] + Σt = Σ_hat_seq[t] + μ = np.hstack([xt_hat, G @ xt_hat]) + Σ = np.block([[Σt, Σt @ G.T], [G @ Σt, G @ Σt @ G.T + R]]) + + # filtering + multi_normal = MultivariateNormal(μ, Σ) + multi_normal.partition(n) + x_tilde, Σ_tilde = multi_normal.cond_dist(0, y_seq[t]) + + # forecasting + x_hat_seq[t+1] = A @ x_tilde + Σ_hat_seq[t+1] = C @ C.T + A @ Σ_tilde @ A.T + + return x_hat_seq, Σ_hat_seq + +.. code-block:: python3 + + iterate(x0_hat, Σ0, A, C, G, R, [2.3, 1.2, 3.2]) + +The iterative algorithm just described is a version of the celebrated **Kalman filter**. + +We describe the Kalman filter and some applications of it in :doc:`A First Look at the Kalman Filter ` + + + diff --git a/source/rst/navy_captain.rst b/source/rst/navy_captain.rst index d2794b6..2988b62 100644 --- a/source/rst/navy_captain.rst +++ b/source/rst/navy_captain.rst @@ -15,7 +15,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon .. 
code-block:: ipython @@ -23,7 +23,8 @@ In addition to what's in Anaconda, this lecture will need the following librarie import numpy as np import matplotlib.pyplot as plt %matplotlib inline - from numba import njit, prange, jitclass, float64, int64 + from numba import njit, prange, float64, int64 + from numba.experimental import jitclass from interpolation import interp from math import gamma from scipy.optimize import minimize @@ -33,9 +34,9 @@ Overview This lecture follows up on ideas presented in the following lectures: - * :doc:`A Problem that Stumped Milton Friedman ` - * :doc:`Exchangeability and Bayesian Updating ` - * :doc:`Likelihood Ratio Processes ` +* :doc:`A Problem that Stumped Milton Friedman ` +* :doc:`Exchangeability and Bayesian Updating ` +* :doc:`Likelihood Ratio Processes ` In :doc:`A Problem that Stumped Milton Friedman ` we described a problem that a Navy Captain presented to Milton Friedman during World War II. @@ -55,8 +56,8 @@ this lecture :doc:`Exchangeability and Bayesian Updating ` and in :doc:`Likelihood Ratio Processes `, which describes the link between Bayesian updating and likelihood ratio processes. -The present lecture uses Python to generate simulations that evaluate expected losses under **frequentist** and **Bayesian** -decision rules for a instances of the Navy Captain's decision problem. +The present lecture uses Python to generate simulations that evaluate expected losses under **frequentist** and **Bayesian** +decision rules for an instance of the Navy Captain's decision problem. The simulations validate the Navy Captain's hunch that there is a better rule than the one the Navy had ordered him to use. @@ -96,8 +97,8 @@ impose on him. 
The decision maker pays a cost :math:`c` for drawing another :math:`z` -We mainly borrow parameters from the quantecon lecture “A Problem that -Stumped Milton Friedman” except that we increase both :math:`\bar L_{0}` +We mainly borrow parameters from the quantecon lecture +:doc:`A Problem that Stumped Milton Friedman ` except that we increase both :math:`\bar L_{0}` and :math:`\bar L_{1}` from :math:`25` to :math:`100` to encourage the frequentist Navy Captain to take more draws before deciding. @@ -120,7 +121,7 @@ Below is some Python code that sets up these objects. return r * x**(a-1) * (1 - x)**(b-1) We start with defining a ``jitclass`` that stores parameters and -functions we need to solve problems for both the bayesian and +functions we need to solve problems for both the Bayesian and frequentist Navy Captains. .. code-block:: python3 @@ -262,7 +263,7 @@ Here not rejecting :math:`H_0` when :math:`H_1` is true For a given sample size :math:`t`, the pairs :math:`\left(PFA,PD\right)` -lie on a “receiver operating characteristic curve” and can be uniquely +lie on a **receiver operating characteristic curve** and can be uniquely pinned down by choosing :math:`d`. To see some receiver operating characteristic curves, please see this @@ -289,7 +290,7 @@ generates data. plt.legend() plt.show() -We can compute sequneces of likelihood ratios using simulated samples. +We can compute sequences of likelihood ratios using simulated samples. .. code-block:: python3 @@ -304,7 +305,7 @@ We can compute sequneces of likelihood ratios using simulated samples. L1_arr = np.cumprod(l1_arr, 1) With an empirical distribution of likelihood ratios in hand, we can draw -“receiver operating characteristic curves” by enumerating +**receiver operating characteristic curves** by enumerating :math:`\left(PFA,PD\right)` pairs given each sample size :math:`t`. .. code-block:: python3 @@ -443,7 +444,7 @@ rule changes. 
plt.show() -The following shows how do optimal sample size :math:`t` and targeted +The following shows how optimal sample size :math:`t` and targeted :math:`\left(PFA,PD\right)` change as :math:`\pi^{*}` varies. .. code-block:: python3 @@ -465,7 +466,7 @@ The following shows how do optimal sample size :math:`t` and targeted Bayesian Decision Rule ======================== -In this lecture :doc:`A Problem that Stumped Milton Friedman `, +In :doc:`A Problem that Stumped Milton Friedman `, we learned how Abraham Wald confirmed the Navy Captain’s hunch that there is a better decision rule. @@ -597,7 +598,7 @@ that computes :math:`\alpha` and :math:`\beta`. plt.legend(borderpad=1.1) plt.show() -The above figure portrays the value function plotted against decision +The above figure portrays the value function plotted against the decision maker’s Bayesian posterior. It also shows the probabilities :math:`\alpha` and :math:`\beta`. @@ -611,34 +612,35 @@ The Bayesian decision rule is: - delay deciding and draw another :math:`z` if :math:`\beta \leq \pi \leq \alpha` -We can calculate two ‘’objective’’ loss functions under this situation +We can calculate two “objective” loss functions under this situation conditioning on knowing for sure that nature has selected :math:`f_{0}`, in the first case, or :math:`f_{1}`, in the second case. 1. under :math:`f_{0}`, -.. math:: + .. math:: - V^{0}\left(\pi\right)=\begin{cases} - 0 & \text{if }\alpha\leq\pi,\\ - c+EV^{0}\left(\pi^{\prime}\right) & \text{if }\beta\leq\pi<\alpha,\\ - \bar L_{1} & \text{if }\pi<\beta. - \end{cases} + V^{0}\left(\pi\right)=\begin{cases} + 0 & \text{if }\alpha\leq\pi,\\ + c+EV^{0}\left(\pi^{\prime}\right) & \text{if }\beta\leq\pi<\alpha,\\ + \bar L_{1} & \text{if }\pi<\beta. + \end{cases} 2. under :math:`f_{1}` -.. math:: + .. math:: - V^{1}\left(\pi\right)=\begin{cases} - \bar L_{0} & \text{if }\alpha\leq\pi,\\ - c+EV^{1}\left(\pi^{\prime}\right) & \text{if }\beta\leq\pi<\alpha,\\ - 0 & \text{if }\pi<\beta. 
- \end{cases} + V^{1}\left(\pi\right)=\begin{cases} + \bar L_{0} & \text{if }\alpha\leq\pi,\\ + c+EV^{1}\left(\pi^{\prime}\right) & \text{if }\beta\leq\pi<\alpha,\\ + 0 & \text{if }\pi<\beta. + \end{cases} where :math:`\pi^{\prime}=\frac{\pi f_{0}\left(z^{\prime}\right)}{\pi f_{0}\left(z^{\prime}\right)+\left(1-\pi\right)f_{1}\left(z^{\prime}\right)}`. + Given a prior probability :math:`\pi_{0}`, the expected loss for the Bayesian is @@ -844,7 +846,7 @@ It is always positive. More details ============ -We can provide more insights by focusing soley the case in which +We can provide more insights by focusing on the case in which :math:`\pi^{*}=0.5=\pi_{0}`. .. code-block:: python3 @@ -852,9 +854,9 @@ We can provide more insights by focusing soley the case in which π_star = 0.5 Recall that when :math:`\pi^*=0.5`, the frequentist decision rule sets a -sample size ``t_optimal`` **ex ante** +sample size ``t_optimal`` **ex ante**. -For our parameter settings, we can compute it’s value: +For our parameter settings, we can compute its value: .. code-block:: python3 @@ -872,7 +874,7 @@ Distribution of Bayesian decision rule’s times to decide By using simulations, we compute the frequency distribution of time to deciding for the Bayesian decision rule and compare that time to the -frequentist rule’sfixed :math:`t`. +frequentist rule’s fixed :math:`t`. The following Python code creates a graph that shows the frequency distribution of Bayesian times to decide of Bayesian decision maker, diff --git a/source/rst/odu.rst b/source/rst/odu.rst index d568e07..d6042e8 100644 --- a/source/rst/odu.rst +++ b/source/rst/odu.rst @@ -16,7 +16,7 @@ In addition to what’s in Anaconda, this lecture deploys the libraries: .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation Overview @@ -72,7 +72,7 @@ want to consider. 
The Basic McCall Model ~~~~~~~~~~~~~~~~~~~~~~ -Recall that, `in the baseline model `, an +Recall that, :doc:`in the baseline model `, an unemployed worker is presented in each period with a permanent job offer at wage :math:`W_t`. @@ -444,7 +444,7 @@ We will also plot the optimal policy plt.show() The results fit well with our intuition from section `looking -forward <#looking-forward>`__. +forward <#Looking-Forward>`__. - The black line in the figure above corresponds to the function :math:`\bar w(\pi)` introduced there. @@ -670,7 +670,7 @@ Use the default parameters and ``Q_factory`` to compute an optimal policy. Your result should coincide closely with the figure for the optimal -policy `shown above <#odu-pol-vfi>`__. +policy `shown above <#Take-1:-Solution-by-VFI>`__. Try experimenting with different parameters, and confirm that the change in the optimal policy coincides with your intuition. @@ -1080,10 +1080,10 @@ not, when the actual ruling distribution is :math:`g` instead of Two countervailing effects are at work. -- if f generates successive wage offers, then :math:`w` is more likely to be low, but +- if :math:`f` generates successive wage offers, then :math:`w` is more likely to be low, but :math:`\pi` is moving up toward to 1, which lowers the reservation wage, i.e., the worker becomes less selective the longer he or she remains unemployed. -- if g generates wage offers, then :math:`w` is more likely to be high, but +- if :math:`g` generates wage offers, then :math:`w` is more likely to be high, but :math:`\pi` is moving downward toward 0, increasing the reservation wage, i.e., the worker becomes more selective the longer he or she remains unemployed. diff --git a/source/rst/optgrowth.rst b/source/rst/optgrowth.rst index 55ab548..a15f186 100644 --- a/source/rst/optgrowth.rst +++ b/source/rst/optgrowth.rst @@ -355,7 +355,7 @@ function. 
In our setting, we have the following key result - * A feasible consumption policy is optimal if and only if it is :math:`v^*`-greedy. +* A feasible consumption policy is optimal if and only if it is :math:`v^*`-greedy. The intuition is similar to the intuition for the Bellman equation, which was provided after :eq:`fpb30`. @@ -859,7 +859,7 @@ utility specification. Setting :math:`\gamma = 1.5`, compute and plot an estimate of the optimal policy. -Time how long this function takes to run, so you can compare it to faster code developed in the :doc:`next lecture ` +Time how long this function takes to run, so you can compare it to faster code developed in the :doc:`next lecture `. .. _og_ex2: diff --git a/source/rst/optgrowth_fast.rst b/source/rst/optgrowth_fast.rst index b3e6466..d2d3531 100644 --- a/source/rst/optgrowth_fast.rst +++ b/source/rst/optgrowth_fast.rst @@ -16,7 +16,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation Overview @@ -50,7 +50,8 @@ Let's start with some imports: import numpy as np import matplotlib.pyplot as plt from interpolation import interp - from numba import jit, njit, jitclass, prange, float64, int32 + from numba import jit, njit, prange, float64, int32 + from numba.experimental import jitclass from quantecon.optimize.scalar_maximization import brent_max %matplotlib inline diff --git a/source/rst/pandas_panel.rst b/source/rst/pandas_panel.rst index cb4cb3e..a33bafd 100644 --- a/source/rst/pandas_panel.rst +++ b/source/rst/pandas_panel.rst @@ -368,7 +368,7 @@ Grouping and summarizing data can be particularly useful for understanding large panel datasets. A simple way to summarize data is to call an `aggregation -method `__ +method `__ on the dataframe, such as ``.mean()`` or ``.max()``. 
For example, we can calculate the average real minimum wage for each @@ -510,6 +510,7 @@ object plt.title('Real minimum wages in 2015') plt.xlabel('US dollars') + plt.legend() plt.show() diff --git a/source/rst/perm_income.rst b/source/rst/perm_income.rst index 4a7cd7f..120c924 100644 --- a/source/rst/perm_income.rst +++ b/source/rst/perm_income.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -188,7 +188,7 @@ Regarding preferences, we assume the quadratic utility function u(c_t) = - (c_t - \gamma)^2 -where :math:`\gamma` is a bliss level of consumption +where :math:`\gamma` is a bliss level of consumption. .. note:: Along with this quadratic utility specification, we allow consumption to be negative. However, by choosing parameters appropriately, we can make the probability that the model generates negative consumption paths over finite time horizons as low as desired. @@ -229,7 +229,7 @@ With our quadratic preference specification, :eq:`sprob4` has the striking impli \mathbb{E}_t [c_{t+1}] = c_t -(In fact, quadratic preferences are *necessary* for this conclusion [#f2]_) +(In fact, quadratic preferences are *necessary* for this conclusion [#f2]_.) One way to interpret :eq:`sprob5` is that consumption will change only when "new information" about permanent income is revealed. @@ -243,7 +243,7 @@ The Optimal Decision Rule ------------------------- -Now let's deduce the optimal decision rule [#fod]_ +Now let's deduce the optimal decision rule [#fod]_. .. note:: @@ -475,7 +475,7 @@ A Simple Example with IID Income To gain some preliminary intuition on the implications of :eq:`pi_ssr`, let's look at a highly stylized example where income is just IID. -(Later examples will investigate more realistic income streams) +(Later examples will investigate more realistic income streams.) 
In particular, let :math:`\{w_t\}_{t = 1}^{\infty}` be IID and scalar standard normal, and let @@ -661,13 +661,13 @@ Representation :eq:`sprob16abcd` makes clear that * The state can be taken as :math:`(c_t, z_t)`. - * The endogenous part is :math:`c_t` and the exogenous part is :math:`z_t`. + * The endogenous part is :math:`c_t` and the exogenous part is :math:`z_t`. - * Debt :math:`b_t` has disappeared as a component of the state because it is encoded in :math:`c_t`. + * Debt :math:`b_t` has disappeared as a component of the state because it is encoded in :math:`c_t`. * Consumption is a random walk with innovation :math:`(1-\beta) U (I-\beta A)^{-1} C w_{t+1}`. - * This is a more explicit representation of the martingale result in :eq:`sprob5`. + * This is a more explicit representation of the martingale result in :eq:`sprob5`. .. _coint_pi: @@ -1102,7 +1102,7 @@ You will be able to verify that the first-order condition is Using :math:`\beta R = 1` gives :eq:`sprob4` in the two-period case. -The proof for the general case is similar +The proof for the general case is similar. diff --git a/source/rst/perm_income_cons.rst b/source/rst/perm_income_cons.rst index 33aa9cc..a21c2af 100644 --- a/source/rst/perm_income_cons.rst +++ b/source/rst/perm_income_cons.rst @@ -18,7 +18,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -52,7 +52,7 @@ In this lecture, we'll We'll then use these characterizations to construct a simple model of cross-section wealth and consumption dynamics in the spirit of Truman Bewley :cite:`Bewley86`. -(Later we'll study other Bewley models---see :doc:`this lecture `) +(Later we'll study other Bewley models---see :doc:`this lecture `.) 
The model will prove useful for illustrating concepts such as @@ -554,13 +554,13 @@ The examples differ only in the initial states with which we endow the consumer All other parameter values are kept the same in the two examples -- In the first example, all consumers begin with zero nonfinancial income and zero debt. +- In the first example, all consumers begin with zero nonfinancial income and zero debt. - * The consumers are thus *ex-ante* identical. + * The consumers are thus *ex-ante* identical. -- In the second example, while all begin with zero debt, we draw their initial income levels from the invariant distribution of financial income. +- In the second example, while all begin with zero debt, we draw their initial income levels from the invariant distribution of financial income. - * Consumers are *ex-ante* heterogeneous. + * Consumers are *ex-ante* heterogeneous. In the first example, consumers' nonfinancial income paths display pronounced transients early in the sample @@ -869,7 +869,7 @@ Across the group of people being analyzed, risk-free loans are in zero excess su We have arranged primitives so that :math:`R = \beta^{-1}` clears the market for risk-free loans at zero aggregate excess supply. -So the risk-free loans are being made from one person to another within our closed set of agent. +So the risk-free loans are being made from one person to another within our closed set of agents. There is no need for foreigners to lend to our group. diff --git a/source/rst/rational_expectations.rst b/source/rst/rational_expectations.rst index f164c5f..a2acd43 100644 --- a/source/rst/rational_expectations.rst +++ b/source/rst/rational_expectations.rst @@ -17,7 +17,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview ======== @@ -91,7 +91,7 @@ A Simple Static Example of the Big Y, Little y Trick Consider a static model in which a collection of :math:`n` firms produce a homogeneous good that is sold in a competitive market. -Each of these :math:`n` firms sell output :math:`y`. +Each of these :math:`n` firms sells output :math:`y`. The price :math:`p` of the good lies on an inverse demand curve @@ -652,7 +652,7 @@ Recall the planner's problem :ref:`described above ` #. Solve it using the same parameter values in exercise 1 - * :math:`a_0= 100, a_1= 0.05, \beta = 0.95, \gamma=10` + * :math:`a_0= 100, a_1= 0.05, \beta = 0.95, \gamma=10` #. Represent the solution in the form :math:`Y_{t+1} = \kappa_0 + \kappa_1 Y_t`. diff --git a/source/rst/re_with_feedback.rst b/source/rst/re_with_feedback.rst index d971467..c392bc6 100644 --- a/source/rst/re_with_feedback.rst +++ b/source/rst/re_with_feedback.rst @@ -20,7 +20,7 @@ In addition to what's in Anaconda, this lecture deploys the following libraries: .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon .. code-block:: ipython @@ -42,12 +42,12 @@ the price level to the money supply. Cagan did not use a rational expectations version of his model, but Sargent :cite:`Sargent77hyper` did. -We study this model because it is intrinsically interesting and also because it has a mathematical structure that -also appears in virtually all linear rational expectations model, namely, that a key endogenous variable equals +We study a rational expectations version of this model because it is intrinsically interesting and because it +has a mathematical structure that +appears in virtually all linear rational expectations models, namely, that a key endogenous variable equals a mathematical expectation of a geometric sum of future values of another variable. 
-In a rational expectations version of Cagan's model, the endogenous variable is the price level or rate of inflation and -the other variable is the money supply or the rate of change in the money supply. +The model determines the price level or rate of inflation as a function of the money supply or the rate of change in the money supply. In this lecture, we'll encounter: @@ -59,9 +59,10 @@ In this lecture, we'll encounter: * a use of eigenvector decompositions of matrices that allowed Blanchard and Khan (1981) :cite:`Blanchard_Khan` and Whiteman (1983) :cite:`Whiteman` to solve a class of linear rational expectations models +* how to use **SymPy** to get analytical formulas for some key objects comprising a rational expectations equilibrium -Cagan's model with rational expectations -is formulated as an **expectational difference equation** whose solution is a rational expectations equilibrium. +We formulate a version of Cagan's model under rational expectations +as an **expectational difference equation** whose solution is a rational expectations equilibrium. We'll start this lecture with a quick review of deterministic (i.e., non-random) first-order and second-order linear difference equations. @@ -71,28 +72,26 @@ first-order and second-order linear difference equations. Linear difference equations ============================= -In this quick review of linear difference equations, we'll use the *backward shift* or *lag* operator :math:`L` +We'll use the *backward shift* or *lag* operator :math:`L`. The lag operator :math:`L` maps a sequence :math:`\{x_t\}_{t=0}^\infty` into the sequence :math:`\{x_{t-1}\}_{t=0}^\infty` -We'll can use :math:`L` in linear difference equations by using the equality +We'll deploy :math:`L` by using the equality :math:`L x_t \equiv x_{t-1}` in algebraic expressions. Further, the inverse :math:`L^{-1}` of the lag operator is the *forward shift* operator. 
-In linear difference equations, we'll often use the equaltiy :math:`L^{-1} x_t \equiv x_{t+1}` in the the algebra -below. +We'll often use the equality :math:`L^{-1} x_t \equiv x_{t+1}` below. -The algebra of lag and forward shift operators often simplifies formulas for linear difference equations and their -solutions. +The algebra of lag and forward shift operators can simplify representing and solving linear difference equations. First order ----------- We want to solve a linear first-order scalar difference equation. -First, let :math:`|\lambda | < 1`, and let +Let :math:`|\lambda | < 1` and let :math:`\{u_t\}_{t=-\infty}^\infty` be a bounded sequence of scalar real numbers. @@ -156,14 +155,14 @@ To verify that this is a solution, check the consequences of operating on both sides of equation :eq:`equn_5` by :math:`(1 -\lambda L)` and compare to equation :eq:`equn_1`. -Solution :eq:`equn_2` exists for :math:`|\lambda | < 1` because -the distributed lag in :math:`u` converges. +For any bounded :math:`\{u_t\}` sequence, solution :eq:`equn_2` exists for :math:`|\lambda | < 1` because +the **distributed lag** in :math:`u` converges. Solution :eq:`equn_5` exists when :math:`|\lambda| > 1` because the **distributed lead** in :math:`u` converges. When :math:`|\lambda | > 1`, the distributed lag in :math:`u` in :eq:`equn_2` may -diverge, so that a solution of this form does not exist. +diverge, in which case a solution of this form does not exist. The distributed lead in :math:`u` in :eq:`equn_5` need not converge when :math:`|\lambda| < 1`. @@ -196,14 +195,14 @@ or y_{t+1} = \lambda_1 y_t - \lambda_2^{-1} \sum_{j=0}^\infty \lambda_2^{-j} u_{t+j+1} . Thus, we obtained equation :eq:`equn_7` by -solving stable roots (in this case :math:`\lambda_1`) **backward**, and -unstable roots (in this case :math:`\lambda_2`) **forward**. +solving a stable root (in this case :math:`\lambda_1`) **backward**, and an +unstable root (in this case :math:`\lambda_2`) **forward**. 
Equation :eq:`equn_7` has a form that we shall encounter often. -:math:`\lambda_1 y_t` is called the **feedback part** and -:math:`-{\frac{\lambda_2^{-1}}{1 - \lambda_2^{-1}L^{-1}}} u_{t+1}` is -called the **feedforward part** of the solution. +* :math:`\lambda_1 y_t` is called the **feedback part** + +* :math:`-{\frac{\lambda_2^{-1}}{1 - \lambda_2^{-1}L^{-1}}} u_{t+1}` is called the **feedforward part** @@ -212,6 +211,11 @@ called the **feedforward part** of the solution. Illustration: Cagan's Model ============================= +Now let's use linear difference equations to represent and solve Sargent's :cite:`Sargent77hyper` rational expectations version of +Cagan’s model :cite:`Cagan` that connects the price level to the public's anticipations of future money supplies. + +Cagan did not use a rational expectations version of his model, but Sargent :cite:`Sargent77hyper` did. + Let - :math:`m_t^d` be the log of the demand for money @@ -240,6 +244,9 @@ the money supply :math:`m_t` by where :math:`\lambda \equiv \frac{\beta}{1+\beta} \in (0,1)`. +(We note that the characteristic polynomial is :math:`1 - \lambda^{-1} z^{-1} = 0` so that the zero of the +characteristic polynomial in this case is :math:`\lambda \in (0,1)` which here is **inside** the unit circle.) + Solving the first order difference equation :eq:`equation_1` forward gives .. math:: @@ -259,6 +266,8 @@ that is indexed by the real number :math:`c \in {\bf R}`. Because we want to focus on stable solutions, we set :math:`c=0`. +Equation :eq:`equation_1a` attributes **perfect foresight** about the money supply sequence to the holders of real balances. + We begin by assuming that the log of the money supply is **exogenous** in the sense that it is an autonomous process that does not feed back on the log of the price level. @@ -281,7 +290,7 @@ absolute values, and :math:`G` is a :math:`1 \times n` selector matrix. 
Variables appearing in the vector :math:`x_t` contain information that might help predict future values of the money supply. -We’ll take an example in which :math:`x_t` includes only :math:`m_t`, +We’ll start with an example in which :math:`x_t` includes only :math:`m_t`, possibly lagged values of :math:`m`, and a constant. An example of such an :math:`\{m_t\}` process that fits info state space @@ -292,7 +301,9 @@ equation where the zeros of the characteristic polynomial :math:`(1 - \rho_1 z - \rho_2 z^2)` are strictly greater than :math:`1` -in modulus +in modulus. + +(Please see :doc:`this` QuantEcon lecture for more about characteristic polynomials and their role in solving linear difference equations.) We seek a stable or non-explosive solution of the difference equation :eq:`equation_1` that obeys the system comprised of :eq:`equation_1`-:eq:`equation_3`. @@ -300,7 +311,7 @@ obeys the system comprised of :eq:`equation_1`-:eq:`equation_3`. By stable or non-explosive, we mean that neither :math:`m_t` nor :math:`p_t` diverges as :math:`t \rightarrow + \infty`. -This means that we are shutting down the term :math:`c \lambda^{-t}` in equation :eq:`equation_1a` above by setting :math:`c=0` +This requires that we shut down the term :math:`c \lambda^{-t}` in equation :eq:`equation_1a` above by setting :math:`c=0` The solution we are after is @@ -361,8 +372,9 @@ Here is Python code [0, 1, 0]]) G = np.array([[0, 1, 0]]) -The matrix :math:`A` has one eigenvalue equal to unity that is -associated with the :math:`A_{11}` component that captures a +The matrix :math:`A` has one eigenvalue equal to unity. + +It is associated with the :math:`A_{11}` component that captures a constant component of the state :math:`x_t`. We can verify that the two eigenvalues of :math:`A` not associated with @@ -378,7 +390,7 @@ modulus. 
(abs(eigvals) <= 1).all() -Now let’s compute :math:`F` in formulas :eq:`equation_4` and :eq:`equation_5` +Now let’s compute :math:`F` in formulas :eq:`equation_4` and :eq:`equation_5`. .. code-block:: python3 @@ -426,7 +438,7 @@ initial value :math:`x_0`. In the above graph, why is the log of the price level always less than the log of the money supply? -The answer is because +Because - according to equation :eq:`equation_2`, :math:`p_t` is a geometric weighted average of current and future values of :math:`m_t`, and @@ -493,18 +505,18 @@ become .. math:: F = (1-\lambda) (1 -\lambda \rho)^{-1} . -and the log the log price level satisfies +so that the log price level satisfies .. math:: p_t = F m_t . Please keep these formulas in mind as we investigate an alternative -route to and interpretation of the formula for :math:`F`. +route to and interpretation of our formula for :math:`F`. Another perspective =================== Above, we imposed stability or non-explosiveness on the solution of the key difference equation :eq:`equation_1` -in Cagan's model by solving the unstable root :math:`\lambda^{-1}` forward. +in Cagan's model by solving the unstable root of the characteristic polynomial forward. To shed light on the mechanics involved in imposing stability on a solution of a potentially unstable system of linear difference equations @@ -536,18 +548,18 @@ Transition matrix :math:`H` has eigenvalues :math:`\rho \in (0,1)` and Because an eigenvalue of :math:`H` exceeds unity, if we iterate on equation :eq:`equation_9` starting from an arbitrary initial vector -:math:`y_0 = \begin{bmatrix} m_0 \\ p_0 \end{bmatrix}`, we discover that +:math:`y_0 = \begin{bmatrix} m_0 \\ p_0 \end{bmatrix}` with :math:`m_0 >0, p_0 >0`, we discover that in general absolute values of both components of :math:`y_t` diverge toward :math:`+\infty` as :math:`t \rightarrow + \infty`. 
-To substantiate this claim, we can use the eigenector matrix +To substantiate this claim, we can use the eigenvector matrix decomposition of :math:`H` that is available to us because the eigenvalues of :math:`H` are distinct .. math:: H = Q \Lambda Q^{-1} . Here :math:`\Lambda` is a diagonal matrix of eigenvalues of :math:`H` -and :math:`Q` is a matrix whose columns are eigenvectors of the +and :math:`Q` is a matrix whose columns are eigenvectors associated with the corresponding eigenvalues. Note that @@ -562,7 +574,7 @@ For almost all initial vectors :math:`y_0`, the presence of the eigenvalue :math:`\lambda^{-1} > 1` causes both components of :math:`y_t` to diverge in absolute value to :math:`+\infty`. -To explore this outcome in more detail, we use the following +To explore this outcome in more detail, we can use the following transformation .. math:: y^*_t = Q^{-1} y_t @@ -577,7 +589,7 @@ Staring at this equation indicates that unless .. math:: :label: equation_11 - y^*_0 = \begin{bmatrix} y^*_{1,0} \cr 0 \end{bmatrix} , + y^*_0 = \begin{bmatrix} y^*_{1,0} \cr 0 \end{bmatrix} the path of :math:`y^*_t` and therefore the paths of both components of :math:`y_t = Q y^*_t` will diverge in absolute value as @@ -594,12 +606,11 @@ that But note that since :math:`y_0 = \begin{bmatrix} m_0 \cr p_0 \end{bmatrix}` and :math:`m_0` -is given to us an an initial condition, it has to be :math:`p_0` that -does all the adjusting to satisfy this equation. +is given to us as an initial condition, :math:`p_0` has to do all the adjusting to satisfy this equation. Sometimes this situation is described by saying that while :math:`m_0` is truly a **state** variable, :math:`p_0` is a **jump** variable that -is free to adjust at :math:`t=0` in order to satisfy the equation. +must adjust at :math:`t=0` in order to satisfy the equation. 
Thus, in a nutshell the unique value of the vector :math:`y_0` for which the paths of :math:`y_t` do not diverge must have second component @@ -623,14 +634,14 @@ restriction that is equivalent to where :math:`Q^{ij}` denotes the :math:`(i,j)` component of :math:`Q^{-1}`. -Solving this equation for :math:`p_0` we find +Solving this equation for :math:`p_0`, we find .. math:: :label: equation_13 p_0 = - (Q^{22})^{-1} Q^{21} m_0. -This is the unique **stabilizing value** of :math:`p_0` as a function of +This is the unique **stabilizing value** of :math:`p_0` expressed as a function of :math:`m_0`. Refining the formula @@ -661,7 +672,7 @@ So we can write p_0 = Q_{21} Q_{11}^{-1} m_0 . -It can be verified that this formula replicates itself over time so that +It can be verified that this formula replicates itself over time in the sense that .. math:: :label: equation_15 @@ -681,7 +692,7 @@ stable eigenvalue :math:`\rho` is proportional to Notice that if we set :math:`A=\rho` and :math:`G=1` in our earlier formula for :math:`p_t` we get -.. math:: Q = G (I - \lambda A)^{-1} m_t = (1-\lambda) (1 - \lambda \rho)^{-1} m_t +.. math:: p_t = G (I - \lambda A)^{-1} m_t = (1-\lambda) (1 - \lambda \rho)^{-1} m_t , a formula that is equivalent with @@ -695,7 +706,7 @@ Some remarks about feedback --------------------------- We have expressed :eq:`equation_8` in what superficially appears to be a form in -which :math:`y_{t+1}` feeds back on :math:`y_t`. even though what we +which :math:`y_{t+1}` feeds back on :math:`y_t`, even though what we actually want to represent is that the component :math:`p_t` feeds **forward** on :math:`p_{t+1}`, and through it, on future :math:`m_{t+j}`, :math:`j = 0, 1, 2, \ldots`. @@ -717,9 +728,8 @@ level. 
Log money supply feeds back on log price level ============================================== -The same pattern of eigenvalues splitting around unity, with one being -below unity and another greater than unity, sometimes continues to -prevail when there is *feedback* from the log price level to the log +An arrangement of eigenvalues that split around unity, with one being +below unity and another being greater than unity, sometimes prevails when there is *feedback* from the log price level to the log money supply. Let the feedback rule be @@ -729,20 +739,18 @@ Let the feedback rule be m_{t+1} = \rho m_t + \delta p_t -where :math:`\rho \in (0,1)` as before and where we shall now allow +where :math:`\rho \in (0,1)` and where we shall now allow :math:`\delta \neq 0`. -However, -:math:`\delta` cannot be too large if things are to fit together as we -wish to deliver a stable system for some initial value :math:`p_0` that we want to determine uniquely. -. +**Warning:** If things are to fit together as we +wish to deliver a stable system for some initial value :math:`p_0` that we want to determine uniquely, :math:`\delta` cannot be too large. The forward-looking equation :eq:`equation_1` continues to describe equality between the demand and supply of money. We assume that equations :eq:`equation_1` and :eq:`equation_16` govern :math:`y_t \equiv \begin{bmatrix} m_t \cr p_t \end{bmatrix}` for -:math:`t \geq 0` +:math:`t \geq 0`. The transition matrix :math:`H` in the law of motion @@ -752,16 +760,16 @@ now becomes .. math:: H = \begin{bmatrix} \rho & \delta \\ - (1-\lambda)/\lambda & \lambda^{-1} \end{bmatrix} . -We take :math:`m_0` as a given intial condition and as before seek an +We take :math:`m_0` as a given initial condition and as before seek an initial value :math:`p_0` that stabilizes the system in the sense that :math:`y_t` converges as :math:`t \rightarrow + \infty`. 
-Our approach is identical with that followed above and is based on an +Our approach is identical with the one followed above and is based on an eigenvalue decomposition in which, cross our fingers, one eigenvalue exceeds unity and the other is less than unity in absolute value. When :math:`\delta \neq 0` as we now assume, the eigenvalues of -:math:`H` are no longer :math:`\rho \in (0,1)` and +:math:`H` will no longer be :math:`\rho \in (0,1)` and :math:`\lambda^{-1} > 1` We’ll just calculate them and apply the same algorithm that we used @@ -802,7 +810,7 @@ Let’s write and execute some Python code that will let us explore how outcomes H_eigvals() -Notice that a negative δ will not imperil the stability of the matrix +Notice that a negative :math:`\delta` will not imperil the stability of the matrix :math:`H`, even if it has a big absolute value. .. code-block:: python3 @@ -815,14 +823,14 @@ Notice that a negative δ will not imperil the stability of the matrix # large negative δ H_eigvals(δ=-1.5) -A sufficiently small positive δ also causes no problem. +A sufficiently small positive :math:`\delta` also causes no problem. .. code-block:: python3 # sufficiently small positive δ H_eigvals(δ=0.05) -But a large enough positive δ makes both eigenvalues of :math:`H` +But a large enough positive :math:`\delta` makes both eigenvalues of :math:`H` strictly greater than unity in modulus. For example, @@ -833,7 +841,9 @@ For example, We want to study systems in which one eigenvalue exceeds unity in modulus while the other is less than unity in modulus, so we avoid -values of :math:`\delta` that are too large +values of :math:`\delta` that are too large. + +That is, we want to avoid too much positive feedback from :math:`p_t` to :math:`m_{t+1}`. .. code-block:: python3 @@ -900,8 +910,8 @@ exist. 
Big :math:`P`, little :math:`p` interpretation =============================================== -It is helpful to view our solutions with feedback from the price level or inflation to money or the rate of money -creation in terms of the Big :math:`K`, little :math:`k` idea discussed in :doc:`Rational Expectations Models` +It is helpful to view our solutions of difference equations having feedback from the price level or inflation to money or the rate of money +creation in terms of the Big :math:`K`, little :math:`k` idea discussed in :doc:`Rational Expectations Models`. This will help us sort out what is taken as given by the decision makers who use the difference equation :eq:`equation_2` to determine :math:`p_t` as a function of their forecasts of future values of @@ -909,11 +919,11 @@ difference equation :eq:`equation_2` to determine :math:`p_t` as a function of t Let's write the stabilizing solution that we have computed using the eigenvector decomposition of :math:`H` as -:math:`P_t = F^* m_t` where +:math:`P_t = F^* m_t`, where .. math:: - F^* = Q_{21} Q_{11}^{-1} + F^* = Q_{21} Q_{11}^{-1} . Then from :math:`P_{t+1} = F^* m_{t+1}` and :math:`m_{t+1} = \rho m_t + \delta P_t` we can deduce the recursion :math:`P_{t+1} = F^* \rho m_t + F^* \delta P_t` and create the stacked system @@ -930,7 +940,7 @@ or where :math:`x_t = \begin{bmatrix} m_t \cr P_t \end{bmatrix}`. -Then apply formula :eq:`equation_5` for :math:`F` to deduce that +Apply formula :eq:`equation_5` for :math:`F` to deduce that .. math:: @@ -942,14 +952,14 @@ which implies that p_t = \begin{bmatrix} F_1 & F_2 \end{bmatrix} \begin{bmatrix} m_t \cr F^* m_t \end{bmatrix} = F_1 m_t + F_2 F^* m_t -so that we expect to have +so that we can anticipate that .. math:: F^* = F_1 + F_2 F^* -We verify this equality in the next block of Python code that implements the following +We shall verify this equality in the next block of Python code that implements the following computations. 1. 
For the system with :math:`\delta\neq 0` so that there is feedback, @@ -965,7 +975,7 @@ computations. from equation :eq:`equation_5` above. 3. We compute :math:`F_1 + F_2 F^*` and compare it - with :math:`F^*` and verify equality. + with :math:`F^*` and check for the anticipated equality. .. code-block:: python3 @@ -1003,16 +1013,16 @@ Compare :math:`F^*` with :math:`F_1 + F_2 F^*` F_check[0] + F_check[1] * F_star, F_star -Fun with Sympy code +Fun with SymPy code ========================= -This section is a small gift for readers who have made it this far. +This section is a gift for readers who have made it this far. -It puts Sympy to work on our model. +It puts SymPy to work on our model. -Thus, we use Sympy to compute some of the key objects comprising the eigenvector decomposition of :math:`H`. +Thus, we use SymPy to compute some key objects comprising the eigenvector decomposition of :math:`H`. -:math:`H` with nonzero :math:`\delta`. +We start by generating an :math:`H` with nonzero :math:`\delta`. .. code-block:: python3 @@ -1034,7 +1044,7 @@ Thus, we use Sympy to compute some of the key objects comprising the eigenvecto H1.eigenvects() -:math:`H` with :math:`\delta` being zero. +Now let's compute :math:`H` when :math:`\delta` is zero. .. code-block:: python3 @@ -1052,19 +1062,19 @@ Thus, we use Sympy to compute some of the key objects comprising the eigenvecto H2.eigenvects() -Below we do induce sympy to do the following fun things for us analytically: +Below we induce SymPy to do the following fun things for us analytically: 1. We compute the matrix :math:`Q` whose first column is the eigenvector associated with :math:`\rho`. and whose second column is the eigenvector associated with :math:`\lambda^{-1}`. -2. We use sympy to compute the inverse :math:`Q^{-1}` of :math:`Q` +2. We use SymPy to compute the inverse :math:`Q^{-1}` of :math:`Q` (both in symbols). -3. We use sympy to compute :math:`Q_{21} Q_{11}^{-1}` (in symbols). +3. 
We use SymPy to compute :math:`Q_{21} Q_{11}^{-1}` (in symbols). 4. Where :math:`Q^{ij}` denotes the :math:`(i,j)` component of - :math:`Q^{-1}`, weighted use sympy to compute + :math:`Q^{-1}`, we use SymPy to compute :math:`- (Q^{22})^{-1} Q^{21}` (again in symbols) diff --git a/source/rst/samuelson.rst b/source/rst/samuelson.rst index 2fae990..2d2acc3 100644 --- a/source/rst/samuelson.rst +++ b/source/rst/samuelson.rst @@ -14,7 +14,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview @@ -60,7 +60,7 @@ Samuelson's Model Samuelson used a *second-order linear difference equation* to represent a model of national output based on three components: -- a *national output identity* asserting that national outcome is the +- a *national output identity* asserting that national output or national income is the sum of consumption plus investment plus government purchases. - a Keynesian *consumption function* asserting that consumption at @@ -71,11 +71,11 @@ represent a model of national output based on three components: times the difference in output between period :math:`t-1` and :math:`t-2`. -- the idea that consumption plus investment plus government purchases - constitute *aggregate demand,* which automatically calls forth an - equal amount of *aggregate supply*. +Consumption plus investment plus government purchases +constitute *aggregate demand,* which automatically calls forth an +equal amount of *aggregate supply*. -(To read about linear difference equations see `here `__ or chapter IX of :cite:`Sargent1987`) +(To read about linear difference equations see `here `__ or chapter IX of :cite:`Sargent1987`.) @@ -102,7 +102,7 @@ gives rise to recurrent irregular business cycles. (To read about stochastic linear difference equations see chapter XI of -:cite:`Sargent1987`) +:cite:`Sargent1987`.) 
Details @@ -126,7 +126,7 @@ Let's assume that consumption function :math:`C_t = a Y_{t-1} + \gamma`. - :math:`b` is the "accelerator coefficient" in the "investment - accelerator" :math:`I\_t = b (Y\_{t-1} - Y\_{t-2})`. + accelerator" :math:`I_t = b (Y_{t-1} - Y_{t-2})`. - :math:`\{\epsilon_{t}\}` is an IID sequence standard normal random variables. @@ -157,7 +157,7 @@ and the national income identity - The parameter :math:`a` is peoples' *marginal propensity to consume* out of income - equation :eq:`consumption` asserts that people consume a fraction of - math:`a \in (0,1)` of each additional dollar of income. + :math:`a \in (0,1)` of each additional dollar of income. - The parameter :math:`b > 0` is the investment accelerator coefficient - equation :eq:`accelerator` asserts that people invest in physical capital when @@ -186,7 +186,7 @@ require initial values We'll ordinarily set the parameters :math:`(a,b)` so that starting from an arbitrary pair of initial conditions -:math:`(\bar Y_{-1}, \bar Y_{-2})`, national income :math:`Y\_t` converges to +:math:`(\bar Y_{-1}, \bar Y_{-2})`, national income :math:`Y_t` converges to a constant value as :math:`t` becomes large. We are interested in studying @@ -276,7 +276,7 @@ These can also be represented as .. math:: \lambda_2 = r (cos (\omega) - i \sin(\omega)) (To read about the polar form, see -`here `__) +`here `__) Given **initial conditions** :math:`Y_{-1}, Y_{-2}`, we want to generate a **solution** of the difference equation :eq:`second_stochastic2`. @@ -785,14 +785,6 @@ We can also use sympy to compute analytic formulas for the roots sympy.solve(z**2 - r1*z - r2, z) - -.. math:: - - \left [ \frac{\rho_{1}}{2} - \frac{1}{2} \sqrt{\rho_{1}^{2} + 4 \rho_{2}}, - \quad \frac{\rho_{1}}{2} + \frac{1}{2} \sqrt{\rho_{1}^{2} + 4 \rho_{2}}\right ] - - - .. code-block:: python3 a = Symbol("α") @@ -802,13 +794,6 @@ We can also use sympy to compute analytic formulas for the roots sympy.solve(z**2 - r1*z - r2, z) -.. 
math:: - - \left [ \frac{\alpha}{2} + \frac{\beta}{2} - \frac{1}{2} \sqrt{\alpha^{2} + - 2 \alpha \beta + \beta^{2} - 4 \beta}, \quad \frac{\alpha}{2} + - \frac{\beta}{2} + \frac{1}{2} \sqrt{\alpha^{2} + 2 \alpha \beta + - \beta^{2} - 4 \beta}\right ] - Stochastic Shocks @@ -1295,14 +1280,6 @@ Samuelson model using a method in the ``LinearStateSpace`` class y1.shape - - -.. math:: - - \left ( 2, \quad 6, \quad 1\right ) - - - Now let's compute the zeros of the characteristic polynomial by simply calculating the eigenvalues of :math:`A` diff --git a/source/rst/scalar_dynam.rst b/source/rst/scalar_dynam.rst index 1ce8efb..5a9b268 100644 --- a/source/rst/scalar_dynam.rst +++ b/source/rst/scalar_dynam.rst @@ -129,7 +129,7 @@ This made analysis of dynamics very easy. When models are nonlinear, however, the situation can be quite different. -For example, recall how we `previously studied `__ the law of motion for the Solow growth model, a simplified version of which is +For example, recall how we `previously studied `__ the law of motion for the Solow growth model, a simplified version of which is .. math:: :label: solow_lom2 diff --git a/source/rst/short_path.rst b/source/rst/short_path.rst index 729f4be..b9c63e3 100644 --- a/source/rst/short_path.rst +++ b/source/rst/short_path.rst @@ -60,7 +60,7 @@ We wish to travel from node (vertex) A to node G at minimum cost * Arrows (edges) indicate the movements we can take. * Numbers on edges indicate the cost of traveling that edge. -(Graphs such as the one above are called **weighted directed graphs**) +(Graphs such as the one above are called weighted `directed graphs `_.) Possible interpretations of the graph include diff --git a/source/rst/sir_model.rst b/source/rst/sir_model.rst index a1ac756..66027ec 100644 --- a/source/rst/sir_model.rst +++ b/source/rst/sir_model.rst @@ -283,7 +283,7 @@ As expected, lower effective transmission rates defer the peak of infections. They also lead to a lower peak in current cases. 
-Here is cumulative cases, as a fraction of population: +Here are cumulative cases, as a fraction of population: .. code-block:: ipython3 @@ -344,7 +344,7 @@ Let's calculate the time path of infected people: c_paths.append(c_path) -This is current cases under the different scenarios: +These are current cases under the different scenarios: .. code-block:: ipython3 diff --git a/source/rst/time_series_with_matrices.rst b/source/rst/time_series_with_matrices.rst new file mode 100644 index 0000000..bc27c39 --- /dev/null +++ b/source/rst/time_series_with_matrices.rst @@ -0,0 +1,427 @@ +.. _time_series_with_matrices: + +.. include:: /_static/includes/header.raw + +.. highlight:: python3 + +******************************************** +Univariate Time Series with Matrix Algebra +******************************************** + +.. contents:: :depth: 2 + + + +Overview +======== + +This lecture uses matrices to solve some linear difference equations. + +As a running example, we’ll study a **second-order linear difference +equation** that was the key technical tool in Paul Samuelson’s 1939 +article :cite:`Samuelson1939` that introduced the **multiplier-accelerator** model. + +This model became the workhorse that powered early econometric versions of +Keynesian macroeconomic models in the United States. + +You can read about the details of that model in :doc:`this` +QuantEcon lecture. + +(That lecture also describes some technicalities about second-order linear difference equations.) + +We'll also study a "perfect foresight" model of stock prices that involves solving +a "forward-looking" linear difference equation. + +We will use the following imports: + +.. code-block:: ipython + + import numpy as np + import matplotlib.pyplot as plt + %matplotlib inline + +Samuelson's model +================== + + +Let :math:`t = 0, \pm 1, \pm 2, \ldots` index time. + +For :math:`t = 1, 2, 3, \ldots, T` suppose that + +.. 
math:: + :label: tswm_1 + + + y_{t} = \alpha_{0} + \alpha_{1} y_{t-1} + \alpha_{2} y_{t-2} + + +where we assume that :math:`y_0` and :math:`y_{-1}` are given numbers +that we take as **initial conditions**. + +In Samuelson's model, :math:`y_t` stood for **national income** or perhaps a different +measure of aggregate activity called **gross domestic product** (GDP) at time :math:`t`. + +Equation :eq:`tswm_1` is called a **second-order linear difference equation**. + +But actually, it is a collection of :math:`T` simultaneous linear +equations in the :math:`T` variables :math:`y_1, y_2, \ldots, y_T`. + +**Note:** To be able to solve a second-order linear difference +equation, we require two **boundary conditions** that can take the form +either of two **initial conditions** or two **terminal conditions** or +possibly one of each. + +Let’s write our equations as a stacked system + +.. math:: + + + \underset{\equiv A}{\underbrace{\left[\begin{array}{cccccccc} + 1 & 0 & 0 & 0 & \cdots & 0 & 0 & 0\\ + -\alpha_{1} & 1 & 0 & 0 & \cdots & 0 & 0 & 0\\ + -\alpha_{2} & -\alpha_{1} & 1 & 0 & \cdots & 0 & 0 & 0\\ + 0 & -\alpha_{2} & -\alpha_{1} & 1 & \cdots & 0 & 0 & 0\\ + \vdots & \vdots & \vdots & \vdots & \cdots & \vdots & \vdots & \vdots\\ + 0 & 0 & 0 & 0 & \cdots & -\alpha_{2} & -\alpha_{1} & 1 + \end{array}\right]}}\left[\begin{array}{c} + y_{1}\\ + y_{2}\\ + y_{3}\\ + y_{4}\\ + \vdots\\ + y_{T} + \end{array}\right]=\underset{\equiv b}{\underbrace{\left[\begin{array}{c} + \alpha_{0}+\alpha_{1}y_{0}+\alpha_{2}y_{-1}\\ + \alpha_{0}+\alpha_{2}y_{0}\\ + \alpha_{0}\\ + \alpha_{0}\\ + \vdots\\ + \alpha_{0} + \end{array}\right]}} + +or + +.. math:: A y = b + +where + +.. math:: y = \begin{bmatrix} y_1 \cr y_2 \cr \cdots \cr y_T \end{bmatrix} + +Evidently :math:`y` can be computed from + +.. math:: + + + y = A^{-1} b + +The vector :math:`y` is a complete time path :math:`\{y_t\}_{t=1}^T`. 
+ +Let’s put Python to work on an example that captures the flavor of +Samuelson’s multiplier-accelerator model. + +We'll set parameters equal to the same values we used in :doc:`this QuantEcon lecture`. + +.. code-block:: python3 + + T = 80 + + # parameters + 𝛼0 = 10.0 + 𝛼1 = 1.53 + 𝛼2 = -.9 + + y_1 = 28. # y_{-1} + y0 = 24. + +.. code-block:: python3 + + # construct A and b + A = np.zeros((T, T)) + + for i in range(T): + A[i, i] = 1 + + if i-1 >= 0: + A[i, i-1] = -𝛼1 + + if i-2 >= 0: + A[i, i-2] = -𝛼2 + + b = np.ones(T) * 𝛼0 + b[0] = 𝛼0 + 𝛼1 * y0 + 𝛼2 * y_1 + b[1] = 𝛼0 + 𝛼2 * y0 + +Let’s look at the matrix :math:`A` and the vector :math:`b` for our +example. + +.. code-block:: python3 + + A, b + +Now let’s solve for the path of :math:`y`. + +If :math:`y_t` is GNP at time :math:`t`, then we have a version of +Samuelson’s model of the dynamics for GNP. + +.. code-block:: python3 + + A_inv = np.linalg.inv(A) + + y = A_inv @ b + +.. code-block:: python3 + + plt.plot(np.arange(T)+1, y) + plt.xlabel('t') + plt.ylabel('y') + + plt.show() + +If we set both initial values at the **steady state** value of :math:`y_t`, namely, + +.. math:: + + + y_{0} = y_{-1} = \frac{\alpha_{0}}{1 - \alpha_{1} - \alpha_{2}} + +then :math:`y_{t}` will be constant + +.. code-block:: python3 + + y_1_steady = 𝛼0 / (1 - 𝛼1 - 𝛼2) # y_{-1} + y0_steady = 𝛼0 / (1 - 𝛼1 - 𝛼2) + + b_steady = np.ones(T) * 𝛼0 + b_steady[0] = 𝛼0 + 𝛼1 * y0_steady + 𝛼2 * y_1_steady + b_steady[1] = 𝛼0 + 𝛼2 * y0_steady + +.. code-block:: python3 + + y_steady = A_inv @ b_steady + +.. code-block:: python3 + + plt.plot(np.arange(T)+1, y_steady) + plt.xlabel('t') + plt.ylabel('y') + + plt.show() + +Adding a random term +===================== + +To generate some excitement, we'll follow in the spirit of the great economists +Eugen Slutsky and Ragnar Frisch and replace our original second-order difference +equation with the following **second-order stochastic linear difference +equation**: + +.. 
math:: + :label: tswm_2 + + + y_{t} = \alpha_{0} + \alpha_{1} y_{t-1} + \alpha_{2} y_{t-2} + u_t + + +where :math:`u_{t} \sim N\left(0, \sigma_{u}^{2}\right)` and is IID, +meaning **independent** and **identically** distributed. + +We’ll stack these :math:`T` equations into a system cast in terms of +matrix algebra. + +Let’s define the random vector + +.. math:: + + + u=\left[\begin{array}{c} + u_{1}\\ + u_{2}\\ + \vdots\\ + u_{T} + \end{array}\right] + +Where :math:`A, b, y` are defined as above, now assume that :math:`y` is +governed by the system + +.. math:: + + + A y = b + u + +The solution for :math:`y` becomes + +.. math:: + + + y = A^{-1} \left(b + u\right) + +Let’s try it out in Python. + +.. code-block:: python3 + + 𝜎u = 2. + +.. code-block:: python3 + + u = np.random.normal(0, 𝜎u, size=T) + y = A_inv @ (b + u) + +.. code-block:: python3 + + plt.plot(np.arange(T)+1, y) + plt.xlabel('t') + plt.ylabel('y') + + plt.show() + +The above time series looks a lot like (detrended) GDP series for a +number of advanced countries in recent decades. + +We can simulate :math:`N` paths. + +.. code-block:: python3 + + N = 100 + + for i in range(N): + u = np.random.normal(0, 𝜎u, size=T) + y = A_inv @ (b + u) + plt.plot(np.arange(T)+1, y, lw=0.5) + + plt.xlabel('t') + plt.ylabel('y') + + plt.show() + +Also consider the case when :math:`y_{0}` and :math:`y_{-1}` are at +steady state. + +.. code-block:: python3 + + N = 100 + + for i in range(N): + u = np.random.normal(0, 𝜎u, size=T) + y_steady = A_inv @ (b_steady + u) + plt.plot(np.arange(T)+1, y_steady, lw=0.5) + + plt.xlabel('t') + plt.ylabel('y') + + plt.show() + +A forward looking model +======================= + +Samuelson’s model is **backwards looking** in the sense that we give it **initial conditions** and let it +run. + +Let’s now turn to a model that is **forward looking**. + +We apply similar linear algebra machinery to study a **perfect +foresight** model widely used as a benchmark in macroeconomics and +finance. 
+ +As an example, we suppose that :math:`p_t` is the price of a stock and +that :math:`y_t` is its dividend. + +We assume that :math:`y_t` is determined by second-order difference +equation that we analyzed just above, so that + +.. math:: + + + y = A^{-1} \left(b + u\right) + +Our **perfect foresight** model of stock prices is + +.. math:: + + + p_{t} = \sum_{j=0}^{T-t} \beta^{j} y_{t+j}, \quad \beta \in (0,1) + +where :math:`\beta` is a discount factor. + +The model asserts that the price of the stock at :math:`t` equals the +discounted present values of the (perfectly foreseen) future dividends. + +Form + +.. math:: + + + \underset{\equiv p}{\underbrace{\left[\begin{array}{c} + p_{1}\\ + p_{2}\\ + p_{3}\\ + \vdots\\ + p_{T} + \end{array}\right]}}=\underset{\equiv B}{\underbrace{\left[\begin{array}{ccccc} + 1 & \beta & \beta^{2} & \cdots & \beta^{T-1}\\ + 0 & 1 & \beta & \cdots & \beta^{T-2}\\ + 0 & 0 & 1 & \cdots & \beta^{T-3}\\ + \vdots & \vdots & \vdots & \vdots & \vdots\\ + 0 & 0 & 0 & \cdots & 1 + \end{array}\right]}}\left[\begin{array}{c} + y_{1}\\ + y_{2}\\ + y_{3}\\ + \vdots\\ + y_{T} + \end{array}\right] + +.. code-block:: python3 + + 𝛽 = .96 + + +.. code-block:: python3 + + # construct B + B = np.zeros((T, T)) + + for i in range(T): + B[i, i:] = 𝛽 ** np.arange(0, T-i) + +.. code-block:: python3 + + B + +.. code-block:: python3 + + 𝜎u = 0. + u = np.random.normal(0, 𝜎u, size=T) + y = A_inv @ (b + u) + y_steady = A_inv @ (b_steady + u) + +.. code-block:: python3 + + p = B @ y + +.. code-block:: python3 + + plt.plot(np.arange(0, T)+1, y, label='y') + plt.plot(np.arange(0, T)+1, p, label='p') + plt.xlabel('t') + plt.ylabel('y/p') + plt.legend() + + plt.show() + +Can you explain why the trend of the price is downward over time? + +Also consider the case when :math:`y_{0}` and :math:`y_{-1}` are at the +steady state. + +.. 
code-block:: python3 + + p_steady = B @ y_steady + + plt.plot(np.arange(0, T)+1, y_steady, label='y') + plt.plot(np.arange(0, T)+1, p_steady, label='p') + plt.xlabel('t') + plt.ylabel('y/p') + plt.legend() + + plt.show() + diff --git a/source/rst/troubleshooting.rst b/source/rst/troubleshooting.rst index 4343151..c1ba42d 100644 --- a/source/rst/troubleshooting.rst +++ b/source/rst/troubleshooting.rst @@ -35,9 +35,9 @@ You also need to keep the external code libraries, such as `QuantEcon.py For this task you can either -* use `pip install --upgrade quantecon` on the command line, or +* use `conda install -y quantecon` on the command line, or -* execute `!pip install --upgrade quantecon` within a Jupyter notebook. +* execute `!pip install quantecon` within a Jupyter notebook. If your local environment is still not working you can do two things. diff --git a/source/rst/wald_friedman.rst b/source/rst/wald_friedman.rst index 54135fd..4c24f5b 100644 --- a/source/rst/wald_friedman.rst +++ b/source/rst/wald_friedman.rst @@ -22,7 +22,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. 
code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon !pip install interpolation @@ -40,23 +40,23 @@ In this lecture, we apply dynamic programming algorithms to Friedman and Wallis Key ideas in play will be: -- Bayes' Law +- Bayes' Law -- Dynamic programming +- Dynamic programming -- Type I and type II statistical errors +- Type I and type II statistical errors - - a type I error occurs when you reject a null hypothesis that is true + - a type I error occurs when you reject a null hypothesis that is true - - a type II error is when you accept a null hypothesis that is false + - a type II error is when you accept a null hypothesis that is false -- Abraham Wald's **sequential probability ratio test** +- Abraham Wald's **sequential probability ratio test** -- The **power** of a statistical test +- The **power** of a statistical test -- The **critical region** of a statistical test +- The **critical region** of a statistical test -- A **uniformly most powerful test** +- A **uniformly most powerful test** We'll begin with some imports: @@ -64,7 +64,8 @@ We'll begin with some imports: import numpy as np import matplotlib.pyplot as plt - from numba import jit, prange, jitclass, float64, int64 + from numba import jit, prange, float64, int64 + from numba.experimental import jitclass from interpolation import interp from math import gamma @@ -140,11 +141,11 @@ random variables is also independently and identically distributed (IID). But the observer does not know which of the two distributions generated the sequence. -For reasons explained `Exchangeability and Bayesian Updating `__, this means that the sequence is not +For reasons explained in `Exchangeability and Bayesian Updating `__, this means that the sequence is not IID and that the observer has something to learn, even though he knows both :math:`f_0` and :math:`f_1`. 
-After a number of draws, also to be determined, he makes a decision about -which of the distributions is generating the draws he observes. +The decision maker chooses a number of draws (i.e., random samples from the unknown distribution) and uses them to decide +which of the two distributions is generating outcomes. He starts with prior @@ -173,13 +174,13 @@ that :math:`z_{k+1}` has probability distribution .. math:: - f_{{\pi}_k} (v) = \pi_k f_0(v) + (1-\pi_k) f_1 (v) + f_{{\pi}_k} (v) = \pi_k f_0(v) + (1-\pi_k) f_1 (v) , -This is a mixture of distributions :math:`f_0` and :math:`f_1`, with the weight +which is a mixture of distributions :math:`f_0` and :math:`f_1`, with the weight on :math:`f_0` being the posterior probability that :math:`f = f_0` [#f1]_. -To help illustrate this kind of distribution, let's inspect some mixtures of beta distributions. +To illustrate such a distribution, let's inspect some mixtures of beta distributions. The density of a beta probability distribution with parameters :math:`a` and :math:`b` is @@ -233,24 +234,24 @@ After observing :math:`z_k, z_{k-1}, \ldots, z_0`, the decision-maker chooses among three distinct actions: -- He decides that :math:`f = f_0` and draws no more :math:`z`'s +- He decides that :math:`f = f_0` and draws no more :math:`z`'s -- He decides that :math:`f = f_1` and draws no more :math:`z`'s +- He decides that :math:`f = f_1` and draws no more :math:`z`'s -- He postpones deciding now and instead chooses to draw a - :math:`z_{k+1}` +- He postpones deciding now and instead chooses to draw a + :math:`z_{k+1}` Associated with these three actions, the decision-maker can suffer three kinds of losses: -- A loss :math:`L_0` if he decides :math:`f = f_0` when actually - :math:`f=f_1` +- A loss :math:`L_0` if he decides :math:`f = f_0` when actually + :math:`f=f_1` -- A loss :math:`L_1` if he decides :math:`f = f_1` when actually - :math:`f=f_0` +- A loss :math:`L_1` if he decides :math:`f = f_1` when actually + 
:math:`f=f_0` -- A cost :math:`c` if he postpones deciding and chooses instead to draw - another :math:`z` +- A cost :math:`c` if he postpones deciding and chooses instead to draw + another :math:`z` @@ -266,11 +267,11 @@ then :math:`L_1` and :math:`L_0` are losses associated with two types of statist So when we treat :math:`f=f_0` as the null hypothesis -- We can think of :math:`L_1` as the loss associated with a type I - error. +- We can think of :math:`L_1` as the loss associated with a type I + error. -- We can think of :math:`L_0` as the loss associated with a type II - error. +- We can think of :math:`L_0` as the loss associated with a type II + error. @@ -321,7 +322,7 @@ With some thought, you will agree that :math:`J` should satisfy the Bellman equa \right\} -where :math:`\pi'` is the random variable defined by +where :math:`\pi'` is the random variable defined by Bayes' Law .. math:: @@ -352,14 +353,14 @@ We can represent the Bellman equation as where :math:`\pi \in [0,1]` and -- :math:`(1-\pi) L_0` is the expected loss associated with accepting - :math:`f_0` (i.e., the cost of making a type II error). +- :math:`(1-\pi) L_0` is the expected loss associated with accepting + :math:`f_0` (i.e., the cost of making a type II error). -- :math:`\pi L_1` is the expected loss associated with accepting - :math:`f_1` (i.e., the cost of making a type I error). +- :math:`\pi L_1` is the expected loss associated with accepting + :math:`f_1` (i.e., the cost of making a type I error). -- :math:`h(\pi) := c + \mathbb E [J(\pi')]` the continuation value; i.e., - the expected cost associated with drawing one more :math:`z`. +- :math:`h(\pi) := c + \mathbb E [J(\pi')]` the continuation value; i.e., + the expected cost associated with drawing one more :math:`z`. @@ -414,12 +415,12 @@ The equality can be understood as a functional equation, where :math:`h` is the unknown. 
Using the functional equation, :eq:`funceq`, for the continuation value, we can back out -optimal choices using the RHS of :eq:`optdec`. +optimal choices using the right side of :eq:`optdec`. This functional equation can be solved by taking an initial guess and iterating -to find the fixed point. +to find a fixed point. -In other words, we iterate with an operator :math:`Q`, where +Thus, we iterate with an operator :math:`Q`, where .. math:: @@ -574,7 +575,7 @@ To solve the model, we will iterate using ``Q`` to find the fixed point Analysis ======== -Let's inspect the model's solutions. +Let's inspect outcomes. We will be using the default parameterization with distributions like so @@ -585,7 +586,7 @@ We will be using the default parameterization with distributions like so fig, ax = plt.subplots(figsize=(10, 6)) ax.plot(wf.f0(wf.π_grid), label="$f_0$") ax.plot(wf.f1(wf.π_grid), label="$f_1$") - ax.set(ylabel="probability of $z_k$", xlabel="$k$", title="Distributions") + ax.set(ylabel="probability of $z_k$", xlabel="$z_k$", title="Distributions") ax.legend() plt.show() @@ -789,9 +790,9 @@ We double the cost of drawing an additional observation. Before you look, think about what will happen: -- Will the decision-maker be correct more or less often? +- Will the decision-maker be correct more or less often? -- Will he make decisions sooner or later? +- Will he make decisions sooner or later? .. code-block:: python3 @@ -799,9 +800,9 @@ Before you look, think about what will happen: wf = WaldFriedman(c=2.5) simulation_plot(wf) -Increased cost per draw has induced the decision-maker to take less draws before deciding. +Increased cost per draw has induced the decision-maker to take fewer draws before deciding. -Because he decides with less, the percentage of time he is correct drops. +Because he decides with fewer draws, the percentage of time he is correct drops. This leads to him having a higher expected loss when he puts equal weight on both models. 
@@ -851,22 +852,22 @@ We'll rely on Abraham Wald's :cite:`Wald47` elegant summary of Neyman-Pearson th For our purposes, watch for there features of the setup: -- the assumption of a *fixed* sample size :math:`n` +- the assumption of a *fixed* sample size :math:`n` -- the application of laws of large numbers, conditioned on alternative - probability models, to interpret the probabilities :math:`\alpha` and - :math:`\beta` defined in the Neyman-Pearson theory +- the application of laws of large numbers, conditioned on alternative + probability models, to interpret the probabilities :math:`\alpha` and + :math:`\beta` defined in the Neyman-Pearson theory Recall that in the sequential analytic formulation above, that -- The sample size :math:`n` is not fixed but rather an object to be - chosen; technically :math:`n` is a random variable. +- The sample size :math:`n` is not fixed but rather an object to be + chosen; technically :math:`n` is a random variable. -- The parameters :math:`\beta` and :math:`\alpha` characterize cut-off - rules used to determine :math:`n` as a random variable. +- The parameters :math:`\beta` and :math:`\alpha` characterize cut-off + rules used to determine :math:`n` as a random variable. -- Laws of large numbers make no appearances in the sequential - construction. +- Laws of large numbers make no appearances in the sequential + construction. In chapter 1 of **Sequential Analysis** :cite:`Wald47` Abraham Wald summarizes the Neyman-Pearson approach to hypothesis testing. @@ -880,80 +881,80 @@ problem -- usually, *something* means *a lot*) By limiting what is unknown, Wald uses the following simple structure to illustrate the main ideas: -- A decision-maker wants to decide which of two distributions - :math:`f_0`, :math:`f_1` govern an IID random variable :math:`z`. +- A decision-maker wants to decide which of two distributions + :math:`f_0`, :math:`f_1` govern an IID random variable :math:`z`. 
-- The null hypothesis :math:`H_0` is the statement that :math:`f_0` - governs the data. +- The null hypothesis :math:`H_0` is the statement that :math:`f_0` + governs the data. -- The alternative hypothesis :math:`H_1` is the statement that - :math:`f_1` governs the data. +- The alternative hypothesis :math:`H_1` is the statement that + :math:`f_1` governs the data. -- The problem is to devise and analyze a test of hypothesis - :math:`H_0` against the alternative hypothesis :math:`H_1` on the - basis of a sample of a fixed number :math:`n` independent - observations :math:`z_1, z_2, \ldots, z_n` of the random variable - :math:`z`. +- The problem is to devise and analyze a test of hypothesis + :math:`H_0` against the alternative hypothesis :math:`H_1` on the + basis of a sample of a fixed number :math:`n` independent + observations :math:`z_1, z_2, \ldots, z_n` of the random variable + :math:`z`. To quote Abraham Wald, - A test procedure leading to the acceptance or rejection of the \[null\] - hypothesis in question is simply a rule specifying, for each possible - sample of size :math:`n`, whether the \[null\] hypothesis should be accepted - or rejected on the basis of the sample. This may also be expressed as - follows: A test procedure is simply a subdivision of the totality of - all possible samples of size :math:`n` into two mutually exclusive - parts, say part 1 and part 2, together with the application of the - rule that the \[null\] hypothesis be accepted if the observed sample is - contained in part 2. Part 1 is also called the critical region. Since - part 2 is the totality of all samples of size :math:`n` which are not - included in part 1, part 2 is uniquely determined by part 1. Thus, - choosing a test procedure is equivalent to determining a critical - region. 
+ A test procedure leading to the acceptance or rejection of the \[null\] + hypothesis in question is simply a rule specifying, for each possible + sample of size :math:`n`, whether the \[null\] hypothesis should be accepted + or rejected on the basis of the sample. This may also be expressed as + follows: A test procedure is simply a subdivision of the totality of + all possible samples of size :math:`n` into two mutually exclusive + parts, say part 1 and part 2, together with the application of the + rule that the \[null\] hypothesis be accepted if the observed sample is + contained in part 2. Part 1 is also called the critical region. Since + part 2 is the totality of all samples of size :math:`n` which are not + included in part 1, part 2 is uniquely determined by part 1. Thus, + choosing a test procedure is equivalent to determining a critical + region. Let's listen to Wald longer: - As a basis for choosing among critical regions the following - considerations have been advanced by Neyman and Pearson: In accepting - or rejecting :math:`H_0` we may commit errors of two kinds. We commit - an error of the first kind if we reject :math:`H_0` when it is true; - we commit an error of the second kind if we accept :math:`H_0` when - :math:`H_1` is true. After a particular critical region :math:`W` has - been chosen, the probability of committing an error of the first - kind, as well as the probability of committing an error of the second - kind is uniquely determined. The probability of committing an error - of the first kind is equal to the probability, determined by the - assumption that :math:`H_0` is true, that the observed sample will be - included in the critical region :math:`W`. The probability of - committing an error of the second kind is equal to the probability, - determined on the assumption that :math:`H_1` is true, that the - probability will fall outside the critical region :math:`W`. 
For any - given critical region :math:`W` we shall denote the probability of an - error of the first kind by :math:`\alpha` and the probability of an - error of the second kind by :math:`\beta`. + As a basis for choosing among critical regions the following + considerations have been advanced by Neyman and Pearson: In accepting + or rejecting :math:`H_0` we may commit errors of two kinds. We commit + an error of the first kind if we reject :math:`H_0` when it is true; + we commit an error of the second kind if we accept :math:`H_0` when + :math:`H_1` is true. After a particular critical region :math:`W` has + been chosen, the probability of committing an error of the first + kind, as well as the probability of committing an error of the second + kind is uniquely determined. The probability of committing an error + of the first kind is equal to the probability, determined by the + assumption that :math:`H_0` is true, that the observed sample will be + included in the critical region :math:`W`. The probability of + committing an error of the second kind is equal to the probability, + determined on the assumption that :math:`H_1` is true, that the + probability will fall outside the critical region :math:`W`. For any + given critical region :math:`W` we shall denote the probability of an + error of the first kind by :math:`\alpha` and the probability of an + error of the second kind by :math:`\beta`. Let's listen carefully to how Wald applies law of large numbers to interpret :math:`\alpha` and :math:`\beta`: - The probabilities :math:`\alpha` and :math:`\beta` have the - following important practical interpretation: Suppose that we draw a - large number of samples of size :math:`n`. Let :math:`M` be the - number of such samples drawn. Suppose that for each of these - :math:`M` samples we reject :math:`H_0` if the sample is included in - :math:`W` and accept :math:`H_0` if the sample lies outside - :math:`W`. 
In this way we make :math:`M` statements of rejection or - acceptance. Some of these statements will in general be wrong. If - :math:`H_0` is true and if :math:`M` is large, the probability is - nearly :math:`1` (i.e., it is practically certain) that the - proportion of wrong statements (i.e., the number of wrong statements - divided by :math:`M`) will be approximately :math:`\alpha`. If - :math:`H_1` is true, the probability is nearly :math:`1` that the - proportion of wrong statements will be approximately :math:`\beta`. - Thus, we can say that in the long run [ here Wald applies law of - large numbers by driving :math:`M \rightarrow \infty` (our comment, - not Wald's) ] the proportion of wrong statements will be - :math:`\alpha` if :math:`H_0`\ is true and :math:`\beta` if - :math:`H_1` is true. + The probabilities :math:`\alpha` and :math:`\beta` have the + following important practical interpretation: Suppose that we draw a + large number of samples of size :math:`n`. Let :math:`M` be the + number of such samples drawn. Suppose that for each of these + :math:`M` samples we reject :math:`H_0` if the sample is included in + :math:`W` and accept :math:`H_0` if the sample lies outside + :math:`W`. In this way we make :math:`M` statements of rejection or + acceptance. Some of these statements will in general be wrong. If + :math:`H_0` is true and if :math:`M` is large, the probability is + nearly :math:`1` (i.e., it is practically certain) that the + proportion of wrong statements (i.e., the number of wrong statements + divided by :math:`M`) will be approximately :math:`\alpha`. If + :math:`H_1` is true, the probability is nearly :math:`1` that the + proportion of wrong statements will be approximately :math:`\beta`. 
+ Thus, we can say that in the long run [ here Wald applies law of + large numbers by driving :math:`M \rightarrow \infty` (our comment, + not Wald's) ] the proportion of wrong statements will be + :math:`\alpha` if :math:`H_0`\ is true and :math:`\beta` if + :math:`H_1` is true. The quantity :math:`\alpha` is called the *size* of the critical region, and the quantity :math:`1-\beta` is called the *power* of the critical @@ -961,26 +962,26 @@ region. Wald notes that - one critical region :math:`W` is more desirable than another if it - has smaller values of :math:`\alpha` and :math:`\beta`. Although - either :math:`\alpha` or :math:`\beta` can be made arbitrarily small - by a proper choice of the critical region :math:`W`, it is possible - to make both :math:`\alpha` and :math:`\beta` arbitrarily small for a - fixed value of :math:`n`, i.e., a fixed sample size. + one critical region :math:`W` is more desirable than another if it + has smaller values of :math:`\alpha` and :math:`\beta`. Although + either :math:`\alpha` or :math:`\beta` can be made arbitrarily small + by a proper choice of the critical region :math:`W`, it is possible + to make both :math:`\alpha` and :math:`\beta` arbitrarily small for a + fixed value of :math:`n`, i.e., a fixed sample size. Wald summarizes Neyman and Pearson's setup as follows: - Neyman and Pearson show that a region consisting of all samples - :math:`(z_1, z_2, \ldots, z_n)` which satisfy the inequality + Neyman and Pearson show that a region consisting of all samples + :math:`(z_1, z_2, \ldots, z_n)` which satisfy the inequality - .. math:: + .. math:: - \frac{ f_1(z_1) \cdots f_1(z_n)}{f_0(z_1) \cdots f_0(z_n)} \geq k + \frac{ f_1(z_1) \cdots f_1(z_n)}{f_0(z_1) \cdots f_0(z_n)} \geq k - is a most powerful critical region for testing the hypothesis - :math:`H_0` against the alternative hypothesis :math:`H_1`. 
The term - :math:`k` on the right side is a constant chosen so that the region - will have the required size :math:`\alpha`. + is a most powerful critical region for testing the hypothesis + :math:`H_0` against the alternative hypothesis :math:`H_1`. The term + :math:`k` on the right side is a constant chosen so that the region + will have the required size :math:`\alpha`. Wald goes on to discuss Neyman and Pearson's concept of *uniformly most @@ -988,20 +989,20 @@ powerful* test. Here is how Wald introduces the notion of a sequential test - A rule is given for making one of the following three decisions at any stage of - the experiment (at the m th trial for each integral value of m ): (1) to - accept the hypothesis H , (2) to reject the hypothesis H , (3) to - continue the experiment by making an additional observation. Thus, such - a test procedure is carried out sequentially. On the basis of the first - observation, one of the aforementioned decision is made. If the first or - second decision is made, the process is terminated. If the third - decision is made, a second trial is performed. Again, on the basis of - the first two observations, one of the three decision is made. If the - third decision is made, a third trial is performed, and so on. The - process is continued until either the first or the second decisions is - made. The number n of observations required by such a test procedure is - a random variable, since the value of n depends on the outcome of the - observations. + A rule is given for making one of the following three decisions at any stage of + the experiment (at the m th trial for each integral value of m ): (1) to + accept the hypothesis H , (2) to reject the hypothesis H , (3) to + continue the experiment by making an additional observation. Thus, such + a test procedure is carried out sequentially. On the basis of the first + observation, one of the aforementioned decision is made. 
If the first or + second decision is made, the process is terminated. If the third + decision is made, a second trial is performed. Again, on the basis of + the first two observations, one of the three decision is made. If the + third decision is made, a third trial is performed, and so on. The + process is continued until either the first or the second decisions is + made. The number n of observations required by such a test procedure is + a random variable, since the value of n depends on the outcome of the + observations. .. rubric:: Footnotes diff --git a/source/rst/wealth_dynamics.rst b/source/rst/wealth_dynamics.rst index 31f1531..1b0d201 100644 --- a/source/rst/wealth_dynamics.rst +++ b/source/rst/wealth_dynamics.rst @@ -14,7 +14,7 @@ In addition to what's in Anaconda, this lecture will need the following librarie .. code-block:: ipython :class: hide-output - !pip install --upgrade quantecon + !pip install quantecon Overview @@ -68,7 +68,8 @@ We will use the following imports. %matplotlib inline import quantecon as qe - from numba import njit, jitclass, float64, prange + from numba import njit, float64, prange + from numba.experimental import jitclass Lorenz Curves and the Gini Coefficient @@ -553,7 +554,7 @@ For the values of the tail index, use ``a_vals = np.linspace(1, 10, 25)``. Use sample of size 1,000 for each :math:`a` and the sampling method for generating Pareto draws employed in the discussion of Lorenz curves for the Pareto distribution. -To the extend that you can, interpret the monotone relationship between the +To the extent that you can, interpret the monotone relationship between the Gini index and :math:`a`. @@ -576,7 +577,7 @@ At the same time, given the similarities, perhaps Pareto tails will arise. To test this, run a simulation that generates a cross-section of wealth and generate a rank-size plot. -If you like, you can use the function ``rank_size_plot`` from the ``quantecon`` library (documentation `here `__). 
+If you like, you can use the function ``rank_size`` from the ``quantecon`` library (documentation `here `__). In viewing the plot, remember that Pareto tails generate a straight line. Is this what you see? @@ -645,6 +646,9 @@ Now let's see the rank-size plot: fig, ax = plt.subplots() - qe.rank_size_plot(ψ_star, ax, c=0.001) - + rank_data, size_data = qe.rank_size(ψ_star, c=0.001) + ax.loglog(rank_data, size_data, 'o', markersize=3.0, alpha=0.5) + ax.set_xlabel("log rank") + ax.set_ylabel("log size") + plt.show() diff --git a/theme/minimal/static/sloan_logo.png b/theme/minimal/static/sloan_logo.png index d17735c..a938a58 100644 Binary files a/theme/minimal/static/sloan_logo.png and b/theme/minimal/static/sloan_logo.png differ