Chapter 13 Solutions
Reinforcement Learning: An Introduction
1 Exercise 13.1
Consider the probability of right as p and left as 1-p. The states are labeled 1,2,3 from left to right:
\[\begin{align*} v_1 &= p(-1 + v_2) + (1-p)(-1 + v_1)\\ v_2 &= p(-1 + v_1) + (1-p)(-1 + v_3)\\ v_3 &= p(-1 + 0) + (1-p)(-1 + v_2) \\ \Rightarrow pv_1 &= pv_2 - 1 \\ v_2 &= pv_1 + (1 - p) v_3 - 1\\ v_3 &= (1-p)v_2 - 1 \\ \Rightarrow p(2-p)v_1 + (2-p) &= pv_1 + p - 2\\ \Rightarrow (p - p^2)v_1 &= 2(2-p) \\ \Rightarrow v_1 = \frac{2(2-p)}{p - p^2} \end{align*}\]Therefore,
\[\begin{align*} p^* = \operatorname*{argmax}_p (\frac{2(2-p)}{p - p^2}) \\ \frac{d}{dp}(\frac{2(2-p)}{p - p^2}) = 0 \\ \Rightarrow \frac{-2p^2+8p-4}{(p-p^2)^2} = 0 \\ \Rightarrow p^* = \frac{4 \pm 2\sqrt{2}}{2} = 2 \pm \sqrt{2} \\ \xrightarrow{p^* <= 1} p^* = 2 - \sqrt{2} \approx 0.59 \end{align*}\]2 Exercise 13.2
\[ \newcommand{\thetab}{\boldsymbol{\theta}} \newcommand{\E}{\mathbb{E}} \]
\[ \begin{align*} \eta(s) = h(s) + \gamma \sum_{\overline{s}} \eta(\overline{s})\sum_a \pi(a|\overline{s})p(s|\overline{s}, a)\\ \mu(s) = \frac{\eta(s)}{\sum_{s'}\eta(s')}\\ \end{align*} \]
\[ \begin{align*} \nabla v_\pi(s) &= \nabla [\sum_a \pi(a|s)q_\pi(s,a)] \\ &= \sum_a [\nabla \pi(a|s)q_\pi(s,a) + \pi(a|s) \nabla q_\pi(s, a)] \\ &= \sum_a [\nabla \pi(a|s)q_\pi(s,a) + \pi(a|s) \nabla \sum_{s',r} p(s',r|s,a)(r + \gamma v_\pi(s'))] \\ &= \sum_a [\nabla \pi(a|s)q_\pi(s,a) + \pi(a|s) \sum_{s',r} p(s',r|s,a)\gamma \nabla v_\pi(s'))]\\ &= \sum_{x \in s} \sum_{k=0}^\infty Pr(s \rightarrow x, k, \pi) \gamma^k \sum_a \nabla \pi(a|x) q_\pi(x,a) \end{align*} \]
\[ \begin{align*} \nabla J(\thetab) &= \nabla v_\pi(s_0) \\ &= \sum_s \left( \sum_{k=0}^\infty Pr(s \rightarrow x, k, \pi) \gamma^k \right) \sum_a \nabla \pi(a|s) q_\pi(s,a)\\ &= \sum_s \eta(s) \sum_a \nabla \pi(a|s) q_\pi(s,a)\\ &= \sum_{s'} \eta(s') \sum_s \frac{\eta(s)}{\sum_{s'} \eta(s')} \sum_a \nabla \pi(a|s) q_\pi(s,a)\\ &\propto \sum_{s} \mu(s) \sum_a \nabla \pi(a|s) q_\pi(s,a)\\ &= \E_\pi\left[\gamma^t \sum_a q_\pi(S_t, a) \nabla \pi(a|S_t, \thetab)\right] \quad \text{Expectation under policy $\pi$ with termination $\gamma$}\\ &= \E_\pi\left[\gamma^t \sum_a \pi(a|S_t, \thetab) q_\pi(S_t, a) \frac{\nabla \pi(a|S_t, \thetab)}{\pi(a|S_t, \thetab)}\right]\\ &= \E_\pi\left[\gamma^t q_\pi(S_t, A_t)\frac{\nabla \pi(A_t|S_t, \thetab)}{\pi(A_t|S_t, \thetab)}\right] \\ &= \E_\pi\left[\gamma^t G_t \frac{\nabla \pi(A_t|S_t, \thetab)}{\pi(A_t|S_t, \thetab)}\right] \\ &\Rightarrow \thetab_{t+1} = \thetab_t + \alpha \gamma^t G_t \frac{\nabla \pi(A_t|S_t, \thetab_t)}{\pi(A_t|S_t, \thetab_t)} \end{align*} \]
3 Exercise 13.3
\[ \newcommand{\x}{\textbf{x}} \]
Considering \(\pi(a|s, \thetab) = \frac{e^{h(s,a,\thetab)}}{\sum_b e^{h(s, b, \thetab)}}\) and \(h(s,a,\thetab) = \thetab^T \x(s,a)\) or \(h(s,b,\thetab) = \thetab^T \x(s,b)\):
\[ \begin{align*} \nabla \ln \pi(a|s, \thetab) &= \nabla (h(s,a,\thetab) - \ln \sum_b e^{h(s,b,\thetab)}) \\ &= \nabla h(s,a,\thetab) - \nabla \ln \sum_b e^{h(s,b,\thetab)}\\ &= \nabla (\thetab^T \x(s,a)) - \nabla \ln \sum_b e^{h(s,b,\thetab)} \\ &= \x(s,a) - \frac{\sum_b\x(s,b)e^{h(s,b,\thetab)}}{\sum_b e^{h(s,b,\thetab)}} \\ &= \x(s,a) - \frac{\sum_b\x(s,b)e^{h(s,b,\thetab)}}{\sum_b e^{h(s,b,\thetab)}} \\ &= \x(s,a) - \sum_b \pi(b|s,\thetab) \x(s,b) \end{align*} \]
4 Exercise 13.4
Note: \(\nabla_{\theta_\mu} \mu(s,\thetab) = \x_\mu(s)\) and \(\nabla_{\theta_\sigma} \sigma(s,\thetab) = \x_\sigma(s)\).
\[ \begin{align*} \ln \pi(a|s, \thetab) &= -\frac{(a-\mu(s,\thetab))^2}{2\sigma^2(s,\thetab)} -\ln \sigma(s,\thetab) - \ln \sqrt{2\pi} \\ \nabla_{\thetab_\mu} \ln \pi(a|s, \thetab_\mu) &= \nabla_{\thetab_\mu} (-\frac{(a-\mu(s,\thetab))^2}{2\sigma^2(s,\thetab)} -\ln \sigma(s,\thetab) - \ln \sqrt{2\pi})\\ &= \frac{2(a - \mu(s, \thetab))\x_\mu(s)}{2 \sigma^2 (s, \thetab)} = \frac{(a - \mu(s, \thetab))\x_\mu(s)}{\sigma^2 (s, \thetab)} \end{align*} \]
\[ \begin{align*} \nabla_{\thetab_\mu} \ln \pi(a|s, \thetab_\sigma) &= \nabla_{\thetab_\sigma} (-\frac{(a-\mu(s,\thetab))^2}{2\sigma^2(s,\thetab)} -\ln \sigma(s,\thetab) - \ln \sqrt{2\pi}) \\ &= \frac{(a-\mu(s,\thetab))^2 \x_\sigma(s)\sigma(s,\thetab)}{\sigma^3(s,\thetab)} - \x_\sigma(s) \\ &= \frac{(a-\mu(s,\thetab))^2 \x_\sigma(s)}{\sigma^2(s,\thetab)} - \x_\sigma(s) \\ &=(\frac{(a-\mu(s,\thetab))^2}{\sigma^2(s,\thetab)} - 1) \x_\sigma(s) \end{align*} \]
5 Exercise 13.5
Note that: \(h(s,1,\thetab) = \thetab^T\x(s) + h(s,0,\thetab)\).
\[ \begin{align*} P_t &= \pi(1|S_t, \thetab_t) = \frac{e^{h(S_t,1,\thetab)}}{e^{h(S_t,1,\theta)} + e^{h(s,0,\theta)}} \\ &= \frac{e^{ \thetab^T\x(S_t) + h(S_t,0,\thetab)}}{e^{\thetab^T\x(S_t) + h(S_t,0,\thetab)} + e^{h(S_t,0,\theta)}}\\ &= \frac{1}{1 + \frac{e^{h(S_t,0,\theta)}}{e^{\thetab^T\x(S_t) + h(S_t,0,\thetab)}}}\\ &= \frac{1}{1 + e^{-\thetab^T\x(S_t)}} \end{align*} \]
\[ \begin{align*} \thetab_{t+1} = \thetab_t + \alpha \gamma^t G_t \nabla \ln \pi(a|S_t, \thetab_t) \end{align*} \]
\[ \begin{align*} P = \pi(1|s, \thetab) &= \frac{1}{1 + e^{-\thetab^T\x(s)}} \\ \Rightarrow \nabla P &= \x(s)P(1 - P) \\ \nabla ln(P) &= \frac{\nabla P}{P} = \x(s)(1-P)\\ \nabla ln(1-P) &= \frac{-\nabla P}{1-P} = \x(s)P\\ \Rightarrow \nabla ln(\pi(a|s,\thetab)) &= \frac{(a - P)\nabla P}{P(1-P)}\\ &= \frac{(a - P)\x(s)P(1-P)}{P(1-P)}\\ &= (a-P)\x(s) = (a-\pi(1|s,\thetab))\x(s) \end{align*} \]