from copy import deepcopy
import pandas as pd
from scipy.optimize import linprog
from sympy import eye, nan, Matrix, Rational
states = pd.Index(["0_hungry", "1_full"], name="state")
actions = pd.Index(["0_ignore", "1_feed"], name="action")
rewards = pd.Index([-3, -2, 1, 2], name="reward")
state_actions = pd.MultiIndex.from_product([states, actions])
state_action_reward_state1s = pd.MultiIndex.from_product(
[states, actions, rewards, states],
names=("state", "action", "reward", "next_state"))
state_count = len(states)
action_count = len(actions)
state_action_count = len(state_actions)
Basic formulation of environment
s0_probs = pd.Series(Rational(1, 2), index=states)
print("initial state distribution:")
display(s0_probs.to_frame("prob").reset_index())
s_a_r_s1_probs = pd.Series(0, index=state_action_reward_state1s, dtype=object)
s_a_r_s1_probs.loc[("0_hungry", "0_ignore", -2, "0_hungry")] = 1
s_a_r_s1_probs.loc[("0_hungry", "1_feed", -3, "0_hungry")] = Rational(1, 3)
s_a_r_s1_probs.loc[("0_hungry", "1_feed", 1, "1_full")] = Rational(2, 3)
s_a_r_s1_probs.loc[("1_full", "0_ignore", -2, "0_hungry")] = Rational(3, 4)
s_a_r_s1_probs.loc[("1_full", "0_ignore", 2, "1_full")] = Rational(1, 4)
s_a_r_s1_probs.loc[("1_full", "1_feed", 1, "1_full")] = 1
print("transition probability from a state-action pair to reward and next state:")
display(s_a_r_s1_probs.to_frame("prob").reset_index())
initial state distribution:
| | state | prob |
|---|---|---|
| 0 | 0_hungry | 1/2 |
| 1 | 1_full | 1/2 |
transition probability from a state-action pair to reward and next state:
| | state | action | reward | next_state | prob |
|---|---|---|---|---|---|
| 0 | 0_hungry | 0_ignore | -3 | 0_hungry | 0 |
| 1 | 0_hungry | 0_ignore | -3 | 1_full | 0 |
| 2 | 0_hungry | 0_ignore | -2 | 0_hungry | 1 |
| 3 | 0_hungry | 0_ignore | -2 | 1_full | 0 |
| 4 | 0_hungry | 0_ignore | 1 | 0_hungry | 0 |
| 5 | 0_hungry | 0_ignore | 1 | 1_full | 0 |
| 6 | 0_hungry | 0_ignore | 2 | 0_hungry | 0 |
| 7 | 0_hungry | 0_ignore | 2 | 1_full | 0 |
| 8 | 0_hungry | 1_feed | -3 | 0_hungry | 1/3 |
| 9 | 0_hungry | 1_feed | -3 | 1_full | 0 |
| 10 | 0_hungry | 1_feed | -2 | 0_hungry | 0 |
| 11 | 0_hungry | 1_feed | -2 | 1_full | 0 |
| 12 | 0_hungry | 1_feed | 1 | 0_hungry | 0 |
| 13 | 0_hungry | 1_feed | 1 | 1_full | 2/3 |
| 14 | 0_hungry | 1_feed | 2 | 0_hungry | 0 |
| 15 | 0_hungry | 1_feed | 2 | 1_full | 0 |
| 16 | 1_full | 0_ignore | -3 | 0_hungry | 0 |
| 17 | 1_full | 0_ignore | -3 | 1_full | 0 |
| 18 | 1_full | 0_ignore | -2 | 0_hungry | 3/4 |
| 19 | 1_full | 0_ignore | -2 | 1_full | 0 |
| 20 | 1_full | 0_ignore | 1 | 0_hungry | 0 |
| 21 | 1_full | 0_ignore | 1 | 1_full | 0 |
| 22 | 1_full | 0_ignore | 2 | 0_hungry | 0 |
| 23 | 1_full | 0_ignore | 2 | 1_full | 1/4 |
| 24 | 1_full | 1_feed | -3 | 0_hungry | 0 |
| 25 | 1_full | 1_feed | -3 | 1_full | 0 |
| 26 | 1_full | 1_feed | -2 | 0_hungry | 0 |
| 27 | 1_full | 1_feed | -2 | 1_full | 0 |
| 28 | 1_full | 1_feed | 1 | 0_hungry | 0 |
| 29 | 1_full | 1_feed | 1 | 1_full | 1 |
| 30 | 1_full | 1_feed | 2 | 0_hungry | 0 |
| 31 | 1_full | 1_feed | 2 | 1_full | 0 |
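A quick sanity check, added here: the dynamics must form a probability distribution over (reward, next_state) for every state-action pair.
# every p(s', r | s, a) must sum to 1 over rewards and next states
assert (s_a_r_s1_probs.groupby(["state", "action"]).sum() == 1).all()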
Derived from basic formulation of environment
s_a_s1_probs = s_a_r_s1_probs.groupby(["state", "action", "next_state"]).sum()
print("transition probability from a state-action pair to next state:")
display(s_a_s1_probs.to_frame("prob").reset_index())
s_a_s1_rewards = s_a_r_s1_probs.unstack(level="reward")[rewards].multiply(
rewards, axis=1).sum(axis=1, numeric_only=False)
s_a_rewards = s_a_s1_rewards.groupby(["state", "action"]).sum()
print("expected reward given a state-action pair:")
display(s_a_rewards.to_frame("reward").reset_index())
s_a_s1_rewards = s_a_s1_rewards / s_a_s1_probs.where(s_a_s1_probs > 0, nan)
print("expected reward from a state-action pair to next state:")
display(s_a_s1_rewards.to_frame("reward").reset_index())
transition probability from a state-action pair to next state:
| | state | action | next_state | prob |
|---|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 0_hungry | 1 |
| 1 | 0_hungry | 0_ignore | 1_full | 0 |
| 2 | 0_hungry | 1_feed | 0_hungry | 1/3 |
| 3 | 0_hungry | 1_feed | 1_full | 2/3 |
| 4 | 1_full | 0_ignore | 0_hungry | 3/4 |
| 5 | 1_full | 0_ignore | 1_full | 1/4 |
| 6 | 1_full | 1_feed | 0_hungry | 0 |
| 7 | 1_full | 1_feed | 1_full | 1 |
expected reward given a state-action pair:
| | state | action | reward |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | -2 |
| 1 | 0_hungry | 1_feed | -1/3 |
| 2 | 1_full | 0_ignore | -1 |
| 3 | 1_full | 1_feed | 1 |
expected reward from a state-action pair to next state:
| | state | action | next_state | reward |
|---|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 0_hungry | -2.0 |
| 1 | 0_hungry | 0_ignore | 1_full | nan |
| 2 | 0_hungry | 1_feed | 0_hungry | -3 |
| 3 | 0_hungry | 1_feed | 1_full | 1 |
| 4 | 1_full | 0_ignore | 0_hungry | -2 |
| 5 | 1_full | 0_ignore | 1_full | 2 |
| 6 | 1_full | 1_feed | 0_hungry | nan |
| 7 | 1_full | 1_feed | 1_full | 1.0 |
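The two derived tables are mutually consistent; as a check added here, weighting r(s, a, s') by p(s' | s, a) and summing over next states recovers r(s, a). The nan entries mark impossible transitions and are zeroed out before the sum.
# reconstruct r(s, a) = sum over s' of p(s'|s, a) * r(s, a, s')
reconstructed = (s_a_s1_probs * s_a_s1_rewards.fillna(0)).groupby(
    ["state", "action"]).sum()
assert (reconstructed == s_a_rewards).all()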
Basic formulation of policy
s_a_probs = pd.Series(0, index=state_actions, dtype=object)
s_a_probs.loc["0_hungry", "0_ignore"] = Rational(1, 4)
s_a_probs.loc["0_hungry", "1_feed"] = Rational(3, 4)
s_a_probs.loc["1_full", "0_ignore"] = Rational(5, 6)
s_a_probs.loc["1_full", "1_feed"] = Rational(1, 6)
print("transition probability from a state to an action:")
display(s_a_probs.to_frame("prob").reset_index())
transition probability from a state to an action:
| | state | action | prob |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 1/4 |
| 1 | 0_hungry | 1_feed | 3/4 |
| 2 | 1_full | 0_ignore | 5/6 |
| 3 | 1_full | 1_feed | 1/6 |
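Another check added here: the policy must be a probability distribution over actions in every state.
assert (s_a_probs.groupby("state").sum() == 1).all()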
Derived from basic formulation of both environment and policy
s0_a0_probs = s0_probs * s_a_probs
print("initial state-action distribution:")
display(s0_a0_probs.to_frame("prob").reset_index())
s_s1_probs = (s_a_s1_probs * s_a_probs).groupby(["state", "next_state"]).sum()
print("transition probability from a state to next state:")
display(s_s1_probs.to_frame("prob").reset_index())
s1_a1_probs = deepcopy(s_a_probs)
s1_a1_probs.index.names = ["next_state", "next_action"]
s_a_s1_a1_probs = s_a_s1_probs * s1_a1_probs
print("transition probability from a state-action pair to next state-action pair:")
display(s_a_s1_a1_probs.to_frame("prob").reset_index()[["state", "action", "next_state", "next_action", "prob"]])
s_rewards = (s_a_rewards * s_a_probs).groupby("state").sum()
print("expected reward given a state:")
display(s_rewards.to_frame("reward").reset_index())
initial state-action distribution:
| | state | action | prob |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 1/8 |
| 1 | 0_hungry | 1_feed | 3/8 |
| 2 | 1_full | 0_ignore | 5/12 |
| 3 | 1_full | 1_feed | 1/12 |
transition probability from a state to next state:
| | state | next_state | prob |
|---|---|---|---|
| 0 | 0_hungry | 0_hungry | 1/2 |
| 1 | 0_hungry | 1_full | 1/2 |
| 2 | 1_full | 0_hungry | 5/8 |
| 3 | 1_full | 1_full | 3/8 |
transition probability from a state-action pair to next state-action pair:
| | state | action | next_state | next_action | prob |
|---|---|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 0_hungry | 0_ignore | 1/4 |
| 1 | 0_hungry | 0_ignore | 0_hungry | 1_feed | 3/4 |
| 2 | 0_hungry | 1_feed | 0_hungry | 0_ignore | 1/12 |
| 3 | 0_hungry | 1_feed | 0_hungry | 1_feed | 1/4 |
| 4 | 1_full | 0_ignore | 0_hungry | 0_ignore | 3/16 |
| 5 | 1_full | 0_ignore | 0_hungry | 1_feed | 9/16 |
| 6 | 1_full | 1_feed | 0_hungry | 0_ignore | 0 |
| 7 | 1_full | 1_feed | 0_hungry | 1_feed | 0 |
| 8 | 0_hungry | 0_ignore | 1_full | 0_ignore | 0 |
| 9 | 0_hungry | 0_ignore | 1_full | 1_feed | 0 |
| 10 | 0_hungry | 1_feed | 1_full | 0_ignore | 5/9 |
| 11 | 0_hungry | 1_feed | 1_full | 1_feed | 1/9 |
| 12 | 1_full | 0_ignore | 1_full | 0_ignore | 5/24 |
| 13 | 1_full | 0_ignore | 1_full | 1_feed | 1/24 |
| 14 | 1_full | 1_feed | 1_full | 0_ignore | 5/6 |
| 15 | 1_full | 1_feed | 1_full | 1_feed | 1/6 |
expected reward given a state:
| | state | reward |
|---|---|---|
| 0 | 0_hungry | -3/4 |
| 1 | 1_full | -2/3 |
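Two consistency checks, added here: the derived state-to-state and state-action-to-state-action transition kernels must both be row-stochastic.
assert (s_s1_probs.groupby("state").sum() == 1).all()
assert (s_a_s1_a1_probs.groupby(["state", "action"]).sum() == 1).all()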
gamma = Rational(4, 5)
print("discount factor:", gamma)
discount factor: 4/5
Calculate values
s_reward_vector = Matrix(s_rewards)
s_s1_prob_matrix = Matrix(s_s1_probs.unstack(level="next_state").values)
s_s1_dinv_matrix = (eye(state_count) - gamma * s_s1_prob_matrix).inv()
s_value_vector = s_s1_dinv_matrix * s_reward_vector
s_values = pd.Series({state: value for state, value in zip(states, s_value_vector)}, index=states)
print("state values:")
display(s_values.to_frame("value").reset_index())
s1_values = deepcopy(s_values)
s1_values.index.name = "next_state"
s_a_values = s_a_rewards + gamma * (s_a_s1_probs * s1_values).groupby(["state", "action"]).sum()
print("action values:")
display(s_a_values.to_frame("value").reset_index())
state values:
| | state | value |
|---|---|---|
| 0 | 0_hungry | -475/132 |
| 1 | 1_full | -155/44 |
action values:
| | state | action | value |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | -161/33 |
| 1 | 0_hungry | 1_feed | -314/99 |
| 2 | 1_full | 0_ignore | -85/22 |
| 3 | 1_full | 1_feed | -20/11 |
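The state and action values are tied by the identity v(s) = sum over a of pi(a|s) q(s, a); checking it here (my addition) catches indexing mistakes early.
assert ((s_a_values * s_a_probs).groupby("state").sum() == s_values).all()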
s_a_reward_vector = Matrix(s_a_rewards)
s_a_s1_a1_prob_matrix = Matrix(s_a_s1_a1_probs.unstack(level=["next_state", "next_action"]).values)
s_a_dinv_matrix = (eye(state_action_count) - gamma * s_a_s1_a1_prob_matrix).inv()
s_a_value_vector = s_a_dinv_matrix * s_a_reward_vector
s_a_values = pd.Series({state_action: value for state_action, value
in zip(state_actions, s_a_value_vector)}, index=state_actions)
print("action values:")
display(s_a_values.to_frame("value").reset_index())
s_values = (s_a_values * s_a_probs).groupby("state").sum()
print("state values:")
display(s_values.to_frame("value").reset_index())
action values:
| | state | action | value |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | -161/33 |
| 1 | 0_hungry | 1_feed | -314/99 |
| 2 | 1_full | 0_ignore | -85/22 |
| 3 | 1_full | 1_feed | -20/11 |
state values:
| | state | value |
|---|---|---|
| 0 | 0_hungry | -475/132 |
| 1 | 1_full | -155/44 |
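Both routes describe the same policy, so they must agree with the earlier direct solution; a quick cross-check, added here:
assert all(v_q == v_s for v_q, v_s in zip(s_values, s_value_vector))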
Expected return
g = (s0_probs * s_values).sum()
print("expected return at t=0:", g)
expected return at t=0: -235/66
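The same return also follows from the initial state-action distribution and the action values, since g equals the sum of p0(s) pi(a|s) q(s, a) over all state-action pairs; a one-line check added here:
assert (s0_a0_probs * s_a_values).sum() == g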
Improve policy
improvable = ((s_a_probs > 0.) & s_a_values.lt(s_a_values.groupby("state").max())).any()
print("policy can be improved:", improvable)
improved_s_actions = s_a_values.unstack(level="action").astype(float).idxmax(axis=1)
print("improved policy:")
display(improved_s_actions.to_frame("action").reset_index())
policy can be improved: True
improved policy:
| | state | action |
|---|---|---|
| 0 | 0_hungry | 1_feed |
| 1 | 1_full | 1_feed |
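To confirm the improvement, the sketch below (not in the original) evaluates the greedy policy with the same matrix method used above. The policy improvement theorem guarantees the new values are no worse; in this example the single greedy step already reaches the optimal policy derived below, so the values come out to 35/11 and 5.
improved_s_a_probs = pd.Series(0, index=state_actions, dtype=object)
for state, action in improved_s_actions.items():
    improved_s_a_probs.loc[(state, action)] = 1  # deterministic greedy policy
improved_s_s1_probs = (s_a_s1_probs * improved_s_a_probs).groupby(
    ["state", "next_state"]).sum()
improved_s_rewards = (s_a_rewards * improved_s_a_probs).groupby("state").sum()
improved_value_vector = (eye(state_count) - gamma * Matrix(
    improved_s_s1_probs.unstack(level="next_state").values)).inv() * Matrix(improved_s_rewards)
improved_s_values = pd.Series({state: value for state, value in zip(
    states, improved_value_vector)}, index=states)
assert (improved_s_values > s_values).all()  # strictly better in every state here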
Calculate visitation frequencies
s0_prob_vector = Matrix(s0_probs)
s1_s_prob_matrix = Matrix(s_s1_probs.unstack(level="next_state").values).T
s1_s_dinv_matrix = (eye(state_count) - gamma * s1_s_prob_matrix).inv()
s_freq_vector = s1_s_dinv_matrix * s0_prob_vector
s_freqs = pd.Series({state: freq for state, freq in zip(states, s_freq_vector)}, index=states)
print("state visitation frequencies:")
display(s_freqs.to_frame("freq").reset_index())
s_a_freqs = s_a_probs * s_freqs
print("state-action visitation frequencies:")
display(s_a_freqs.to_frame("freq").reset_index())
state visitation frequencies:
| | state | freq |
|---|---|---|
| 0 | 0_hungry | 30/11 |
| 1 | 1_full | 25/11 |
state-action visitation frequencies:
| | state | action | freq |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 15/22 |
| 1 | 0_hungry | 1_feed | 45/22 |
| 2 | 1_full | 0_ignore | 125/66 |
| 3 | 1_full | 1_feed | 25/66 |
s0_a0_prob_vector = Matrix(s0_a0_probs)
s1_a1_s_a_prob_matrix = Matrix(s_a_s1_a1_probs.unstack(level=["next_state", "next_action"]).values).T
s1_a1_s_a_dinv_matrix = (eye(state_action_count) - gamma * s1_a1_s_a_prob_matrix).inv()
s_a_freq_vector = s1_a1_s_a_dinv_matrix * s0_a0_prob_vector
s_a_freqs = pd.Series({state_action: freq for state_action, freq in zip(state_actions, s_a_freq_vector)}, index=state_actions)
print("state-action visitation frequencies:")
display(s_a_freqs.to_frame("freq").reset_index())
s_freqs = s_a_freqs.groupby("state").sum()
print("state visitation frequencies:")
display(s_freqs.to_frame("freq").reset_index())
state-action visitation frequencies:
| | state | action | freq |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 15/22 |
| 1 | 0_hungry | 1_feed | 45/22 |
| 2 | 1_full | 0_ignore | 125/66 |
| 3 | 1_full | 1_feed | 25/66 |
state visitation frequencies:
| | state | freq |
|---|---|---|
| 0 | 0_hungry | 30/11 |
| 1 | 1_full | 25/11 |
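The discounted visitation frequencies are not probabilities: each sums to the discounted horizon 1 / (1 - gamma) = 5, which gives a quick check (added here) on both computations.
assert s_freqs.sum() == 1 / (1 - gamma)
assert s_a_freqs.sum() == 1 / (1 - gamma)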
Expected return
g = (s_a_freqs * s_a_rewards).sum()
print("expected return at t=0:", g)
expected return at t=0: -235/66
Optimal values
s_a_s1_eye = deepcopy(s_a_s1_probs) * 0.
s_a_s1_eye.loc[s_a_s1_eye.index.get_level_values("state") == s_a_s1_eye.index.get_level_values("next_state")] = 1
A = (s_a_s1_eye - gamma * s_a_s1_probs).unstack(level="next_state")
b = s_a_rewards
res = linprog(c=s0_probs.astype(float), A_ub=-A.values.astype(float),
              b_ub=-b.values.astype(float),
              bounds=[(None, None)] * state_count)  # state values may be negative
s_optvalues = pd.Series([Rational(x).limit_denominator(1000) for x in res.x], index=states)
print("optimal state values:")
display(s_optvalues.to_frame("optvalue").reset_index())
s1_optvalues = deepcopy(s_optvalues)
s1_optvalues.index.name = "next_state"
s_a_optvalues = s_a_rewards + gamma * (s_a_s1_probs * s1_optvalues).groupby(["state", "action"]).sum()
print("optimal action values:")
display(s_a_optvalues.to_frame("optvalue").reset_index())
optimal state values:
| | state | optvalue |
|---|---|---|
| 0 | 0_hungry | 35/11 |
| 1 | 1_full | 5 |
optimal action values:
| | state | action | optvalue |
|---|---|---|---|
| 0 | 0_hungry | 0_ignore | 6/11 |
| 1 | 0_hungry | 1_feed | 35/11 |
| 2 | 1_full | 0_ignore | 21/11 |
| 3 | 1_full | 1_feed | 5 |
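Bellman optimality ties the two tables together: the optimal state value is the maximum of the optimal action values. The check below (added here) holds exactly because limit_denominator recovered the exact rationals.
assert (s_a_optvalues.groupby("state").max() == s_optvalues).all()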
Optimal policy
s_optactions = s_a_optvalues.unstack(level="action").astype(float).idxmax(axis=1)
print("optimal policy:")
display(s_optactions.to_frame("optaction").reset_index())
optimal policy:
| | state | optaction |
|---|---|---|
| 0 | 0_hungry | 1_feed |
| 1 | 1_full | 1_feed |
Optimal expected return
optg = (s0_probs * s_optvalues).sum()
print("optimal expected return at t=0:", optg)
optimal expected return at t=0: 45/11
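As a final cross-check, not in the original: the dual of the linear program above optimizes over discounted state-action occupancies rho(s, a), maximizing the sum of rho(s, a) r(s, a) subject to, for every next state s', sum_a rho(s', a) - gamma * sum over (s, a) of p(s'|s, a) rho(s, a) = p0(s'), with rho >= 0. Reusing the constraint matrix A built above, strong duality says both problems share the value 45/11 (up to solver precision).
res_dual = linprog(c=-s_a_rewards.astype(float), A_eq=A.values.T.astype(float),
                   b_eq=s0_probs.astype(float))  # rho >= 0 is linprog's default bound
optg_dual = Rational(-res_dual.fun).limit_denominator(1000)
assert optg_dual == optg  # matches the primal value 45/11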