Example: Feed and Full

In [1]:
from copy import deepcopy
import pandas as pd
from scipy.optimize import linprog
from sympy import eye, nan, Matrix, Rational

Spaces

In [2]:
states = pd.Index(["0_hungry", "1_full"], name="state")
actions = pd.Index(["0_ignore", "1_feed"], name="action")
rewards = pd.Index([-3, -2, 1, 2], name="reward")
state_actions = pd.MultiIndex.from_product([states, actions])
state_action_reward_state1s = pd.MultiIndex.from_product(
        [states, actions, rewards, states],
        names=("state", "action", "reward", "next_state"))
state_count = len(states)
action_count = len(actions)
state_action_count = len(state_actions)

Environment and Agent

Basic formulation of the environment
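
The environment is fully specified by the initial state distribution together with the dynamics, i.e. the joint probability of the next reward and next state given a state-action pair:

$$p(s', r \mid s, a) = \Pr[S_{t+1} = s', R_{t+1} = r \mid S_t = s, A_t = a]$$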

In [3]:
s0_probs = pd.Series(Rational(1, 2), index=states)
print("initial state distribution:")
display(s0_probs.to_frame("prob").reset_index())

s_a_r_s1_probs = pd.Series(0, index=state_action_reward_state1s, dtype=object)
s_a_r_s1_probs.loc[("0_hungry", "0_ignore", -2, "0_hungry")] = 1
s_a_r_s1_probs.loc[("0_hungry", "1_feed", -3, "0_hungry")] = Rational(1, 3)
s_a_r_s1_probs.loc[("0_hungry", "1_feed", 1, "1_full")] = Rational(2, 3)
s_a_r_s1_probs.loc[("1_full", "0_ignore", -2, "0_hungry")] = Rational(3, 4)
s_a_r_s1_probs.loc[("1_full", "0_ignore", 2, "1_full")] = Rational(1, 4)
s_a_r_s1_probs.loc[("1_full", "1_feed", 1, "1_full")] = 1
print("transition probability from a state-action pair to reward and next state:")
display(s_a_r_s1_probs.to_frame("prob").reset_index())
initial state distribution:
      state prob
0  0_hungry  1/2
1    1_full  1/2
transition probability from a state-action pair to reward and next state:
       state    action  reward  next_state  prob
 0  0_hungry  0_ignore      -3    0_hungry     0
 1  0_hungry  0_ignore      -3      1_full     0
 2  0_hungry  0_ignore      -2    0_hungry     1
 3  0_hungry  0_ignore      -2      1_full     0
 4  0_hungry  0_ignore       1    0_hungry     0
 5  0_hungry  0_ignore       1      1_full     0
 6  0_hungry  0_ignore       2    0_hungry     0
 7  0_hungry  0_ignore       2      1_full     0
 8  0_hungry    1_feed      -3    0_hungry   1/3
 9  0_hungry    1_feed      -3      1_full     0
10  0_hungry    1_feed      -2    0_hungry     0
11  0_hungry    1_feed      -2      1_full     0
12  0_hungry    1_feed       1    0_hungry     0
13  0_hungry    1_feed       1      1_full   2/3
14  0_hungry    1_feed       2    0_hungry     0
15  0_hungry    1_feed       2      1_full     0
16    1_full  0_ignore      -3    0_hungry     0
17    1_full  0_ignore      -3      1_full     0
18    1_full  0_ignore      -2    0_hungry   3/4
19    1_full  0_ignore      -2      1_full     0
20    1_full  0_ignore       1    0_hungry     0
21    1_full  0_ignore       1      1_full     0
22    1_full  0_ignore       2    0_hungry     0
23    1_full  0_ignore       2      1_full   1/4
24    1_full    1_feed      -3    0_hungry     0
25    1_full    1_feed      -3      1_full     0
26    1_full    1_feed      -2    0_hungry     0
27    1_full    1_feed      -2      1_full     0
28    1_full    1_feed       1    0_hungry     0
29    1_full    1_feed       1      1_full     1
30    1_full    1_feed       2    0_hungry     0
31    1_full    1_feed       2      1_full     0

Derived from the basic formulation of the environment
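
The next cell marginalizes the dynamics into three standard quantities: the state-transition probabilities, the expected reward of a state-action pair, and the expected reward of a state-action pair given the next state (undefined when the transition has probability zero, hence the nan guard in the code):

$$p(s' \mid s, a) = \sum_r p(s', r \mid s, a), \qquad r(s, a) = \sum_{s'} \sum_r r \, p(s', r \mid s, a), \qquad r(s, a, s') = \frac{\sum_r r \, p(s', r \mid s, a)}{p(s' \mid s, a)}$$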

In [4]:
s_a_s1_probs = s_a_r_s1_probs.groupby(["state", "action", "next_state"]).sum()
print("transition probability from a state-action pair to next state:")
display(s_a_s1_probs.to_frame("prob").reset_index())

s_a_s1_rewards = s_a_r_s1_probs.unstack(level="reward")[rewards].multiply(
        rewards, axis=1).sum(axis=1, numeric_only=False)
s_a_rewards = s_a_s1_rewards.groupby(["state", "action"]).sum()
print("expected reward given a state-action pair:")
display(s_a_rewards.to_frame("reward").reset_index())

s_a_s1_rewards = s_a_s1_rewards / s_a_s1_probs.where(s_a_s1_probs > 0, nan)
print("expected reward from a state-action pair to next state:")
display(s_a_s1_rewards.to_frame("reward").reset_index())
transition probability from a state-action pair to next state:
      state    action  next_state  prob
0  0_hungry  0_ignore    0_hungry     1
1  0_hungry  0_ignore      1_full     0
2  0_hungry    1_feed    0_hungry   1/3
3  0_hungry    1_feed      1_full   2/3
4    1_full  0_ignore    0_hungry   3/4
5    1_full  0_ignore      1_full   1/4
6    1_full    1_feed    0_hungry     0
7    1_full    1_feed      1_full     1
expected reward given a state-action pair:
      state    action  reward
0  0_hungry  0_ignore      -2
1  0_hungry    1_feed    -1/3
2    1_full  0_ignore      -1
3    1_full    1_feed       1
expected reward from a state-action pair to next state:
      state    action  next_state  reward
0  0_hungry  0_ignore    0_hungry    -2.0
1  0_hungry  0_ignore      1_full     nan
2  0_hungry    1_feed    0_hungry      -3
3  0_hungry    1_feed      1_full       1
4    1_full  0_ignore    0_hungry      -2
5    1_full  0_ignore      1_full       2
6    1_full    1_feed    0_hungry     nan
7    1_full    1_feed      1_full     1.0

Basic formulation of the policy
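
A stochastic policy is a conditional distribution over actions given the current state:

$$\pi(a \mid s) = \Pr[A_t = a \mid S_t = s]$$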

In [5]:
s_a_probs = pd.Series(0, index=state_actions, dtype=object)
s_a_probs.loc["0_hungry", "0_ignore"] = Rational(1, 4)
s_a_probs.loc["0_hungry", "1_feed"] = Rational(3, 4)
s_a_probs.loc["1_full", "0_ignore"] = Rational(5, 6)
s_a_probs.loc["1_full", "1_feed"] = Rational(1, 6)
print("transition probability from a state to an action:")
display(s_a_probs.to_frame("prob").reset_index())
transition probability from a state to an action:
      state    action  prob
0  0_hungry  0_ignore   1/4
1  0_hungry    1_feed   3/4
2    1_full  0_ignore   5/6
3    1_full    1_feed   1/6

Derived from the basic formulations of both environment and policy
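
Combining the dynamics with the policy gives the state-to-state transition probabilities, the state-action-to-state-action transition probabilities, and the expected reward per state:

$$p_\pi(s' \mid s) = \sum_a \pi(a \mid s) \, p(s' \mid s, a), \qquad p_\pi(s', a' \mid s, a) = p(s' \mid s, a) \, \pi(a' \mid s'), \qquad r_\pi(s) = \sum_a \pi(a \mid s) \, r(s, a)$$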

In [6]:
s0_a0_probs = s0_probs * s_a_probs
print("initial state-action distribution:")
display(s0_a0_probs.to_frame("prob").reset_index())

s_s1_probs = (s_a_s1_probs * s_a_probs).groupby(["state", "next_state"]).sum()
print("transition probability from a state to next state:")
display(s_s1_probs.to_frame("prob").reset_index())

s1_a1_probs = deepcopy(s_a_probs)
s1_a1_probs.index.names = ["next_state", "next_action"]
s_a_s1_a1_probs = s_a_s1_probs * s1_a1_probs
print("transition probability from a state-action pair to next state-action pair:")
display(s_a_s1_a1_probs.to_frame("prob").reset_index()[["state", "action", "next_state", "next_action", "prob"]])

s_rewards = (s_a_rewards * s_a_probs).groupby("state").sum()
print("expected reward given a state:")
display(s_rewards.to_frame("reward").reset_index())
initial state-action distribution:
      state    action  prob
0  0_hungry  0_ignore   1/8
1  0_hungry    1_feed   3/8
2    1_full  0_ignore  5/12
3    1_full    1_feed  1/12
transition probability from a state to next state:
      state  next_state  prob
0  0_hungry    0_hungry   1/2
1  0_hungry      1_full   1/2
2    1_full    0_hungry   5/8
3    1_full      1_full   3/8
transition probability from a state-action pair to next state-action pair:
       state    action  next_state  next_action  prob
 0  0_hungry  0_ignore    0_hungry     0_ignore   1/4
 1  0_hungry  0_ignore    0_hungry       1_feed   3/4
 2  0_hungry    1_feed    0_hungry     0_ignore  1/12
 3  0_hungry    1_feed    0_hungry       1_feed   1/4
 4    1_full  0_ignore    0_hungry     0_ignore  3/16
 5    1_full  0_ignore    0_hungry       1_feed  9/16
 6    1_full    1_feed    0_hungry     0_ignore     0
 7    1_full    1_feed    0_hungry       1_feed     0
 8  0_hungry  0_ignore      1_full     0_ignore     0
 9  0_hungry  0_ignore      1_full       1_feed     0
10  0_hungry    1_feed      1_full     0_ignore   5/9
11  0_hungry    1_feed      1_full       1_feed   1/9
12    1_full  0_ignore      1_full     0_ignore  5/24
13    1_full  0_ignore      1_full       1_feed  1/24
14    1_full    1_feed      1_full     0_ignore   5/6
15    1_full    1_feed      1_full       1_feed   1/6
expected reward given a state:
      state  reward
0  0_hungry    -3/4
1    1_full    -2/3

Discount

In [7]:
gamma = Rational(4, 5)
print("discount factor:", gamma)
discount factor: 4/5
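
The discount factor weights future rewards through the discounted return, of which all the values below are expectations:

$$G_t = \sum_{k=0}^{\infty} \gamma^k R_{t+k+1}$$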

Value

Calculate values
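
For a finite MDP the Bellman expectation equation $v_\pi = r_\pi + \gamma P_\pi v_\pi$ is a linear system, so the state values can be obtained by matrix inversion, and the action values then follow by a one-step lookahead; the next cell does exactly this:

$$v_\pi = (I - \gamma P_\pi)^{-1} r_\pi, \qquad q_\pi(s, a) = r(s, a) + \gamma \sum_{s'} p(s' \mid s, a) \, v_\pi(s')$$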

In [8]:
s_reward_vector = Matrix(s_rewards)
s_s1_prob_matrix = Matrix(s_s1_probs.unstack(level="next_state").values)
s_s1_dinv_matrix = (eye(state_count) - gamma * s_s1_prob_matrix).inv()
s_value_vector = s_s1_dinv_matrix * s_reward_vector
s_values = pd.Series({state: value for state, value in zip(states, s_value_vector)}, index=states)
print("state values:")
display(s_values.to_frame("value").reset_index())

s1_values = deepcopy(s_values)
s1_values.index.name = "next_state"
s_a_values = s_a_rewards + gamma * (s_a_s1_probs * s1_values).groupby(["state", "action"]).sum()
print("action values:")
display(s_a_values.to_frame("value").reset_index())
state values:
      state     value
0  0_hungry  -475/132
1    1_full   -155/44
action values:
      state    action    value
0  0_hungry  0_ignore  -161/33
1  0_hungry    1_feed  -314/99
2    1_full  0_ignore   -85/22
3    1_full    1_feed   -20/11
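
The same linear-system trick works directly in state-action space: with the state-action transition matrix built in cell 6 (here $P'_\pi$) and the expected-reward vector $r(s, a)$, the action values solve their own Bellman equation, and the state values are recovered by averaging over the policy:

$$q_\pi = (I - \gamma P'_\pi)^{-1} r, \qquad v_\pi(s) = \sum_a \pi(a \mid s) \, q_\pi(s, a)$$
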
In [9]:
s_a_reward_vector = Matrix(s_a_rewards)
s_a_s1_a1_prob_matrix = Matrix(s_a_s1_a1_probs.unstack(level=["next_state", "next_action"]).values)
s_a_dinv_matrix = (eye(state_action_count) - gamma * s_a_s1_a1_prob_matrix).inv()
s_a_value_vector = s_a_dinv_matrix * s_a_reward_vector
s_a_values = pd.Series({state_action: value for state_action, value
        in zip(state_actions, s_a_value_vector)}, index=state_actions)
print("action values:")
display(s_a_values.to_frame("value").reset_index())

s_values = (s_a_values * s_a_probs).groupby("state").sum()
print("state values:")
display(s_values.to_frame("value").reset_index())
action values:
      state    action    value
0  0_hungry  0_ignore  -161/33
1  0_hungry    1_feed  -314/99
2    1_full  0_ignore   -85/22
3    1_full    1_feed   -20/11
state values:
      state     value
0  0_hungry  -475/132
1    1_full   -155/44

Expected return
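
The expected discounted return is the state values weighted by the initial state distribution:

$$g_\pi = \mathbb{E}[G_0] = \sum_s p_{S_0}(s) \, v_\pi(s)$$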

In [10]:
g = (s0_probs * s_values).sum()
print("expected return at t=0:",  g)
expected return at t=0: -235/66

Improve policy
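
By the policy improvement theorem, acting greedily with respect to $q_\pi$ yields a policy at least as good as $\pi$, and strictly better whenever $\pi$ puts probability on actions that do not maximize $q_\pi$:

$$\pi'(s) = \arg\max_a q_\pi(s, a)$$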

In [11]:
improvable = ((s_a_probs > 0.) & s_a_values.lt(s_a_values.groupby("state").max())).any()
print("policy can be improved:", improvable)

improved_s_actions = s_a_values.unstack(level="action").astype(float).idxmax(axis=1)  # best action per state
improved_s_actions.index.name = "state"
print("improved policy:")
display(improved_s_actions.to_frame("action").reset_index())
policy can be improved: True
improved policy:
      state  action
0  0_hungry  1_feed
1    1_full  1_feed

Visitation Frequency

Calculate visitation frequencies
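
The discounted state visitation frequencies satisfy a balance equation of the same linear form as the Bellman equation, only with the transposed transition matrix, so they too come from a matrix inverse; multiplying by the policy then gives the state-action frequencies:

$$\rho_\pi(s) = \sum_{t=0}^{\infty} \gamma^t \Pr[S_t = s], \qquad \rho_\pi = (I - \gamma P_\pi^\top)^{-1} p_{S_0}, \qquad \rho_\pi(s, a) = \rho_\pi(s) \, \pi(a \mid s)$$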

In [12]:
s0_prob_vector = Matrix(s0_probs)
s1_s_prob_matrix = Matrix(s_s1_probs.unstack(level="next_state").values).T
s1_s_dinv_matrix = (eye(state_count) - gamma * s1_s_prob_matrix).inv()
s_freq_vector = s1_s_dinv_matrix * s0_prob_vector
s_freqs = pd.Series({state: freq for state, freq in zip(states, s_freq_vector)}, index=states)
print("state visitation frequencies:")
display(s_freqs.to_frame("freq").reset_index())

s_a_freqs = s_a_probs * s_freqs
print("state-action visitation frequencies:")
display(s_a_freqs.to_frame("freq").reset_index())
state visitation frequencies:
      state   freq
0  0_hungry  30/11
1    1_full  25/11
state-action visitation frequencies:
      state    action    freq
0  0_hungry  0_ignore   15/22
1  0_hungry    1_feed   45/22
2    1_full  0_ignore  125/66
3    1_full    1_feed   25/66
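
Alternatively, the frequencies can be computed natively in state-action space from the transposed state-action transition matrix and the initial state-action distribution; summing over actions recovers the state frequencies:

$$\rho_\pi^{sa} = (I - \gamma (P'_\pi)^\top)^{-1} p_{S_0, A_0}, \qquad \rho_\pi(s) = \sum_a \rho_\pi(s, a)$$
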
In [13]:
s0_a0_prob_vector = Matrix(s0_a0_probs)
s1_a1_s_a_prob_matrix = Matrix(s_a_s1_a1_probs.unstack(level=["next_state", "next_action"]).values).T
s1_a1_s_a_dinv_matrix = (eye(state_action_count) - gamma * s1_a1_s_a_prob_matrix).inv()
s_a_freq_vector = s1_a1_s_a_dinv_matrix * s0_a0_prob_vector
s_a_freqs = pd.Series({state_action: freq for state_action, freq in zip(state_actions, s_a_freq_vector)}, index=state_actions)
print("state-action visitation frequencies:")
display(s_a_freqs.to_frame("freq").reset_index())

s_freqs = s_a_freqs.groupby("state").sum()
print("state visitation frequencies:")
display(s_freqs.to_frame("freq").reset_index())
state-action visitation frequencies:
      state    action    freq
0  0_hungry  0_ignore   15/22
1  0_hungry    1_feed   45/22
2    1_full  0_ignore  125/66
3    1_full    1_feed   25/66
state visitation frequencies:
      state   freq
0  0_hungry  30/11
1    1_full  25/11

Expected return
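
Equivalently, the expected return is the frequency-weighted expected reward, which provides a consistency check against the value-based computation in cell 10:

$$g_\pi = \sum_{s, a} \rho_\pi(s, a) \, r(s, a)$$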

In [14]:
g = (s_a_freqs * s_a_rewards).sum()
print("expected return at t=0:",  g)
expected return at t=0: -235/66

Optimal Value and Optimal Policy

Optimal values
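
The optimal state values solve a linear program: minimize the initial-distribution-weighted values subject to one Bellman optimality inequality per state-action pair. The next cell passes exactly this to scipy's linprog (in the form $-Av \le -b$) and converts the floating-point solution back to exact rationals:

$$\min_v \sum_s p_{S_0}(s) \, v(s) \quad \text{s.t.} \quad v(s) \ge r(s, a) + \gamma \sum_{s'} p(s' \mid s, a) \, v(s') \quad \forall (s, a)$$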

In [15]:
# indicator that next_state equals state, laid out like the transition table
s_a_s1_eye = deepcopy(s_a_s1_probs) * 0.
s_a_s1_eye.loc[s_a_s1_eye.index.get_level_values("state") == s_a_s1_eye.index.get_level_values("next_state")] = 1
A = (s_a_s1_eye - gamma * s_a_s1_probs).unstack(level="next_state")
b = s_a_rewards
# -A v <= -b encodes the Bellman inequalities A v >= b, i.e. v >= r + gamma P v
res = linprog(c=s0_probs, A_ub=-A.values, b_ub=-b.values)
# recover exact rationals from the floating-point LP solution
s_optvalues = pd.Series([Rational(x).limit_denominator(1000) for x in res.x], index=states)

print("optimal state values:")
display(s_optvalues.to_frame("optvalue").reset_index())

s1_optvalues = deepcopy(s_optvalues)
s1_optvalues.index.name = "next_state"
s_a_optvalues = s_a_rewards + gamma * (s_a_s1_probs * s1_optvalues).groupby(["state", "action"]).sum()
print("optimal action values:")
display(s_a_optvalues.to_frame("optvalue").reset_index())
optimal state values:
      state  optvalue
0  0_hungry     35/11
1    1_full         5
optimal action values:
      state    action  optvalue
0  0_hungry  0_ignore      6/11
1  0_hungry    1_feed     35/11
2    1_full  0_ignore     21/11
3    1_full    1_feed         5

Optimal policy
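
A deterministic optimal policy acts greedily with respect to the optimal action values:

$$\pi_*(s) = \arg\max_a q_*(s, a)$$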

In [16]:
s_optactions = s_a_optvalues.unstack(level="action").astype(float).idxmax(axis=1)
print("optimal policy:")
display(s_optactions.to_frame("optaction").reset_index())
optimal policy:
      state  optaction
0  0_hungry     1_feed
1    1_full     1_feed

Optimal expected return

In [17]:
optg = (s0_probs * s_optvalues).sum()
print("optimal expected return at t=0:", optg)
optimal expected return at t=0: 45/11