# Reinforcement Learning I: Model-Based Approach
Author: @NanoStar030
Update: 2024/04/03
###### tags: `RL`
## Policy Evaluation
Compute the state value $v_\pi$ of every state under a given policy $\pi$.
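In symbols, policy evaluation solves the Bellman expectation equation; the reward is written on the landing state here to match the code below, and for a deterministic policy the outer sum collapses to the single action $\pi(s)$:
$$
v_\pi(s) = \sum_{a} \pi(a \mid s) \sum_{s'} P(s' \mid s, a)\,\bigl[R(s', a) + \gamma\, v_\pi(s')\bigr]
$$
where $\gamma$ is the discount factor.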
## Reinforcement Learning

## Markov Decision Process (MDP)
- State $S$
- Action $A$
- Reward $R$
- Transition Matrix $P$
- Policy $\pi$: describes how the Agent chooses an Action given the current State.
- Return $G$: the total (discounted) Reward collected over the steps (see the formulas below).
- Value $V$: the expected Return, used to judge how good a state or action is.
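In the usual notation, with discount factor $\gamma \in [0, 1]$, the Return and the state Value are
$$
G_t = R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \cdots, \qquad V(s) = \mathbb{E}\bigl[G_t \mid S_t = s\bigr].
$$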
1. State Value Iteration
~~~ python
import numpy as np

### --- State Value Iteration --- ###
def StateValueIteration(V, P, ER, discount, k, epsilon=-1, h=100):
    # Bellman optimality backup: ER(s, a) + discount * sum_{s'} P(s'|s, a) V(s'), shape (|A|, |S|).
    v = ER.T + discount * P.dot(V).T
    # Greedy value and greedy action for each state.
    vmax, varg = v.max(axis=0), v.argmax(axis=0)
    print(k, vmax, varg)
    # Stop after h iterations or once the largest value change is within epsilon.
    if k >= h or epsilon >= np.max(np.abs(vmax - V)):
        return
    StateValueIteration(vmax, P, ER, discount, k+1, epsilon, h)
~~~
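In symbols, the recursion above performs the Bellman optimality backup, using the expected reward $ER$ defined in step 4 below:
$$
V_{k+1}(s) = \max_{a}\Bigl[ER(s, a) + \gamma \sum_{s'} P(s' \mid s, a)\, V_k(s')\Bigr]
$$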
2. Q-Value Iteration
~~~ python
### --- Q Value Iteration --- ###
def Q_ValueIteration(Q, P, ER, discount, k, epsilon=-1, h=100):
    # Best achievable value in each state under the current Q: max_a Q(s, a).
    qmax = Q.max(axis=0)
    # Q backup: ER(s, a) + discount * sum_{s'} P(s'|s, a) max_a' Q(s', a'), shape (|A|, |S|).
    q = ER.T + discount * P.dot(qmax).T
    print(k, q)
    # Stop after h iterations or once the largest Q change is within epsilon.
    if k >= h or epsilon >= np.max(np.abs(q - Q)):
        return
    Q_ValueIteration(q, P, ER, discount, k+1, epsilon, h)
~~~
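The corresponding backup on action values, as implemented above, is
$$
Q_{k+1}(s, a) = ER(s, a) + \gamma \sum_{s'} P(s' \mid s, a)\, \max_{a'} Q_k(s', a').
$$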
3. Policy Iteration
~~~ python
### --- Policy Iteration --- ###
def policyEvaluation(policy, P, R, discount, epsilon):
    # Iteratively solve V for the fixed policy until the largest update falls below epsilon.
    V = np.zeros(len(R))
    while True:
        delta = 0
        for s in range(len(R)):
            v = V[s]
            # Reward is indexed by the landing state s1, matching the ER computation in step 4.
            V[s] = sum([P[s, policy[s], s1] * (R[s1, policy[s]] + discount * V[s1]) for s1 in range(len(R))])
            delta = max(delta, abs(v - V[s]))
        if delta < epsilon:
            break
    return V

def policyImprovement(policy, V, P, R, discount):
    policy_stable = True
    n_actions = P.shape[1]
    for s in range(len(R)):
        old_action = policy[s]
        # Greedy improvement: pick the action with the highest one-step lookahead value.
        policy[s] = np.argmax([sum([P[s, a, s1] * (R[s1, a] + discount * V[s1]) for s1 in range(len(R))]) for a in range(n_actions)])
        if old_action != policy[s]:
            policy_stable = False
    return policy, policy_stable

def PolicyIteration(P, R, discount, epsilon):
    policy = np.zeros(len(R), dtype=int)
    while True:
        # Alternate evaluation and greedy improvement until the policy stops changing.
        V = policyEvaluation(policy, P, R, discount, epsilon)
        policy, policy_stable = policyImprovement(policy, V, P, R, discount)
        print(V, policy)
        if policy_stable:
            break
    return V, policy
~~~
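Policy iteration alternates the two routines above until the policy stops changing: `policyEvaluation` solves the value of the current policy, and `policyImprovement` acts greedily with respect to it,
$$
\pi_{\text{new}}(s) = \arg\max_{a} \sum_{s'} P(s' \mid s, a)\,\bigl[R(s', a) + \gamma\, V^{\pi}(s')\bigr].
$$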
4. Define the Example Inputs
~~~python
# S: states, A: actions
S = ["s1", "s2", "s3"]
A = ["slow", "fast"]
# P: transition matrix, indexed as P[s, a, s'] = P(s' | s, a)
P = np.zeros((len(S), len(A), len(S)), dtype=float)
# R: immediate reward table (indexed by the landing state, as used in the ER computation below)
# ER: expected immediate reward per (state, action) pair
R = np.zeros((len(S), len(A)), dtype=float)
ER = np.zeros((len(S), len(A)), dtype=float)
# Define the R and P of Example 2 (s3 is terminal: its transition rows are all zero)
R[:, 0], R[:, 1] = [1, 1, 0], [2, 2, -10]
P[:, 0, :] = [[1, 0, 0], [0.5, 0.5, 0], [0, 0, 0]]
P[:, 1, :] = [[0.5, 0.5, 0], [0, 0, 1], [0, 0, 0]]
# Calculate the expected reward: ER[s, a] = sum_{s'} P(s' | s, a) * R(s', a)
for a in range(len(A)):
    ER[:, a] = P[:, a, :].dot(R[:, a].T)
~~~
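The final loop precomputes the expected immediate reward used by both value-iteration routines:
$$
ER(s, a) = \sum_{s'} P(s' \mid s, a)\, R(s', a)
$$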
~~~ python
# Run StateValueIteration
V0 = np.zeros(len(S))
StateValueIteration(V0, P, ER, 0.9, 1, epsilon=0.01)

# Run Q_ValueIteration (Q has shape (|A|, |S|))
Q0 = np.zeros((len(A), len(S)))
Q_ValueIteration(Q0, P, ER, 0.9, 1, epsilon=0.01)

# Run PolicyIteration
V, policy = PolicyIteration(P, R, 0.9, 0.01)
print(V, policy)
~~~
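The recursive routines only print their estimates and do not return them. As a small sketch (not part of the original code, mirroring the `varg` already printed inside `StateValueIteration`), the greedy policy can be recovered from a converged value vector with a one-step lookahead:
~~~ python
def greedyPolicy(V, P, ER, discount):
    # One-step lookahead value of every action in every state, shape (|A|, |S|).
    lookahead = ER.T + discount * P.dot(V).T
    # Index of the best action for each state.
    return lookahead.argmax(axis=0)
~~~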