# CJSHS AI Course Week 5
## Links to Previous Lessons
- Week 1: https://hackmd.io/@Titi/ByNUY-x2kx
- Week 2: https://hackmd.io/@Titi/S18ym5M6ke
- Week 3: https://hackmd.io/@Titi/H1XIq-SAJl
## This Week's Lesson
## Reinforcement Learning
In reinforcement learning, an agent learns by trial and error: it acts in an environment, receives rewards, and adjusts its behaviour to collect as much reward as possible.
- Example video: https://youtu.be/VMp6pq6_QjI?si=GD4EpytJVoUd-36x
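The training code later in these notes uses DQN, which approximates the action-value function Q(s, a) with a neural network. As a rough sketch of what the training loop below computes, each update pulls Q(s, a) toward a one-step temporal-difference target (γ is the discount factor, set as `GAMMA` in the code, and the target Q comes from a separate target network):

$$
\text{target} = r + \gamma \,(1-\text{done})\, \max_{a'} Q_{\text{target}}(s', a'),
\qquad
\mathcal{L} = \bigl(Q(s, a) - \text{target}\bigr)^2
$$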

### Open Colab
- https://colab.research.google.com/
- Set the runtime to use GPU compute (a quick check follows the steps below)
1. Runtime
2. Change runtime type
3. Select "T4 GPU"
4. Save
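An optional sanity check (not part of the original notes) to confirm that PyTorch can actually see the T4 GPU before running the rest of the notebook:
```python=
# Optional check: confirm Colab assigned a GPU to this runtime.
import torch

print(torch.cuda.is_available())           # should print True on a T4 runtime
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))   # e.g. "Tesla T4"
```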
### Build the Game Environment
```python=
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random

class MazeEnv(gym.Env):
    """
    6x6 fixed maze: S = start (0,0), G = goal (5,5)
    Wall = 1, open cell = 0
    Reward: reaching G +1, hitting a wall -0.2, otherwise -0.01
    An episode lasts at most 100 steps
    Observation: agent (x, y) coordinates, normalized to 0~1
    Action: 0 = up, 1 = down, 2 = left, 3 = right
    """
    metadata = {"render_modes": ["human", "rgb_array"]}

    def __init__(self, render_mode=None):
        super().__init__()
        self.maze = np.array(
            [[0,0,0,0,0,0],
             [0,0,1,1,1,0],
             [0,0,0,0,1,0],
             [1,1,0,0,1,0],
             [0,0,0,1,0,0],
             [0,1,0,0,0,2]], dtype=np.int8)  # 2 = Goal
        self.start_pos = (0,0)
        self.goal_pos = (5,5)
        self.max_steps = 100
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=0.0, high=1.0,
                                            shape=(2,), dtype=np.float32)
        self.render_mode = render_mode
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.agent_pos = list(self.start_pos)
        self.steps = 0
        return self._get_obs(), {}

    def step(self, action):
        self.steps += 1
        x, y = self.agent_pos
        if action == 0:   ny, nx = y-1, x
        elif action == 1: ny, nx = y+1, x
        elif action == 2: ny, nx = y, x-1
        else:             ny, nx = y, x+1
        reward = -0.01
        done = False
        if not self._out_of_bounds(nx, ny) and self.maze[ny, nx] != 1:
            self.agent_pos = [nx, ny]
        else:
            reward -= 0.19  # extra penalty for hitting a wall
        if tuple(self.agent_pos) == self.goal_pos:
            reward = 1.0
            done = True
        elif self.steps >= self.max_steps:
            done = True
        if self.render_mode == "human":
            self.render()
        return self._get_obs(), reward, done, False, {}

    # ----- helper functions -----
    def _get_obs(self):
        # map (x, y) to the 0~1 range
        return np.array(self.agent_pos, dtype=np.float32) / (self.maze.shape[0]-1)

    def _out_of_bounds(self, x, y):
        s = self.maze.shape[0]
        return x < 0 or y < 0 or x >= s or y >= s

    def render(self, mode="human"):
        grid = np.copy(self.maze)
        x, y = self.agent_pos
        grid[y, x] = 3  # 3 = Agent
        if mode == "human":
            symbols = {0:' ', 1:'#', 2:'G', 3:'A'}
            print("\n".join("".join(symbols[c] for c in row) for row in grid))
            print("-"*10)
        elif mode == "rgb_array":
            # each maze cell becomes one colored pixel, enlarged below for easier viewing
            h, w = grid.shape
            palette = {
                0: (240, 240, 240),  # open cell
                1: (60, 60, 60),     # wall
                2: (0, 200, 0),      # Goal
                3: (200, 20, 20)     # Agent
            }
            canvas = np.zeros((h, w, 3), dtype=np.uint8)
            for yy in range(h):
                for xx in range(w):
                    canvas[yy, xx] = palette[int(grid[yy, xx])]
            # enlarge 20x: each cell becomes a 20x20 pixel block
            return np.kron(canvas, np.ones((20, 20, 1), dtype=np.uint8))
        else:
            raise NotImplementedError
```
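A short smoke test of the environment, sketched here as an optional check (the random actions and step count are only illustrative, not part of the original notes):
```python=
# Quick smoke test: reset the maze, take a few random actions, and render in text mode.
env = MazeEnv(render_mode="human")
obs, info = env.reset()
print("initial observation:", obs)        # normalized (x, y); should be [0. 0.]

for _ in range(5):
    action = env.action_space.sample()    # random action: 0=up 1=down 2=left 3=right
    obs, reward, done, truncated, info = env.step(action)
    print(f"action={action} obs={obs} reward={reward:.2f} done={done}")
    if done:
        break
```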
### Build the AI Model
```python=
import torch, torch.nn as nn, torch.optim as optim
from collections import deque
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
from tqdm.auto import trange

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class QNet(nn.Module):
    # small MLP: (x, y) observation -> one Q-value per action
    def __init__(self, in_dim, n_actions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, n_actions))
    def forward(self, x): return self.net(x)

class ReplayBuf:
    # fixed-size experience replay buffer of (s, a, r, s', done) transitions
    def __init__(self, cap=50_000): self.buf = deque(maxlen=cap)
    def push(self, *t): self.buf.append(tuple(map(np.array, t)))
    def sample(self, b):
        s, a, r, ns, d = map(np.array, zip(*random.sample(self.buf, b)))
        return (torch.tensor(s, dtype=torch.float32),
                torch.tensor(a, dtype=torch.int64).unsqueeze(1),
                torch.tensor(r, dtype=torch.float32).unsqueeze(1),
                torch.tensor(ns, dtype=torch.float32),
                torch.tensor(d, dtype=torch.float32).unsqueeze(1))
    def __len__(self): return len(self.buf)
```
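A small check (not part of the original notes) that the network and buffer fit together the way the training loop expects; the shapes assume the 2-dimensional maze observation and 4 actions:
```python=
# Sanity check: one QNet forward pass and a round trip through the replay buffer.
env = MazeEnv()
net = QNet(in_dim=2, n_actions=4).to(device)

s, _ = env.reset()
q_values = net(torch.tensor(s, dtype=torch.float32, device=device))
print(q_values.shape)                      # torch.Size([4]) -> one Q-value per action

buf = ReplayBuf(cap=100)
for _ in range(8):
    a = env.action_space.sample()
    ns, r, done, _, _ = env.step(a)
    buf.push(s, a, r, ns, done)
    s = ns
S, A, R, NS, D = buf.sample(4)
print(S.shape, A.shape, R.shape)           # (4, 2), (4, 1), (4, 1)
```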
### Define the Episode Playback Function
```python=
import imageio
from IPython.display import Image, display

def play_episode(env, policy_net, max_steps=200):
    """Run one greedy episode; return total_reward and frames (list of RGB ndarrays)."""
    state, _ = env.reset()
    done = False
    total_r = 0.0
    frames = [env.render(mode="rgb_array")]
    while not done and len(frames) < max_steps:
        with torch.no_grad():
            action = int(torch.argmax(policy_net(torch.tensor(state, device=device))).item())
        state, reward, done, _, _ = env.step(action)
        total_r += reward
        frames.append(env.render(mode="rgb_array"))
    return total_r, frames
```
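As a quick illustration (not from the original notes), `play_episode` can already be used before training to record what an untrained policy looks like; the network, variable names, and file name here are just examples:
```python=
# Example: record one episode with a freshly initialized (untrained) QNet as a GIF.
demo_env = MazeEnv(render_mode="rgb_array")
demo_net = QNet(2, 4).to(device)           # untrained network, so expect a poor score

total_reward, frames = play_episode(demo_env, demo_net)
print("episode reward:", total_reward, "frames:", len(frames))

imageio.mimsave("untrained_demo.gif", frames, fps=4)   # hypothetical file name
display(Image(filename="untrained_demo.gif"))
```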
### Train the Model and Visualize the Training Process
```python=
import os, imageio, uuid

EVAL_INTERVAL = 100   # run an evaluation every 100 episodes
os.makedirs("eval_gifs", exist_ok=True)

env = MazeEnv()
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

q_net = QNet(state_dim, action_dim).to(device)
target_net = QNet(state_dim, action_dim).to(device)
target_net.load_state_dict(q_net.state_dict())
optimiser = optim.Adam(q_net.parameters(), lr=1e-3)
criterion = nn.MSELoss()
buf = ReplayBuf()

BATCH = 128
GAMMA = 0.99
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.05, 500
SYNC = 20
EPISODES = 1000

def eps_by_ep(e): return EPS_END + (EPS_START-EPS_END)*np.exp(-e/EPS_DECAY)

writer, rewards = SummaryWriter(), []
for ep in trange(EPISODES):
    s, _ = env.reset()
    ep_r = 0; done = False; eps = eps_by_ep(ep)
    while not done:
        act = random.randrange(action_dim) if random.random() < eps else \
              torch.argmax(q_net(torch.tensor(s).to(device))).item()
        ns, r, done, _, _ = env.step(act)
        buf.push(s, act, r, ns, done)
        s = ns; ep_r += r
        if len(buf) >= BATCH:
            S, A, R, NS, D = [t.to(device) for t in buf.sample(BATCH)]
            q = q_net(S).gather(1, A)
            with torch.no_grad():
                qn = target_net(NS).max(1, keepdim=True)[0]
                tgt = R + GAMMA*qn*(1-D)
            loss = criterion(q, tgt)
            optimiser.zero_grad(); loss.backward(); optimiser.step()
    rewards.append(ep_r)
    writer.add_scalar("Reward", ep_r, ep)
    writer.add_scalar("Epsilon", eps, ep)
    if ep % SYNC == 0: target_net.load_state_dict(q_net.state_dict())
    # -------- evaluate once every 100 episodes --------
    if (ep + 1) % EVAL_INTERVAL == 0:
        eval_env = MazeEnv(render_mode="rgb_array")   # a fresh, clean environment
        eval_r, frames = play_episode(eval_env, q_net)
        gif_path = f"eval_gifs/maze_ep{ep+1}_{uuid.uuid4().hex[:6]}.gif"
        imageio.mimsave(gif_path, frames, fps=4)
        writer.add_scalar("Eval/Reward", eval_r, ep + 1)   # TensorBoard
        # show the GIF immediately in the Colab output
        display(Image(filename=gif_path))
        print("---")
writer.close()
```
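The `rewards` list collected above can also be plotted directly with matplotlib (imported earlier but otherwise unused); the 50-episode smoothing window below is an arbitrary illustrative choice:
```python=
# Plot per-episode reward plus a simple moving average of the training curve.
window = 50                                 # smoothing window (arbitrary choice)
smoothed = np.convolve(rewards, np.ones(window) / window, mode="valid")

plt.figure(figsize=(8, 4))
plt.plot(rewards, alpha=0.3, label="episode reward")
plt.plot(range(window - 1, len(rewards)), smoothed, label=f"{window}-episode average")
plt.xlabel("episode")
plt.ylabel("total reward")
plt.legend()
plt.show()
```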
### Monitor the Training Process
```python=
%load_ext tensorboard
%tensorboard --logdir runs
```
## Homework
1. Successfully run all of the code above.
2. Build the CJSHS (長榮中學) GPT database
- https://hackmd.io/@Titi/HJpsfnM6kg