# CJSHS AI Course Week 5

## Links to previous lessons
- Week 1: https://hackmd.io/@Titi/ByNUY-x2kx
- Week 2: https://hackmd.io/@Titi/S18ym5M6ke
- Week 3: https://hackmd.io/@Titi/H1XIq-SAJl

## Week 4 Course

## Reinforcement learning
- Example:
    - https://youtu.be/VMp6pq6_QjI?si=GD4EpytJVoUd-36x

![Reinforcement_learning_diagram](https://hackmd.io/_uploads/r1RhlIp-ex.svg)

### Open Colab
- https://colab.research.google.com/
- Set the runtime to use the GPU:
    1. Runtime
    2. Change runtime type
    3. Select "T4 GPU"
    4. Save

### Build the game
```python=
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import random


class MazeEnv(gym.Env):
    """
    Fixed 6x6 maze: start = (0, 0), goal = (5, 5), wall = 1, open cell = 0
    Reward: reaching the goal +1, hitting a wall -0.2, every other step -0.01
    An episode lasts at most 100 steps
    Observation: the agent's (x, y) coordinates, normalized to the 0-1 range
    Action: 0 = up, 1 = down, 2 = left, 3 = right
    """
    metadata = {"render_modes": ["human", "rgb_array"]}

    def __init__(self, render_mode=None):
        super().__init__()
        self.maze = np.array(
            [[0,0,0,0,0,0],
             [0,0,1,1,1,0],
             [0,0,0,0,1,0],
             [1,1,0,0,1,0],
             [0,0,0,1,0,0],
             [0,1,0,0,0,2]], dtype=np.int8)   # 2 = goal cell
        self.start_pos = (0, 0)
        self.goal_pos = (5, 5)
        self.max_steps = 100
        self.action_space = spaces.Discrete(4)
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(2,), dtype=np.float32)
        self.render_mode = render_mode
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.agent_pos = list(self.start_pos)
        self.steps = 0
        return self._get_obs(), {}

    def step(self, action):
        self.steps += 1
        x, y = self.agent_pos
        if action == 0:
            ny, nx = y - 1, x
        elif action == 1:
            ny, nx = y + 1, x
        elif action == 2:
            ny, nx = y, x - 1
        else:
            ny, nx = y, x + 1

        reward = -0.01
        done = False
        if not self._out_of_bounds(nx, ny) and self.maze[ny, nx] != 1:
            self.agent_pos = [nx, ny]
        else:
            reward -= 0.19          # extra penalty for hitting a wall (total -0.2)

        if tuple(self.agent_pos) == self.goal_pos:
            reward = 1.0
            done = True
        elif self.steps >= self.max_steps:
            done = True

        if self.render_mode == "human":
            self.render()
        return self._get_obs(), reward, done, False, {}

    # ----- helper functions -----
    def _get_obs(self):
        # scale (x, y) into the 0-1 range
        return np.array(self.agent_pos, dtype=np.float32) / (self.maze.shape[0] - 1)

    def _out_of_bounds(self, x, y):
        s = self.maze.shape[0]
        return x < 0 or y < 0 or x >= s or y >= s

    def render(self, mode="human"):
        grid = np.copy(self.maze)
        x, y = self.agent_pos
        grid[y, x] = 3              # 3 = agent
        if mode == "human":
            symbols = {0: ' ', 1: '#', 2: 'G', 3: 'A'}
            print("\n".join("".join(symbols[c] for c in row) for row in grid))
            print("-" * 10)
        elif mode == "rgb_array":
            # draw one pixel per cell, then upscale so the image is easy to see
            h, w = grid.shape
            palette = {
                0: (240, 240, 240),   # open cell
                1: (60, 60, 60),      # wall
                2: (0, 200, 0),       # goal
                3: (200, 20, 20)      # agent
            }
            canvas = np.zeros((h, w, 3), dtype=np.uint8)
            for yy in range(h):
                for xx in range(w):
                    canvas[yy, xx] = palette[int(grid[yy, xx])]
            # upscale by 20x: each cell becomes a 20x20 pixel block
            return np.kron(canvas, np.ones((20, 20, 1), dtype=np.uint8))
        else:
            raise NotImplementedError
```

### Build the AI model
```python=
import torch, torch.nn as nn, torch.optim as optim
from collections import deque
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt
from tqdm.auto import trange

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class QNet(nn.Module):
    # Q-network: maps the 2-dim state to one Q-value per action
    def __init__(self, in_dim, n_actions):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 64), nn.ReLU(),
            nn.Linear(64, 64), nn.ReLU(),
            nn.Linear(64, n_actions))
    def forward(self, x):
        return self.net(x)

class ReplayBuf:
    # simple experience-replay buffer storing (s, a, r, s', done) transitions
    def __init__(self, cap=50_000):
        self.buf = deque(maxlen=cap)
    def push(self, *t):
        self.buf.append(tuple(map(np.array, t)))
    def sample(self, b):
        s, a, r, ns, d = map(np.array, zip(*random.sample(self.buf, b)))
        return (torch.tensor(s, dtype=torch.float32),
                torch.tensor(a, dtype=torch.int64).unsqueeze(1),
                torch.tensor(r, dtype=torch.float32).unsqueeze(1),
                torch.tensor(ns, dtype=torch.float32),
                torch.tensor(d, dtype=torch.float32).unsqueeze(1))
    def __len__(self):
        return len(self.buf)
```
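Before moving on to the visualization and training cells, it can help to confirm that the three pieces above fit together. The sketch below is an optional sanity check, not part of the course notebook: it fills a small `ReplayBuf` with random transitions from `MazeEnv`, samples a batch, and runs it through an untrained `QNet`. The names `check_env`, `buf_check`, and `net_check` are illustrative only.

```python=
# Optional sanity check: random rollout -> replay buffer -> one QNet forward pass.
# Assumes the MazeEnv, QNet, and ReplayBuf cells above have already been run
# (np, random, torch, and device are defined there).
check_env = MazeEnv()
buf_check = ReplayBuf(cap=1_000)

s, _ = check_env.reset()
for _ in range(64):                                   # collect a few random transitions
    a = check_env.action_space.sample()
    ns, r, done, truncated, _ = check_env.step(a)
    buf_check.push(s, a, r, ns, done)
    s = ns if not done else check_env.reset()[0]      # restart if the episode ended

net_check = QNet(check_env.observation_space.shape[0], check_env.action_space.n).to(device)
S, A, R, NS, D = [t.to(device) for t in buf_check.sample(32)]
print(net_check(S).shape)   # expected: torch.Size([32, 4]) -> one Q-value per action
```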
### Define the rendering function
```python=
import imageio
from IPython.display import Image, display

def play_episode(env, policy_net, max_steps=200):
    """Return total_reward and frames (a list of RGB ndarrays)."""
    state, _ = env.reset()
    done = False
    total_r = 0.0
    frames = [env.render(mode="rgb_array")]
    while not done and len(frames) < max_steps:
        with torch.no_grad():
            action = int(torch.argmax(policy_net(torch.tensor(state, device=device))).item())
        state, reward, done, _, _ = env.step(action)
        total_r += reward
        frames.append(env.render(mode="rgb_array"))
    return total_r, frames
```

### Train the model and plot the training progress
```python=
import os, imageio, uuid

EVAL_INTERVAL = 100            # evaluate once every 100 episodes
os.makedirs("eval_gifs", exist_ok=True)

env = MazeEnv()
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

q_net = QNet(state_dim, action_dim).to(device)
target_net = QNet(state_dim, action_dim).to(device)
target_net.load_state_dict(q_net.state_dict())

optimiser = optim.Adam(q_net.parameters(), lr=1e-3)
criterion = nn.MSELoss()
buf = ReplayBuf()

BATCH = 128
GAMMA = 0.99
EPS_START, EPS_END, EPS_DECAY = 1.0, 0.05, 500
SYNC = 20
EPISODES = 1000

def eps_by_ep(e):
    return EPS_END + (EPS_START - EPS_END) * np.exp(-e / EPS_DECAY)

writer, rewards = SummaryWriter(), []

for ep in trange(EPISODES):
    s, _ = env.reset()
    ep_r = 0; done = False; eps = eps_by_ep(ep)
    while not done:
        # epsilon-greedy action selection
        act = random.randrange(action_dim) if random.random() < eps else \
              torch.argmax(q_net(torch.tensor(s).to(device))).item()
        ns, r, done, _, _ = env.step(act)
        buf.push(s, act, r, ns, done)
        s = ns; ep_r += r

        if len(buf) >= BATCH:
            S, A, R, NS, D = [t.to(device) for t in buf.sample(BATCH)]
            q = q_net(S).gather(1, A)
            with torch.no_grad():
                qn = target_net(NS).max(1, keepdim=True)[0]
                tgt = R + GAMMA * qn * (1 - D)
            loss = criterion(q, tgt)
            optimiser.zero_grad(); loss.backward(); optimiser.step()

    rewards.append(ep_r)
    writer.add_scalar("Reward", ep_r, ep)
    writer.add_scalar("Epsilon", eps, ep)
    if ep % SYNC == 0:
        target_net.load_state_dict(q_net.state_dict())

    # -------- evaluate every EVAL_INTERVAL episodes --------
    if (ep + 1) % EVAL_INTERVAL == 0:
        eval_env = MazeEnv(render_mode="rgb_array")   # start a fresh, clean environment
        eval_r, frames = play_episode(eval_env, q_net)

        gif_path = f"eval_gifs/maze_ep{ep+1}_{uuid.uuid4().hex[:6]}.gif"
        imageio.mimsave(gif_path, frames, fps=4)
        writer.add_scalar("Eval/Reward", eval_r, ep + 1)   # TensorBoard

        # show the GIF immediately in the Colab output
        display(Image(filename=gif_path))
        print("---")

writer.close()
```

### Monitor the training progress
```python=
%load_ext tensorboard
%tensorboard --logdir runs
```

## Homework
1. Successfully run the code above.
2. Build the CJSHS (長榮中學) GPT database
    - https://hackmd.io/@Titi/HJpsfnM6kg
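Optional extra (not part of the assignment): once the training loop above has finished, you can replay one greedy episode with the trained network and save it as a GIF. This is a minimal sketch that only reuses `MazeEnv`, `play_episode`, `q_net`, `imageio`, `Image`, and `display` from the cells above; the file name `maze_final.gif` is just an example.

```python=
# Optional: render one greedy episode with the trained q_net and save it as a GIF.
# Assumes the cells above have already been run, so MazeEnv, play_episode, q_net,
# imageio, Image, and display are all defined. The file name is arbitrary.
final_env = MazeEnv(render_mode="rgb_array")
final_reward, final_frames = play_episode(final_env, q_net)
print(f"Final greedy episode reward: {final_reward:.2f}")

imageio.mimsave("maze_final.gif", final_frames, fps=4)   # save the frames as an animated GIF
display(Image(filename="maze_final.gif"))                # show it in the Colab output
```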