# 【Hung-yi Lee 機器學習 - L2 : Phoneme Classification(Classification) 】
:::info
- 參考 [2021 Spring](https://speech.ee.ntu.edu.tw/~hylee/ml/2021-spring.php)、[2022 Spring](https://speech.ee.ntu.edu.tw/~hylee/ml/2022-spring.php)、[2023 Spring](https://speech.ee.ntu.edu.tw/~hylee/ml/2023-spring.php)、[2024 Spring](https://speech.ee.ntu.edu.tw/~hylee/genai/2024-spring.php)、[2025](https://speech.ee.ntu.edu.tw/~hylee/ml/2025-spring.php)
- Lecture 2 : Deep Learning Introduction
- 分類器 — 淺談機器學習原理
- HW 2-1 : Phoneme Classification
:::
<br/>
## Lecture 2 : Deep Learning Introduction
### [【機器學習 2022】再探寶可夢、數碼寶貝分類器 — 淺談機器學習原理](https://www.youtube.com/watch?v=_j9MVVcvyZI)
### [ML Lecture 4: Classification](https://www.youtube.com/watch?v=fZAZUYEeIMg)
### [ML Lecture 5: Logistic Regression](https://www.youtube.com/watch?v=hSXFuypLukA)
### [ML Lecture 6: Brief Introduction of Deep Learning](https://www.youtube.com/watch?v=Dr-WRlEFefw)
為什麼要用分類(Classification),而非線性迴歸(Regression)?

線性迴歸會找到讓平均 Loss 最小的線,因此得到的紫色線,分類結果反而會比綠色線差

第一步: 猜測符合raw_data的函式

第二步: 定義 Gaussian distribution,找到最大可能性
假設有一個 Gaussian distribution 可以找到同一個 class 所有點的 mean µ


帶入新的x,算出新的機率
目的是尋找最大的可能性,越接近的mean µ,越有可能為該 class




假設 水系寶可夢61隻、一般系寶可夢79隻


為了要減少參數(參數多,越有可能造成 overfitting),因此可以共用 covariance matrix



<br/>
## HW 2-1 : Phoneme Classification
使用音檔轉換好的vec+音素label,判斷測試集的音素label



多類別分類(multiclass classification),目標是利用深度神經網路(DNN,適合用於 MFCC) 訓練一個語音分類器
```=
def get_device():
    """Return ``'cuda'`` when PyTorch reports a usable CUDA device, else ``'cpu'``."""
    return 'cuda' if torch.cuda.is_available() else 'cpu'
def same_seeds(seed):
    """Seed the PyTorch and NumPy random generators and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    # Stop cuDNN from auto-tuning (benchmarking) which algorithm to use.
    torch.backends.cudnn.benchmark = False
    # Restrict cuDNN to deterministic algorithms so operations such as
    # convolutions do not introduce run-to-run variation.
    torch.backends.cudnn.deterministic = True
def predict(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and collect class indices.

    Args:
        test_loader: Iterable yielding feature batches (tensors, no labels).
        model: Module whose output has one score per class along dimension 1.
        device: Device the input batches are moved to before the forward pass.

    Returns:
        A flat list with one predicted class index (NumPy integer) per sample,
        in loader order.
    """
    model.eval()  # switch the model to evaluation mode
    predictions = []
    with torch.no_grad():  # no gradients are needed for inference
        for inputs in test_loader:
            outputs = model(inputs.to(device))
            # torch.max returns (values, indices); only the indices are needed.
            _, test_pred = torch.max(outputs, 1)
            predictions.extend(test_pred.cpu().numpy())
    return predictions
```
```=
# 資料集
# 處理因變數、自變數
import torch
from torch.utils.data import Dataset
class TIMITDataset(Dataset):
    """Dataset of feature vectors with optional integer class labels.

    Args:
        X: NumPy array of features; stored as a float tensor.
        y: Optional NumPy array of labels. Values are cast to integers, so
            numeric strings are accepted. ``None`` marks an unlabeled
            (test) dataset.
    """

    def __init__(self, X, y=None):
        self.data = torch.from_numpy(X).float()
        if y is not None:
            # Request int64 explicitly: NumPy's default ``int`` width is
            # platform dependent, while class-index targets must be long.
            self.label = torch.as_tensor(y.astype(int), dtype=torch.long)
        else:
            self.label = None

    def __getitem__(self, idx):
        """Return ``(features, label)`` when labels exist, else just ``features``."""
        if self.label is not None:
            return self.data[idx], self.label[idx]
        return self.data[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)
```
```=
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
class Classifier(nn.Module):
    """Fully connected network: 429 input features -> 39 class log-probabilities.

    Three hidden linear layers (1024, 512, 128 units) each followed by ReLU,
    then a linear output layer and ``log_softmax``.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(429, 1024)
        self.layer2 = nn.Linear(1024, 512)
        self.layer3 = nn.Linear(512, 128)
        self.out = nn.Linear(128, 39)
        self.act_fn = nn.ReLU()  # nn.Sigmoid() is a drop-in alternative

    def forward(self, x):
        """Return log-probabilities over the 39 classes for a batch ``x``.

        Args:
            x: Batch of inputs whose last dimension is 429.

        Returns:
            Tensor with 39 log-probabilities per sample (normalised over dim 1).
        """
        for hidden in (self.layer1, self.layer2, self.layer3):
            x = self.act_fn(hidden(x))
        # NOTE: the training cell uses nn.CrossEntropyLoss, which applies
        # log_softmax itself. Feeding it log-probabilities yields the same loss
        # (log_softmax is idempotent), so returning the raw ``self.out(x)``
        # logits would be an equivalent, slightly cheaper alternative.
        return F.log_softmax(self.out(x), dim=1)
```
總共有三個檔案
train_11.npy: training data 語音特徵,已經提取MFCC
train_label_11.npy: training label 對應的音素類別
test_11.npy: testing data (x)
```=
!gdown --id '1HPkcmQmFGu-3OknddKIa5dNDsR05lIQR' --output data.zip
!unzip data.zip
!ls
```

```=
import numpy as np
# Load the pre-extracted feature/label arrays from the unzipped data folder.
print('Loading data ...')
data_root = './timit_11/'
train_ori, train_label_ori, test = (
    np.load(data_root + file_name)
    for file_name in ('train_11.npy', 'train_label_11.npy', 'test_11.npy')
)
print(f'Size of training data: {train_ori.shape}')
print(f'Size of testing data: {test.shape}')
```

```=
# Show the dtype of each loaded array.
for title, array in (
    ("Train Data", train_ori),
    ("Train Label", train_label_ori),
    ("Test Data", test),
):
    print(f"{title} Type: {array.dtype}")

# Show the first 5 entries of each array.
for title, array in (
    ("Train Data", train_ori),
    ("Train Labels", train_label_ori),
    ("Test Data", test),
):
    print(f"\n{title} Sample:\n", array[:5])
```

查看音素類別有幾種
```=
import numpy as np
# List the distinct phoneme labels present in the training label file.
label_path = ".../colab_ml/timit_11/train_label_11.npy"
labels = np.load(label_path)
unique_labels = np.unique(labels)
print(unique_labels)
```

因為載的dataset沒有test的解答,這裡不使用
改成先切出 10% test data,剩下再分 80% train data + 20% validation data
```=
import numpy as np
import pandas as pd

# Hold out 10% of the labelled data as a local test set, because the
# downloaded test split ships without ground-truth labels.
np.random.seed(77777)
TEST_RATIO = 0.1

# Shuffle features and labels together by indexing both arrays with a single
# random permutation. This keeps rows aligned without materialising a Python
# list of (feature, label) tuples for every sample.
# NOTE(review): the resulting order may differ from the previous
# list-shuffle for the same seed; only reproducibility is relied on here.
shuffled_idx = np.random.permutation(train_ori.shape[0])
train_ori_shuffled = train_ori[shuffled_idx]
train_label_ori_shuffled = train_label_ori[shuffled_idx]

train_size = int(train_ori_shuffled.shape[0] * (1 - TEST_RATIO))
train = train_ori_shuffled[:train_size]              # 90%: training features
train_label = train_label_ori_shuffled[:train_size]  # 90%: training labels
test = train_ori_shuffled[train_size:]               # 10%: held-out features
test_label = train_label_ori_shuffled[train_size:]   # 10%: held-out labels

print("\nTrain Data Sample:\n", train[:5])
print("\nTrain Labels Sample:\n", train_label[:5])
print("\nTest Data Sample:\n", test[:5])
print("\nTest Labels Sample:\n", test_label[:5])

# Persist the held-out ground truth so predictions can be scored later.
test_label_df = pd.DataFrame(test_label, columns=['label'])
test_label_df.to_csv('test_data_y_hat.csv', index=False)
```

```=
# Split what remains: the first 80% for training, the last 20% for validation.
VAL_RATIO = 0.2
percent = int(train.shape[0] * (1 - VAL_RATIO))
train_x, val_x = train[:percent], train[percent:]
train_y, val_y = train_label[:percent], train_label[percent:]
print(f'Size of training set: {train_x.shape}')
print(f'Size of validation set: {val_x.shape}')
```

```=
from torch.utils.data import DataLoader

BATCH_SIZE = 64

# Wrap each split in a dataset and a batched loader.
# Only the training loader shuffles its samples.
train_set = TIMITDataset(train_x, train_y)
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

val_set = TIMITDataset(val_x, val_y)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE, shuffle=False)
```
```=
# Use the garbage-collection module to release memory that is no longer used.
import gc

# The datasets built earlier hold their own tensors, so these array names
# are no longer needed.
del train, train_label
del train_x, train_y
del val_x, val_y
gc.collect()
```
```=
# Hyperparameters and checkpoint location.
num_epoch = 50
learning_rate = 0.0001
model_path = './model.ckpt'

# Fix the random seeds before the model is created so that weight
# initialisation is reproducible.
same_seeds(0)

device = get_device()
print(f'DEVICE: {device}')

# Model, loss function and optimizer.
model = Classifier().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
```
```=
# training
# Train for up to num_epoch epochs. When a validation set exists, checkpoint
# the model each time validation accuracy improves and stop early once it
# has not improved for early_stop_patience consecutive epochs.
early_stop_patience = 10  # must be smaller than num_epoch to ever take effect
early_stop_count = 0
best_acc = 0.0

for epoch in range(num_epoch):
    train_acc = 0.0
    train_loss = 0.0
    val_acc = 0.0
    val_loss = 0.0

    # training
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, labels)
        # torch.max returns (values, indices); the indices are the predicted
        # classes and the max values themselves are not needed.
        _, train_pred = torch.max(outputs, 1)
        batch_loss.backward()
        optimizer.step()

        # Accumulate the number of correct predictions and the batch losses.
        train_acc += (train_pred == labels).sum().item()
        train_loss += batch_loss.item()

    # validation
    if len(val_set) > 0:
        model.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                batch_loss = criterion(outputs, labels)
                _, val_pred = torch.max(outputs, 1)

                val_acc += (val_pred == labels).sum().item()
                val_loss += batch_loss.item()

        # {:03d}  : integer zero-padded to 3 digits (e.g. 001)
        # {:3.6f} : float shown with 6 decimal places
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f} | Val Acc: {:3.6f} loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader), val_acc/len(val_set), val_loss/len(val_loader)
        ))

        # Save a checkpoint when validation accuracy improves; otherwise
        # count one more epoch without improvement.
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), model_path)
            print('saving model with acc {:.3f}'.format(best_acc/len(val_set)))
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count >= early_stop_patience:
                print('\nEarly stop')
                break
    else:
        print('[{:03d}/{:03d}] Train Acc: {:3.6f} Loss: {:3.6f}'.format(
            epoch + 1, num_epoch, train_acc/len(train_set), train_loss/len(train_loader)
        ))

# if not validating, save the last epoch
if len(val_set) == 0:
    torch.save(model.state_dict(), model_path)
    print('saving model at last epoch')
```

測試並存檔
```=
# testing
# Build a features-only dataset/loader over the held-out split; order is kept
# (no shuffling) so predictions line up with the saved ground-truth labels.
test_set = TIMITDataset(test, None)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

# Recreate the network and restore the saved checkpoint weights.
model = Classifier().to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU can still be loaded in a CPU-only session.
model.load_state_dict(torch.load(model_path, map_location=device))
```
```=
# Predict a class for every held-out sample and write the results to CSV.
preds = predict(test_loader, model, device)
pd.DataFrame({'Predictions': preds}).to_csv('prediction.csv', index=False)
```
```=
from sklearn.metrics import accuracy_score, confusion_matrix
# Compare the two CSV files against each other.
# Predicted labels.
pred_df = pd.read_csv('prediction.csv')
preds = pred_df['Predictions'].values

# Ground-truth labels of the held-out split.
test_data_y_hat_df = pd.read_csv('test_data_y_hat.csv')
test_data_y_hat = test_data_y_hat_df['label'].values

# Accuracy.
accuracy = accuracy_score(test_data_y_hat, preds)
print('Accuracy: {:.4f}'.format(accuracy))

# Confusion matrix.
cm = confusion_matrix(test_data_y_hat, preds)
print('Confusion Matrix:', cm, sep='\n')
```
