# ML_HW4
**(1)[c]**
$$
\begin{align}
w(t+1) &= w(t) - \eta \nabla E_{aug}(w(t))\\
&= w(t) - \eta \nabla (E_{in}(w(t)) + \frac{\lambda}{N}w^T(t)w(t))\\
&= w(t) - \eta \nabla E_{in}(w(t)) - \frac{\eta \lambda}{N} \nabla w^T(t)w(t)\\
&= w(t) - \eta \nabla E_{in}(w(t)) - \frac{\eta \lambda}{N} \times 2 \times w(t)\\
&= (1 - \frac{2 \eta \lambda}{N})w(t) - \eta \nabla E_{in}(w(t))
\end{align}
$$
**(2)[b]**
$$
\rm{To \ find \ min \ } \frac{1}{N} \sum^N_{n=1} (w-y_n)^2 + \frac{\lambda}{N} w^2\\
\Rightarrow \frac{d}{dw} (\frac{1}{N} \sum^N_{n=1} (w-y_n)^2 + \frac{\lambda}{N} w^2) = 0\\
\frac{1}{N} \sum^N_{n=1} \nabla (w-y_n)^2 + \frac{2 \lambda}{N} w = 0\\
\frac{1}{N} \sum^N_{n=1} 2 \ (w-y_n) + \frac{2 \lambda}{N} w = 0\\
2 w + \frac{2 \lambda}{N} w = \frac{2}{N} \sum^N_{n=1} y_n\\
w = \frac{1}{N + \lambda} \sum^N_{n=1} y_n\\
C = (w^*)^2 = (\frac{\sum^N_{n=1} y_n}{N + \lambda})^2
$$
**(3)[d]**
$$
\tilde{w}^T \Phi (x_n) = \tilde{w}^T V x_n\\
{\rm Let \ } \tilde{w}^T V = w^T \rightarrow \tilde{w}^T = w^T V^{-1}\\
\min\limits_{\tilde{w} \in \mathbb{R} ^ {d+1}} \frac{1}{N} \sum^N_{n=1} (\tilde{w}^T \Phi (x_n) - y_n)^2 + \frac{\lambda}{N} (\tilde{w}^T \tilde{w})\\
= \min\limits_{\tilde{w} \in \mathbb{R} ^ {d+1}} \frac{1}{N} \sum^N_{n=1} (\tilde{w}^T V x_n - y_n)^2 + \frac{\lambda}{N} (w^T V^{-1} (w^T V^{-1})^T)\\
= \min\limits_{\tilde{w} \in \mathbb{R} ^ {d+1}} \frac{1}{N} \sum^N_{n=1} (w^T x_n - y_n)^2 + \frac{\lambda}{N} (w^T V^{-1} (V^{-1})^T w)\\
U = V^{-1} (V^{-1})^T = V^{-1}(V^T)^{-1} = (V^T V)^{-1}\\
$$
**(4)[a]**
$$
\mathbb{E}[\frac{1}{N} \sum^N_{n=1} (w^T \Phi(x_n) - y_n)^2] = \mathbb{E}[\frac{1}{N} \sum^N_{n=1} (w^T x_n + w^T \epsilon - y_n)^2]\\
\mathbb{E}[\frac{1}{N} \sum^N_{n=1} ((w^T x_n - y_n)^2 + (w^T\epsilon)^2 + 2(w^T x_n - y_n)w^T\epsilon)]\\
= \frac{1}{N} \sum^N_{n=1} (w^T x_n - y_n)^2 + \mathbb{E}((w^T\epsilon)^2) + 2w^T\mathbb{E}(\epsilon)\frac{1}{N}\sum^N_{n=1}(w^Tx_n - y_n)\\
= \frac{1}{N} \sum^N_{n=1} (w^T x_n - y_n)^2 + \mathbb{E}(\epsilon^2) ||w||^2\\
\text{which is equivalent to }\frac{1}{N} \sum^N_{n=1} (w^T x_n - y_n)^2 + \frac{\lambda}{N} ||w||^2_2\\
\lambda = N\mathbb{E}(\epsilon^2) = N\sigma^2
$$
**(5)[d]**
$$
\text{To find the optimal solution of } \min\limits_{y \in \mathbb{R}} \frac{1}{N} \sum^N_{n=1} (y - y_n)^2 + \frac{\alpha k}{N} \Omega (y)\\
\text{Let} \frac{d}{dy} (\frac{1}{N} \sum^N_{n=1} (y - y_n)^2 + \frac{\alpha k}{N} \Omega (y)) = 0\\
\frac{1}{N} \sum^N_{n=1} 2(y-y_n) + \frac{\alpha k}{N} \Omega'(y) = 0\\
\frac{\alpha k}{N} \Omega'(y) = \frac{2}{N} \sum^N_{n=1} y_n - 2y\\
\text{we already know that the optimal solution is } y = \frac{(\sum^N_{n=1}y_n) + \alpha}{N + \alpha k}\\
\rightarrow \sum^N_{n=1} y_n = y(N + \alpha k) - \alpha\\
\frac{\alpha k}{N} \Omega'(y) = \frac{2(y(N + \alpha k) - \alpha)}{N} -2y\\
\Omega'(y) = \frac{2yN + 2y\alpha k - 2\alpha}{\alpha k} - \frac{2yN}{\alpha k}\\
= 2y - \frac{2}{k}\\
\Omega(y) = (y - \frac{1}{k})^2 + C
$$
**(6)[b]**
$$
\text{find the minimizer of } \hat{E}_{aug}(w) , \text{Let } \nabla \hat{E}_{aug}(w) = 0\\
\nabla (E_{in}(w^*) + \frac{1}{2}(w-w^*)^T H (w-w^*) + \frac{\lambda}{N}||w||^2)\\
=\nabla (E_{in}(w^*) + \frac{1}{2}(w^THw - w^THw^* - (w^*)^THw + (w^*)^THw^*) + \frac{\lambda}{N}||w||^2)\\
=\frac{1}{2}((H+H^T)w - Hw^* - ((w^*)^TH)^T) + \frac{2 \lambda}{N} w\\
=\frac{1}{2}(2Hw - 2Hw^*) + \frac{2 \lambda}{N} w\\
\rightarrow (H + \frac{2\lambda}{N}I)w = Hw^*, w = (H + \frac{2\lambda}{N}I)^{-1}Hw^*
$$
**(7)[a]**
$$
\text{There are only 2 classes in the data set, and each of them has } N \text{ examples.}\\
\text{Each time we leave one example out, its class has only } N-1 \text{ examples and becomes the minority,}\\
\text{so } \mathcal{A}_{minority} \text{ predicts exactly that class, and } e_n = 0 \text{ for every } n \Rightarrow E_{loocv} = 0
$$
**(8)[c]**
$$
\text{Let } p_1 = (2, 0), \ p_2 = (\rho, 2), \ p_3 = (-2, 0)\\
\ \\
\text{for the constant model :}\\
\mathcal{D}_1 = \{p_1, \ p_2\} \rightarrow h(x) = 1, \ e_1 = 1\\
\mathcal{D}_2 = \{p_2, \ p_3\} \rightarrow h(x) = 1, \ e_2 = 1\\
\mathcal{D}_3 = \{p_1, \ p_3\} \rightarrow h(x) = 0, \ e_3 = 2^2\\
E_{loocv} = \frac{1 + 1 + 4}{3} = 2\\
\text{for the linear model :}\\
$$

$$
\mathcal{D}_1 = \{p_1, \ p_2\} \rightarrow \ e_1 = (\frac{4}{\rho - 2} \times 2)^2\\
\mathcal{D}_2 = \{p_2, \ p_3\} \rightarrow \ e_2 = (\frac{4}{\rho + 2} \times 2)^2\\
\mathcal{D}_3 = \{p_1, \ p_3\} \rightarrow \ e_3 = 2^2\\
\Rightarrow \frac{64}{(\rho + 2)^2} + \frac{64}{(\rho - 2)^2} + 4 = 6\\
\rho = \pm 2 \sqrt{9 + 4 \sqrt{6}} \approx 8.67
$$
**(9)[b]**
$$
\mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} (y_n - \bar{y})^2)\\
=\mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} (y_n^2 - 2y_n\bar{y} + \bar{y}^2))\\
=\mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} y_n^2) - \mathbb{E}(\frac{2}{K} \sum^N_{n = N - K + 1} y_n\bar{y}) + \mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} \bar{y}^2)\\
\mathbb{E}(y_a y_b) = \mathbb{E}(y_a)\times\mathbb{E}(y_b) \text{ if } y_a \text{ and } y_b \text{ are independent}\\
\begin{align}\\
\mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} y_n^2) &= \sigma^2\\
\bar{y} &= \frac{y_1 + y_2 + ... + y_{N-K}}{N - K}\\
\mathbb{E}(\frac{2}{K} \sum^N_{n = N - K + 1} y_n\bar{y}) &= \mathbb{E}(\frac{2}{K(N-K)} \sum^N_{n = N - K + 1} y_n(y_1 + y_2 + ... + y_{N-K}))\\
&= \frac{2}{K(N-K)} \sum^N_{n = N - K + 1} \sum^{N-K}_{m = 1}\mathbb{E}(y_ny_m)\\
&= 0\\
\mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} \bar{y}^2) &= \mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} \frac{y_1^2 + y_2^2 + ... + y_{N-K}^2+2(y_1y_2 + y_1y_3 + ... + y_{N-K-1}y_{N-K})}{(N-K)^2})\\
&=\frac{1}{K(N-K)^2} \sum^N_{n = N - K + 1} \mathbb{E}(y_1^2 + y_2^2 + ... + y_{N-K}^2+2(y_1y_2 + y_1y_3 + ... + y_{N-K-1}y_{N-K}))\\
&= \frac{1}{K(N-K)^2} \sum^N_{n=N-K+1}(N-K)\sigma^2\\
&= \frac{\sigma ^2}{N - K}\\
\end{align}\\
\mathbb{E}(\frac{1}{K} \sum^N_{n = N - K + 1} (y_n - \bar{y})^2) = \sigma^2+\frac{\sigma^2}{N-K}\\
$$
**(10)[a]**
$$
\text{There are 16 combinations:}\\
\times\times\times\times\\
\circ\times\times\times\\
\times\circ\times\times\\
\times\times\circ\times\\
\times\times\times\circ\\
\circ\circ\times\times\\
\circ\times\circ\times\\
\circ\times\times\circ\\
\times\circ\circ\times\\
\times\circ\times\circ\\
\times\times\circ\circ\\
\times\circ\circ\circ\\
\circ\times\circ\circ\\
\circ\circ\times\circ\\
\circ\circ\circ\times\\
\circ\circ\circ\circ\\
\text{and only two combinations have } E_{in} \neq 0 :\\
\times\circ\times\circ\\
\circ\times\circ\times\\
\text{and their } E_{in} = \frac{1}{4}\\
\mathbb{E}(E_{in}) = \frac{1}{16} \times 2 \times \frac{1}{4}\\
= \frac{1}{32}
$$
**(11)[b]**
$$
\text{if } P(y=+1)=p \rightarrow E_{out}(g) = p \epsilon_+ + (1 - p) \epsilon_-\\
\text{Let } E_{out}(g) = E_{out}(g_-) = p = p \epsilon_+ + (1 - p) \epsilon_-\\
p(1-\epsilon_++\epsilon_-) = \epsilon_-\\
p = \frac{\epsilon_-}{\epsilon_- - \epsilon_+ + 1}
$$
### programming
use python to transform the data.
```python=
from itertools import combinations_with_replacement
import sys

def transform_line(line):
    """Transform one raw data line into LIBSVM format with a 3rd-order polynomial expansion.

    The input line holds 6 feature values followed by the label (7 space-separated
    numbers). The output is "<label> 1:1 2:x1 ... 7:x6 8:... " — a bias term,
    the 6 linear features, all degree-2 products (21) and all degree-3 products (56).

    Returns the transformed line as a string (without trailing newline).
    """
    d = list(map(float, line.split()))
    x = d[0:6]
    y = d[6]
    # LIBSVM label: original data encodes the positive class as 1
    fields = ["+1" if y == 1 else "-1", "1:1"]
    count = 2
    for v in x:
        fields.append(f"{count}:{v}")
        count += 1
    for pair in combinations_with_replacement(x, 2):
        fields.append(f"{count}:{pair[0] * pair[1]}")
        count += 1
    for triple in combinations_with_replacement(x, 3):
        fields.append(f"{count}:{triple[0] * triple[1] * triple[2]}")
        count += 1
    return ' '.join(fields)

def main():
    """Read raw data lines from stdin and print the transformed lines."""
    for line in sys.stdin.read().split('\n'):
        # Skip blank lines: the original crashed with ValueError on the
        # trailing empty string produced by split('\n') on a final newline.
        if not line.strip():
            continue
        # Keep the original output format: every field followed by a space.
        print(transform_line(line) + ' ')

if __name__ == '__main__':
    main()
```
$$
\text{The formula minimized by the package is} : \min\limits_w \frac{w^Tw}{2} + C \sum \log(1 + \exp(-y_i w^T x_i))\\
\text{Rewriting it in the form used in class} : \min\limits_w CN(\frac{w^Tw}{2CN} + \frac{1}{N} \sum \log(1 + \exp(-y_i w^T x_i)))\\
\lambda = \frac{1}{2C},\quad C = \frac{1}{2\lambda}
$$
**(12)[d]**
a.
```
./train -s 0 -e 0.000001 -c 5000 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 87.375% (699/800)
```
b.
```
./train -s 0 -e 0.000001 -c 50 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 87.75% (702/800)
```
c.
```
./train -s 0 -e 0.000001 -c 0.5 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 92% (736/800)
```
d.
```
./train -s 0 -e 0.000001 -c 0.005 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 92.75% (742/800)
```
e.
```
./train -s 0 -e 0.000001 -c 0.00005 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 83.5% (668/800)
```
**(13)[b]**
a.
```
./train -s 0 -e 0.000001 -c 5000 data.txt
./predict data.txt data.txt.model output_file.txt
```
```
Accuracy = 100% (200/200)
```
b.
```
./train -s 0 -e 0.000001 -c 50 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 100% (200/200)
```
c.
```
./train -s 0 -e 0.000001 -c 0.5 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 97.5% (195/200)
```
d.
```
./train -s 0 -e 0.000001 -c 0.005 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 95% (190/200)
```
e.
```
./train -s 0 -e 0.000001 -c 0.00005 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 82% (164/200)
```
**(14)[c]**
split the data
```
head -n 120 < data.txt > traindata.txt
tail -n 80 < data.txt > valdata.txt
```
a.
```
/train -s 0 -e 0.000001 -c 5000 traindata.txt
./predict valdata.txt traindata.txt.model output_file.txt
```
```
Accuracy = 77.5% (62/80)
```
b.
```
/train -s 0 -e 0.000001 -c 50 traindata.txt
./predict valdata.txt traindata.txt.model output_file.txt
```
```
Accuracy = 80% (64/80)
```
c.
```
/train -s 0 -e 0.000001 -c 0.5 traindata.txt
./predict valdata.txt traindata.txt.model output_file.txt
```
```
Accuracy = 88.75% (71/80)
```
d.
```
/train -s 0 -e 0.000001 -c 0.005 traindata.txt
./predict valdata.txt traindata.txt.model output_file.txt
```
```
Accuracy = 93.75% (75/80)
```
e.
```
/train -s 0 -e 0.000001 -c 0.00005 traindata.txt
./predict valdata.txt traindata.txt.model output_file.txt
```
```
Accuracy = 85% (68/80)
```
use the best one([d]) to test $E_{out}$
```
./train -s 0 -e 0.000001 -c 0.005 traindata.txt
```
```
./predict testdata.txt traindata.txt.model output_file.txt
```
```
Accuracy = 92.5% (740/800)
```
**(15)[d]**
```
./train -s 0 -e 0.000001 -c 0.005 data.txt
./predict testdata.txt data.txt.model output_file.txt
```
```
Accuracy = 92.75% (742/800)
```
**(16)[b]**
use shell script to do validation.
```
#!/bin/bash
# 5-fold cross-validation over a grid of C values for LIBLINEAR logistic
# regression (-s 0). For each C it prints the CV error in percent
# (100 - mean validation accuracy over the 5 folds).
#
# Fixes vs. original:
#  * "#/bin/bash" was a plain comment, not a shebang (missing '!').
#  * bash array elements are space-separated; the comma-separated form
#    produced elements like "5000," which are not valid numbers for -c.
#  * the fd opened with exec {uid}<> was never closed (one leak per fold).
c=(5000 50 0.5 0.005 0.00005)
for (( k=0; k<5; k=k+1 )); do
    sum=0
    for (( i=0; i<5; i=i+1 )); do
        # Fold i: lines i*40 .. i*40+39 of data.txt are validation,
        # the remaining 160 lines are training.
        touch tmptrain$i
        touch tmptest$i
        exec {uid}< data.txt
        for (( j=0; j<200; j++)); do
            read -u $uid Line
            if [[ $j -ge $(($i * 40)) ]] && [[ $j -lt $(($i * 40 + 40)) ]]; then
                echo $Line >> tmptest$i
            else
                echo $Line >> tmptrain$i
            fi
        done
        exec {uid}<&-   # close the data.txt descriptor
        ./train -s 0 -e 0.000001 -c ${c[$k]} tmptrain$i >/dev/null
        # Extract the numeric accuracy from "Accuracy = NN.N% (x/y)".
        a=$(./predict tmptest$i tmptrain$i.model output_file.txt | \
            sed "s/Accuracy = //g" | sed "s/%.*//g")
        sum=$(echo "$sum + $a" | bc -l)
    done
    echo "100 - $sum / 5" | bc -l
    rm -f tmp*
done
```
```
19.00000000000000000000
17.00000000000000000000
11.50000000000000000000
8.00000000000000000000
21.00000000000000000000
```