# Regression
## 1\. Setup: Loading Packages and Data
```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
# --- Statsmodels ---
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence
# --- ISLP ---
from ISLP.models import ModelSpec
from ISLP import confusion_table
# --- Scikit-learn ---
from sklearn.linear_model import (
LinearRegression, Lasso, LassoCV, Ridge, RidgeCV,
LogisticRegression
)
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# --- Scipy ---
from scipy.stats import linregress
# --- Metrics ---
from sklearn.metrics import (
precision_recall_fscore_support,
roc_curve,
accuracy_score,
auc
)
# ---------------------------------------------
# 1. Regression data (house)
# ---------------------------------------------
data_house = {
'PropertyType': ['Single Family', 'Townhouse', 'Multiplex', 'Single Family', 'Townhouse', 'Single Family', 'Multiplex',
'Single Family', 'Townhouse', 'Multiplex', 'Single Family', 'Townhouse', 'Single Family', 'Multiplex'], # n=14
'SqFtTotLiving': [1500, 1200, 2000, 1800, 1300, 2500, 2100, 1600, 1300, 2100, 1900, 1400, 2600, 2200],
'SqFtLot': [5000, 2000, 6000, 7000, 2200, 8000, 6100, 5200, 2100, 6200, 7100, 2300, 8100, 6300],
'Bathrooms': [2.5, 2.0, 3.0, 2.5, 2.5, 3.5, 3.0, 2.5, 2.0, 3.0, 2.5, 2.5, 3.5, 3.0],
'Bedrooms': [3, 2, 4, 4, 3, 5, 4, 3, 2, 4, 4, 3, 5, 4],
'BldgGrade': [7, 8, 7, 9, 8, 10, 8, 7, 8, 7, 9, 8, 10, 8],
'ZipCode': [98105, 98103, 98105, 98115, 98103, 98105, 98115, 98105, 98103, 98105, 98115, 98103, 98105, 98115],
'AdjSalePrice': [500000, 400000, 600000, 700000, 450000, 900000, 620000, 510000, 410000, 610000, 710000, 460000, 910000, 630000],
'YrBuilt': [1990, 2005, 1980, 2000, 2008, 2015, 1985, 1991, 2006, 1981, 2001, 2009, 2016, 1986],
'NbrLivingUnits': [1, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 3],
'SqFtFinBasement': [0, 0, 500, 200, 0, 800, 550, 0, 0, 500, 200, 0, 800, 550],
'YrRenovated': [0, 0, 0, 2010, 0, 0, 2018, 0, 0, 0, 2010, 0, 0, 2018],
'NewConstruction': [False, True, False, False, True, True, False, False, True, False, False, True, True, False]
}
house = pd.DataFrame(data_house)
house['PropertyType'] = house['PropertyType'].astype('category')
house['ZipCode'] = house['ZipCode'].astype('category')
outcome_reg = 'AdjSalePrice'  # outcome for regression
# ---------------------------------------------
# 2. Classification data (loan_data, loan3000)
# ---------------------------------------------
np.random.seed(42)  # assumed seed, added so the synthetic data below is reproducible
N_SAMPLES = 500
borrower_score = np.random.uniform(0.3, 0.75, N_SAMPLES)
payment_inc_ratio = np.random.uniform(1.0, 25.0, N_SAMPLES)
purpose_cats = ['debt_consolidation', 'credit_card', 'home_improvement', 'other']
home_cats = ['RENT', 'MORTGAGE', 'OWN']
emp_len_cats = ['< 1 Year', '1-3 Years', '4-6 Years', '7+ Years']
purpose_ = np.random.choice(purpose_cats, N_SAMPLES)
home_ = np.random.choice(home_cats, N_SAMPLES)
emp_len_ = np.random.choice(emp_len_cats, N_SAMPLES)
true_logit = (
-2.0 + 5.0 * (purpose_ == 'debt_consolidation') + 3.0 * (purpose_ == 'credit_card')
- 1.5 * (home_ == 'MORTGAGE') - 2.0 * (home_ == 'OWN')
+ 0.1 * payment_inc_ratio - 8.0 * borrower_score
+ np.random.normal(0, 0.5, N_SAMPLES)
)
prob_default = 1 / (1 + np.exp(-true_logit))
y_numbers_cls = np.random.binomial(1, prob_default, size=N_SAMPLES)
outcome_cls = ['default' if y == 1 else 'paid off' for y in y_numbers_cls]
loan_data = pd.DataFrame({
'outcome': outcome_cls, 'purpose_': purpose_, 'home_': home_,
'emp_len_': emp_len_, 'borrower_score': borrower_score,
'payment_inc_ratio': payment_inc_ratio,
})
loan_data.outcome = loan_data.outcome.astype('category')
loan_data.purpose_ = loan_data.purpose_.astype('category')
loan_data.home_ = loan_data.home_.astype('category')
loan_data.emp_len_ = loan_data.emp_len_.astype('category')
outcome_cls_name = 'outcome'  # outcome for classification
# loan3000: a small synthetic sample (n=100) used for the LDA example
data_3000 = {
'outcome': np.random.choice(['paid off', 'default'], 100, p=[0.7, 0.3]),
'borrower_score': np.random.uniform(0.15, 0.8, 100),
'payment_inc_ratio': np.random.uniform(0.5, 20.0, 100),
}
loan3000 = pd.DataFrame(data_3000)
loan3000.loc[0, 'outcome'] = 'paid off'  # ensure both classes appear
loan3000.loc[1, 'outcome'] = 'default'
loan3000.outcome = loan3000.outcome.astype('category')
print("--- 所有套件與資料準備完成 ---")
```
## 2\. Linear Regression Basics (Fitting & Prediction)
```python
basic_predictors = ['SqFtTotLiving', 'Bathrooms']
X_basic_df = house[basic_predictors]
y_basic = house[outcome_reg]
# --- A. sklearn ---
model_sklearn = LinearRegression()
model_sklearn.fit(X_basic_df, y_basic)
print(f"--- scikit-learn ---")
print(f"截距 (Intercept): {model_sklearn.intercept_:.3f}")
for name, coef in zip(basic_predictors, model_sklearn.coef_):
    print(f"  {name}: {coef:.3f}")
# --- B. statsmodels ---
print(f"\n--- statsmodels + ModelSpec ---")
design_basic = ModelSpec(basic_predictors)
X_design_basic = design_basic.fit_transform(X_basic_df)
print("由 ModelSpec 產生的設計矩陣 (前 5 筆):")
print(X_design_basic.head())
model_sm = sm.OLS(y_basic, X_design_basic)
results_sm = model_sm.fit()
print("\n模型摘要 (Summary):")
print(results_sm.summary())
```
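The section title also promises prediction, so here is a minimal sketch of scoring a new record with both fitted models. The `new_home` record is hypothetical, and passing new data through `design_basic.transform` is an assumption about reusing the same ISLP `ModelSpec` encoding:

```python
# Hypothetical new home, for illustration only
new_home = pd.DataFrame({'SqFtTotLiving': [2000], 'Bathrooms': [3.0]})

# sklearn predicts directly from the raw feature columns
print(f"sklearn prediction: {model_sklearn.predict(new_home)[0]:,.0f}")

# statsmodels needs the new data passed through the same design transform
X_new = design_basic.transform(new_home)
print(f"statsmodels prediction: {results_sm.predict(X_new)[0]:,.0f}")
```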
## 3\. Factor Variables in Regression
### Dummy-Variable Representation
#### A. The `sklearn` approach (manual `pd.get_dummies`)
```python
predictors_factor_list = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms',
'BldgGrade', 'PropertyType']
X_sklearn_factor = pd.get_dummies(house[predictors_factor_list], drop_first=True, dtype=int)
y = house[outcome_reg]
house_lm_factor = LinearRegression()
house_lm_factor.fit(X_sklearn_factor, y)
print('\n--- sklearn model coefficients ---')
print(f'Intercept: {house_lm_factor.intercept_:.3f}')
for name, coef in zip(X_sklearn_factor.columns, house_lm_factor.coef_):
    print(f'  {name}: {coef:.3f}')
```
#### B. The `statsmodels` + `ModelSpec` approach (automatic handling)
```python
# Remember to convert the relevant DataFrame columns to categorical first with astype('category')
design_factor = ModelSpec(predictors_factor_list)
X_sm_factor = design_factor.fit_transform(house)
print("\n--- ModelSpec 自動產生的設計矩陣 (含 dummies) ---")
print(X_sm_factor.head())
model_sm_factor = sm.OLS(y, X_sm_factor)
results_sm_factor = model_sm_factor.fit()
print(results_sm_factor.summary())
```
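Both encodings drop one level per factor: that level becomes the reference absorbed into the intercept, and each remaining dummy coefficient measures the expected difference in `AdjSalePrice` relative to it. A quick way to see which level that is, assuming the default alphabetical category ordering and drop-first (treatment) coding:

```python
# The first category level is the reference absorbed into the intercept
ref_level = house['PropertyType'].cat.categories[0]
print(f"Reference level for PropertyType: {ref_level}")
# The PropertyType dummy coefficients above are price differences
# relative to this reference, holding the other predictors fixed.
```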
## 4\. Regularization ⚖️
### 1\. Data Prep: Feature Engineering and Standardization
```python
predictors_reg = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms',
'BldgGrade', 'PropertyType', 'NbrLivingUnits',
'SqFtFinBasement', 'YrBuilt', 'YrRenovated',
'NewConstruction']
X_raw = pd.get_dummies(house[predictors_reg], drop_first=True)
X_raw['NewConstruction'] = X_raw['NewConstruction'].astype(int)
columns = X_raw.columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)
y = house[outcome_reg]
print("\n--- 正規化資料準備完成 ---")
```
### 2\. Lasso (L1) Regularization 📉
```python
alpha_values = [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
lasso_results = []
for alpha in alpha_values:
    model = Lasso(alpha=alpha)
    model.fit(X_scaled, y)
    lasso_results.append(model.coef_)
model_lasso_cv = LassoCV(cv=5, alphas=alpha_values, max_iter=10000)
model_lasso_cv.fit(X_scaled, y)
print(f"\n--- Lasso (L1) 結果 ---")
print(f"LassoCV 找到的最佳 alpha: {model_lasso_cv.alpha_:.4f}")
ax = pd.DataFrame(lasso_results, index=alpha_values, columns=columns).plot(
logx=True, title='Lasso Coefficient Paths', legend=False, figsize=(10, 6)
)
ax.axvline(model_lasso_cv.alpha_, linestyle='--', color='k', label=f'Optimal alpha (CV = {model_lasso_cv.alpha_:.2f})')
ax.set_xlabel('Alpha (log scale)'); ax.set_ylabel('Coefficient Weight')
ax.legend(); plt.show()
```
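Because the L1 penalty can drive coefficients exactly to zero, Lasso doubles as variable selection. `LassoCV` refits on the full data at the chosen alpha, so its `coef_` attribute shows directly which predictors survived:

```python
# Coefficients at the CV-chosen alpha: zeros mean the predictor was dropped
lasso_coefs = pd.Series(model_lasso_cv.coef_, index=columns)
print("Surviving (non-zero) coefficients:")
print(lasso_coefs[lasso_coefs != 0].sort_values())
print(f"\nDropped predictors: {list(lasso_coefs[lasso_coefs == 0].index)}")
```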
### 3\. Ridge (L2) Regularization 📉
```python
alpha_values = [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
ridge_results = []
for alpha in alpha_values:
    model = Ridge(alpha=alpha)
    model.fit(X_scaled, y)
    ridge_results.append(model.coef_)
model_ridge_cv = RidgeCV(cv=5, alphas=alpha_values)
model_ridge_cv.fit(X_scaled, y)
print(f"\n--- Ridge (L2) 結果 ---")
print(f"RidgeCV 找到的最佳 alpha: {model_ridge_cv.alpha_:.4f}")
ax = pd.DataFrame(ridge_results, index=alpha_values, columns=columns).plot(
logx=True, title='Ridge Coefficient Paths', legend=False, figsize=(10, 6)
)
ax.axvline(model_ridge_cv.alpha_, linestyle='--', color='k', label=f'Optimal alpha (CV = {model_ridge_cv.alpha_:.2f})')
ax.set_xlabel('Alpha (log scale)'); ax.set_ylabel('Coefficient Weight')
ax.legend(); plt.show()
```
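A useful contrast with Lasso: the L2 penalty shrinks coefficients toward zero but essentially never sets them exactly to zero. A quick side-by-side of the two CV-refit coefficient vectors makes the difference visible:

```python
# Ridge shrinks; Lasso shrinks *and* selects
comparison = pd.DataFrame({
    'ridge': model_ridge_cv.coef_,
    'lasso': model_lasso_cv.coef_,
}, index=columns)
print(comparison.round(1))
print(f"\nExactly-zero coefficients -> ridge: {(comparison['ridge'] == 0).sum()}, "
      f"lasso: {(comparison['lasso'] == 0).sum()}")
```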
-----
# Classification
## 1\. Setup
(All data was already loaded in "Regression" Section 1; nothing further is needed here.)
## 2\. Naive Bayes 🤖
### The Gaussian Solution
```python
# --- Data prep ---
predictors_gnb = ['borrower_score', 'payment_inc_ratio']
X_gnb = loan_data[predictors_gnb]
y_gnb = loan_data[outcome_cls_name]
# --- Fit the model ---
naive_model = GaussianNB()
naive_model.fit(X_gnb, y_gnb)
# --- Make a prediction ---
new_loan = pd.DataFrame([
{'borrower_score': 0.6, 'payment_inc_ratio': 10.0}
])
print(f"範例資料:")
print(new_loan)
print(f"\n預測類別: {naive_model.predict(new_loan)[0]}")
probabilities = pd.DataFrame(naive_model.predict_proba(new_loan),
columns=naive_model.classes_)
print("\n預測機率:")
print(probabilities)
```
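Under the hood, `GaussianNB` models each predictor as a per-class Gaussian; the fitted class-conditional means and variances are exposed as `theta_` and `var_` (the `var_` name assumes scikit-learn >= 1.0; older releases called it `sigma_`):

```python
# Class-conditional means: one row per class, one column per feature
print("Per-class means (theta_):")
print(pd.DataFrame(naive_model.theta_, index=naive_model.classes_,
                   columns=predictors_gnb))
# Class-conditional variances
print("\nPer-class variances (var_):")
print(pd.DataFrame(naive_model.var_, index=naive_model.classes_,
                   columns=predictors_gnb))
```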
## 3\. Discriminant Analysis 📈
### A Simple Example
```python
# --- Data prep ---
predictors_lda = ['borrower_score', 'payment_inc_ratio']
X_lda = loan3000[predictors_lda]
y_lda = loan3000[outcome_cls_name]
# --- Fit the LDA model ---
loan_lda = LinearDiscriminantAnalysis()
loan_lda.fit(X_lda, y_lda)
print("\n--- LDA 結果 ---")
print("LDA 判別函數係數 (scalings_):")
print(pd.DataFrame(loan_lda.scalings_, index=X_lda.columns))
pred_lda = pd.DataFrame(loan_lda.predict_proba(loan3000[predictors_lda]),
columns=loan_lda.classes_)
print("\nLDA 預測機率 (前 5 筆):")
print(pred_lda.head())
# --- Plot the decision boundary ---
center = np.mean(loan_lda.means_, axis=0)
slope = - loan_lda.scalings_[0] / loan_lda.scalings_[1]
intercept = center[1] - center[0] * slope
x_0 = (0 - intercept) / slope
x_20 = (20 - intercept) / slope
lda_df = pd.concat([loan3000, pred_lda['default']], axis=1)
fig, ax = plt.subplots(figsize=(4, 4))
sns.scatterplot(x='borrower_score', y='payment_inc_ratio',
hue='default', data=lda_df,
palette=sns.diverging_palette(240, 10, n=9, as_cmap=True),
ax=ax, legend=False)
ax.set_ylim(0, 20); ax.set_xlim(0.15, 0.8)
ax.plot((x_0, x_20), (0, 20), linewidth=3, color='black')
ax.plot(*loan_lda.means_.transpose(), '--', color='grey')
plt.tight_layout(); plt.show()
```
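The fitted LDA model scores new records the same way as the other classifiers; a minimal sketch with a hypothetical applicant:

```python
# Hypothetical applicant, for illustration only
new_applicant = pd.DataFrame([{'borrower_score': 0.50, 'payment_inc_ratio': 9.0}])
print(f"LDA predicted class: {loan_lda.predict(new_applicant)[0]}")
print(pd.DataFrame(loan_lda.predict_proba(new_applicant),
                   columns=loan_lda.classes_))
```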
## 4\. Logistic Regression 📊
### The Logit Function and Odds
```python
p = np.arange(0.01, 1, 0.01)
df = pd.DataFrame({
'p': p,
'logit': np.log(p / (1 - p)),
'odds': p / (1 - p),
})
fig, ax = plt.subplots(figsize=(3, 3))
ax.axhline(0, color='grey', linestyle='--'); ax.axvline(0.5, color='grey', linestyle='--')
ax.plot(df['p'], df['logit'])
ax.set_xlabel('Probability (p)'); ax.set_ylabel('logit(p)')
plt.tight_layout(); plt.show()
```
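The logistic (inverse-logit) function goes the other way, mapping any log-odds value back to a probability in (0, 1); this is exactly the transform logistic regression applies to its linear predictor:

```python
def inv_logit(x):
    """Inverse logit: map log-odds back to a probability."""
    return 1 / (1 + np.exp(-x))

for log_odds in [-2, 0, 2]:
    print(f"log-odds {log_odds:+d} -> p = {inv_logit(log_odds):.3f}")
```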
### Logistic Regression and the GLM
#### 4a. Using `scikit-learn` (with `pd.get_dummies`)
```python
# --- Data prep (manual dummies) ---
predictors_lr = ['payment_inc_ratio', 'purpose_', 'home_', 'emp_len_',
'borrower_score']
X_lr = pd.get_dummies(loan_data[predictors_lr], prefix='', prefix_sep='',
drop_first=True, dtype=int)
# 1 = 'default', 0 = 'paid off'
y_numbers_lr = [1 if yi == 'default' else 0 for yi in loan_data[outcome_cls_name]]
# Keep the original categorical y for later evaluation
y_categorical_lr = loan_data[outcome_cls_name]
# --- Fit the sklearn model ---
# A huge C effectively disables the L2 penalty, approximating unregularized logistic regression
logit_reg = LogisticRegression(penalty='l2', C=1e42, solver='liblinear')
logit_reg.fit(X_lr, y_numbers_lr)
print('\n--- sklearn.LogisticRegression results (predicting y=1) ---')
print(f"Intercept: {logit_reg.intercept_[0]:.4f}")
print(f"Classes: {logit_reg.classes_}")  # [0 1]
print("\nCoefficients:")
print(pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X_lr.columns).head())
```
#### 4b. Using `statsmodels` (with `ModelSpec`)
```python
# --- Data prep ---
design = ModelSpec(predictors_lr)
X_sm = design.fit_transform(loan_data)
# y_numbers_lr comes from 4a
print("\n--- statsmodels (automatic dummies via ModelSpec) ---")
# print(X_sm.head())  # disabled for brevity
# --- Fit the model ---
logit_reg_sm = sm.GLM(y_numbers_lr, X_sm,
family=sm.families.Binomial())
logit_result = logit_reg_sm.fit()
print(logit_result.summary())
# Note: the statsmodels coefficients are the log-odds of "default" (y=1).
# Since the sklearn model in 4a is also set up to predict y=1, the two sets
# of coefficients (e.g. borrower_score) should match in sign and be close in magnitude.
```
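Because the coefficients are on the log-odds scale, exponentiating them gives odds ratios: the multiplicative change in the odds of default per one-unit increase in a predictor (for dummies, relative to the reference level):

```python
# Odds ratios from the fitted GLM coefficients
odds_ratios = np.exp(logit_result.params)
print(odds_ratios)
# e.g. an odds ratio of 1.1 for payment_inc_ratio would mean each extra
# unit multiplies the odds of default by about 1.1
```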
## 5\. Evaluating Classification Models 🎯
### Confusion Matrix
```python
# 1. Get the 0/1 predictions
pred_numbers = logit_reg.predict(X_lr) # 使用 4a 的 X_lr
# 2. Map the 0/1 predictions back to the original class labels
pred_categorical = ['default' if p == 1 else 'paid off' for p in pred_numbers]
# 3. Get y with the original class labels
y_categorical = loan_data[outcome_cls_name]
# --- Using ISLP.confusion_table ---
conf_mat_islp = confusion_table(pred_categorical, y_categorical)
print("--- ISLP.confusion_table ---")
print(conf_mat_islp)
```
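As a cross-check, scikit-learn's `confusion_matrix` gives the same counts; note its convention puts true classes in rows and predictions in columns, so the table may be the transpose of the one above:

```python
from sklearn.metrics import confusion_matrix

labels = ['default', 'paid off']  # fix the label order explicitly
cm = confusion_matrix(y_categorical, pred_categorical, labels=labels)
print(pd.DataFrame(cm,
                   index=[f'true {l}' for l in labels],
                   columns=[f'pred {l}' for l in labels]))
```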
### Accuracy, Precision, and Recall
```python
# 1. Compute accuracy with scikit-learn
print("--- sklearn.accuracy_score ---")
print(f"Accuracy: {accuracy_score(y_categorical, pred_categorical):.4f}")
# 2. Compute precision, recall, and F-score with scikit-learn
# average='binary' tells the function to report the metrics for
# pos_label ('default') only, returning each as a single float.
precision, recall, fscore, support = precision_recall_fscore_support(
    y_categorical, pred_categorical,
    pos_label='default',
    average='binary'  # compare with 'weighted' and the other averaging options
)
print(f"\n--- sklearn.precision_recall_fscore_support (for 'default') ---")
# precision, recall, and fscore are now single numbers, so :.4f formatting works
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F-Score: {fscore:.4f}")
```
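To see the per-class metrics and all the averaging options at once, scikit-learn's `classification_report` is a convenient one-liner:

```python
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 plus macro and weighted averages
print(classification_report(y_categorical, pred_categorical))
```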
### ROC Curve
```python
# Predicted probability of y=1 ('default')
y_prob = logit_reg.predict_proba(X_lr)[:, 1]
# Compute the ROC curve
fpr, tpr, thresholds = roc_curve(y_categorical, y_prob, pos_label='default')
roc_df = pd.DataFrame({'recall': tpr, 'specificity': 1 - fpr})
ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1); ax.set_xlim(1, 0)
ax.plot((1, 0), (0, 1), linestyle='--', color='grey')
ax.set_xlabel('Specificity'); ax.set_ylabel('Recall (TPR)')
ax.set_title('ROC Curve')
plt.tight_layout()
plt.show()
```
### AUC (Area Under the Curve)
```python
# --- Using sklearn.metrics.auc ---
auc_score = auc(fpr, tpr)
print(f"ROC AUC Score (calculated from fpr, tpr): {auc_score:.4f}")
# Plot the ROC curve with the area under it shaded
ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1); ax.set_xlim(1, 0)
ax.set_xlabel('Specificity'); ax.set_ylabel('Recall')
ax.fill_between(roc_df.specificity, 0, roc_df.recall, alpha=0.3)
ax.set_title(f'ROC Curve (AUC = {auc_score:.4f})')
plt.tight_layout()
plt.show()
```
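As a sanity check, `sklearn.metrics.roc_auc_score` computes the same quantity directly from labels and scores; using the 0/1-coded `y_numbers_lr` from 4a keeps 'default' (=1) unambiguously the positive class:

```python
from sklearn.metrics import roc_auc_score

# Should agree with auc(fpr, tpr) above
print(f"roc_auc_score: {roc_auc_score(y_numbers_lr, y_prob):.4f}")
```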