# Regression

## 1. Setup: Load Packages and Data

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# --- Statsmodels ---
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import OLSInfluence

# --- ISLP ---
from ISLP.models import ModelSpec
from ISLP import confusion_table

# --- Scikit-learn ---
from sklearn.linear_model import (
    LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, LogisticRegression
)
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# --- Scipy ---
from scipy.stats import linregress

# --- Metrics ---
from sklearn.metrics import (
    precision_recall_fscore_support, roc_curve, accuracy_score, auc
)

# ---------------------------------------------
# 1. Regression data (house)
# ---------------------------------------------
data_house = {
    'PropertyType': ['Single Family', 'Townhouse', 'Multiplex', 'Single Family',
                     'Townhouse', 'Single Family', 'Multiplex', 'Single Family',
                     'Townhouse', 'Multiplex', 'Single Family', 'Townhouse',
                     'Single Family', 'Multiplex'],  # n=14
    'SqFtTotLiving': [1500, 1200, 2000, 1800, 1300, 2500, 2100,
                      1600, 1300, 2100, 1900, 1400, 2600, 2200],
    'SqFtLot': [5000, 2000, 6000, 7000, 2200, 8000, 6100,
                5200, 2100, 6200, 7100, 2300, 8100, 6300],
    'Bathrooms': [2.5, 2.0, 3.0, 2.5, 2.5, 3.5, 3.0,
                  2.5, 2.0, 3.0, 2.5, 2.5, 3.5, 3.0],
    'Bedrooms': [3, 2, 4, 4, 3, 5, 4, 3, 2, 4, 4, 3, 5, 4],
    'BldgGrade': [7, 8, 7, 9, 8, 10, 8, 7, 8, 7, 9, 8, 10, 8],
    'ZipCode': [98105, 98103, 98105, 98115, 98103, 98105, 98115,
                98105, 98103, 98105, 98115, 98103, 98105, 98115],
    'AdjSalePrice': [500000, 400000, 600000, 700000, 450000, 900000, 620000,
                     510000, 410000, 610000, 710000, 460000, 910000, 630000],
    'YrBuilt': [1990, 2005, 1980, 2000, 2008, 2015, 1985,
                1991, 2006, 1981, 2001, 2009, 2016, 1986],
    'NbrLivingUnits': [1, 1, 3, 1, 1, 1, 3, 1, 1, 3, 1, 1, 1, 3],
    'SqFtFinBasement': [0, 0, 500, 200, 0, 800, 550, 0, 0, 500, 200, 0, 800, 550],
    'YrRenovated': [0, 0, 0, 2010, 0, 0, 2018, 0, 0, 0, 2010, 0, 0, 2018],
    'NewConstruction': [False, True, False, False, True, True, False,
                        False, True, False, False, True, True, False]
}
house = pd.DataFrame(data_house)
house['PropertyType'] = house['PropertyType'].astype('category')
house['ZipCode'] = house['ZipCode'].astype('category')
outcome_reg = 'AdjSalePrice'  # outcome for regression
```
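As a quick check on the house data, the otherwise-unused `scipy.stats.linregress` import above can fit a one-predictor baseline. A minimal sketch (assuming only the `house` DataFrame built above) regressing `AdjSalePrice` on `SqFtTotLiving`:

```python
# Simple one-predictor baseline: price vs. living area.
# linregress returns slope, intercept, correlation, p-value, and std. error.
result = linregress(house['SqFtTotLiving'], house['AdjSalePrice'])
print(f"slope:     {result.slope:.3f}")
print(f"intercept: {result.intercept:.3f}")
print(f"R-squared: {result.rvalue ** 2:.4f}")
```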
```python
# ---------------------------------------------
# 2. Classification data (loan_data, loan3000)
# ---------------------------------------------
np.random.seed(1)  # fix the seed so the simulated loans are reproducible
N_SAMPLES = 500
borrower_score = np.random.uniform(0.3, 0.75, N_SAMPLES)
payment_inc_ratio = np.random.uniform(1.0, 25.0, N_SAMPLES)

purpose_cats = ['debt_consolidation', 'credit_card', 'home_improvement', 'other']
home_cats = ['RENT', 'MORTGAGE', 'OWN']
emp_len_cats = ['< 1 Year', '1-3 Years', '4-6 Years', '7+ Years']
purpose_ = np.random.choice(purpose_cats, N_SAMPLES)
home_ = np.random.choice(home_cats, N_SAMPLES)
emp_len_ = np.random.choice(emp_len_cats, N_SAMPLES)

# Simulate default probabilities from a known logit, then draw 0/1 outcomes.
true_logit = (
    -2.0
    + 5.0 * (purpose_ == 'debt_consolidation')
    + 3.0 * (purpose_ == 'credit_card')
    - 1.5 * (home_ == 'MORTGAGE')
    - 2.0 * (home_ == 'OWN')
    + 0.1 * payment_inc_ratio
    - 8.0 * borrower_score
    + np.random.normal(0, 0.5, N_SAMPLES)
)
prob_default = 1 / (1 + np.exp(-true_logit))
y_numbers_cls = np.random.binomial(1, prob_default, size=N_SAMPLES)
outcome_cls = ['default' if y == 1 else 'paid off' for y in y_numbers_cls]

loan_data = pd.DataFrame({
    'outcome': outcome_cls,
    'purpose_': purpose_,
    'home_': home_,
    'emp_len_': emp_len_,
    'borrower_score': borrower_score,
    'payment_inc_ratio': payment_inc_ratio,
})
loan_data.outcome = loan_data.outcome.astype('category')
loan_data.purpose_ = loan_data.purpose_.astype('category')
loan_data.home_ = loan_data.home_.astype('category')
loan_data.emp_len_ = loan_data.emp_len_.astype('category')
outcome_cls_name = 'outcome'  # outcome for classification

data_3000 = {
    'outcome': np.random.choice(['paid off', 'default'], 100, p=[0.7, 0.3]),
    'borrower_score': np.random.uniform(0.15, 0.8, 100),
    'payment_inc_ratio': np.random.uniform(0.5, 20.0, 100),
}
loan3000 = pd.DataFrame(data_3000)
# Make sure both classes are present regardless of the random draw.
loan3000.loc[0, 'outcome'] = 'paid off'
loan3000.loc[1, 'outcome'] = 'default'
loan3000.outcome = loan3000.outcome.astype('category')

print("--- All packages and data ready ---")
```

## 2. Basic Linear Regression (Fitting & Prediction)

```python
basic_predictors = ['SqFtTotLiving', 'Bathrooms']
X_basic_df = house[basic_predictors]
y_basic = house[outcome_reg]

# --- A. sklearn ---
model_sklearn = LinearRegression()
model_sklearn.fit(X_basic_df, y_basic)

print("--- scikit-learn ---")
print(f"Intercept: {model_sklearn.intercept_:.3f}")
for name, coef in zip(basic_predictors, model_sklearn.coef_):
    print(f"  {name}: {coef:.3f}")

# --- B. statsmodels ---
print("\n--- statsmodels + ModelSpec ---")
design_basic = ModelSpec(basic_predictors)
X_design_basic = design_basic.fit_transform(X_basic_df)
print("Design matrix produced by ModelSpec (first 5 rows):")
print(X_design_basic.head())

model_sm = sm.OLS(y_basic, X_design_basic)
results_sm = model_sm.fit()
print("\nModel summary:")
print(results_sm.summary())
```

## 3. Factor Variables in Regression

### Dummy-Variable Representation

#### A. The `sklearn` way (manual `pd.get_dummies`)

```python
predictors_factor_list = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms',
                          'BldgGrade', 'PropertyType']
X_sklearn_factor = pd.get_dummies(house[predictors_factor_list],
                                  drop_first=True, dtype=int)
y = house[outcome_reg]

house_lm_factor = LinearRegression()
house_lm_factor.fit(X_sklearn_factor, y)

print('\n--- Coefficients of the sklearn model ---')
print(f'Intercept: {house_lm_factor.intercept_:.3f}')
for name, coef in zip(X_sklearn_factor.columns, house_lm_factor.coef_):
    print(f'  {name}: {coef:.3f}')
```
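A note on `drop_first=True`: each factor must drop one level, otherwise the dummy columns sum to the intercept column and the design matrix is perfectly collinear (the "dummy variable trap"). The dropped level becomes the reference against which the remaining coefficients are interpreted. A minimal sketch showing the encoding for `PropertyType` alone (here the first category level, `Multiplex`, is the reference):

```python
# Full one-hot encoding: one column per level (collinear with an intercept).
print(pd.get_dummies(house['PropertyType'], dtype=int).head())
# With drop_first=True, 'Multiplex' is dropped and acts as the reference level.
print(pd.get_dummies(house['PropertyType'], drop_first=True, dtype=int).head())
```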
#### B. The `statsmodels` + `ModelSpec` way (automatic)

```python
# Remember to convert the relevant DataFrame columns to categorical first
# (done above with astype('category')).
design_factor = ModelSpec(predictors_factor_list)
X_sm_factor = design_factor.fit_transform(house)
print("\n--- Design matrix generated by ModelSpec (dummies included) ---")
print(X_sm_factor.head())

model_sm_factor = sm.OLS(y, X_sm_factor)
results_sm_factor = model_sm_factor.fit()
print(results_sm_factor.summary())
```

## 4. Regularization ⚖️

### 1. Data Preparation: Feature Engineering and Standardization

```python
predictors_reg = ['SqFtTotLiving', 'SqFtLot', 'Bathrooms', 'Bedrooms', 'BldgGrade',
                  'PropertyType', 'NbrLivingUnits', 'SqFtFinBasement', 'YrBuilt',
                  'YrRenovated', 'NewConstruction']
X_raw = pd.get_dummies(house[predictors_reg], drop_first=True)
X_raw['NewConstruction'] = X_raw['NewConstruction'].astype(int)
columns = X_raw.columns

# Standardize so the penalty treats all features on the same scale;
# otherwise features with large units would be penalized less.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)
y = house[outcome_reg]
print("\n--- Data for regularization ready ---")
```

### 2. Lasso (L1) Regularization 📉

```python
alpha_values = [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
lasso_results = []
for alpha in alpha_values:
    model = Lasso(alpha=alpha)
    model.fit(X_scaled, y)
    lasso_results.append(model.coef_)

model_lasso_cv = LassoCV(cv=5, alphas=alpha_values, max_iter=10000)
model_lasso_cv.fit(X_scaled, y)
print("\n--- Lasso (L1) results ---")
print(f"Best alpha found by LassoCV: {model_lasso_cv.alpha_:.4f}")

ax = pd.DataFrame(lasso_results, index=alpha_values, columns=columns).plot(
    logx=True, title='Lasso Coefficient Paths', legend=False, figsize=(10, 6)
)
ax.axvline(model_lasso_cv.alpha_, linestyle='--', color='k',
           label=f'Optimal alpha (CV = {model_lasso_cv.alpha_:.2f})')
ax.set_xlabel('Alpha (log scale)')
ax.set_ylabel('Coefficient Weight')
ax.legend()
plt.show()
```

### 3. Ridge (L2) Regularization 📉

```python
alpha_values = [0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
ridge_results = []
for alpha in alpha_values:
    model = Ridge(alpha=alpha)
    model.fit(X_scaled, y)
    ridge_results.append(model.coef_)

model_ridge_cv = RidgeCV(cv=5, alphas=alpha_values)
model_ridge_cv.fit(X_scaled, y)
print("\n--- Ridge (L2) results ---")
print(f"Best alpha found by RidgeCV: {model_ridge_cv.alpha_:.4f}")

ax = pd.DataFrame(ridge_results, index=alpha_values, columns=columns).plot(
    logx=True, title='Ridge Coefficient Paths', legend=False, figsize=(10, 6)
)
ax.axvline(model_ridge_cv.alpha_, linestyle='--', color='k',
           label=f'Optimal alpha (CV = {model_ridge_cv.alpha_:.2f})')
ax.set_xlabel('Alpha (log scale)')
ax.set_ylabel('Coefficient Weight')
ax.legend()
plt.show()
```

-----

-----

# Classification

## 1. Setup (all data was loaded in "Regression" Section 1, so this section is intentionally empty)

## 2. Naive Bayes 🤖

### Gaussian Solution

```python
# --- Data preparation ---
predictors_gnb = ['borrower_score', 'payment_inc_ratio']
X_gnb = loan_data[predictors_gnb]
y_gnb = loan_data[outcome_cls_name]

# --- Fit the model ---
naive_model = GaussianNB()
naive_model.fit(X_gnb, y_gnb)

# --- Predict ---
new_loan = pd.DataFrame([
    {'borrower_score': 0.6, 'payment_inc_ratio': 10.0}
])
print("Example record:")
print(new_loan)
print(f"\nPredicted class: {naive_model.predict(new_loan)[0]}")

probabilities = pd.DataFrame(naive_model.predict_proba(new_loan),
                             columns=naive_model.classes_)
print("\nPredicted probabilities:")
print(probabilities)
```
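`GaussianNB` assumes each predictor is conditionally normal given the class, so the posterior is just the class prior times a product of univariate normal densities, renormalized. A minimal sketch reproducing `predict_proba` by hand, reusing `naive_model` and `new_loan` from the block above (`theta_`, `var_`, and `class_prior_` are the fitted per-class means, variances, and priors; `var_` was named `sigma_` in older scikit-learn releases):

```python
from scipy.stats import norm

x = new_loan.iloc[0].values  # the new record as a 1-D array

# Unnormalized posterior per class: prior * product of feature likelihoods.
joint = naive_model.class_prior_ * np.prod(
    norm.pdf(x, loc=naive_model.theta_, scale=np.sqrt(naive_model.var_)),
    axis=1,
)
posterior = joint / joint.sum()  # normalize over the classes
print(pd.Series(posterior, index=naive_model.classes_))
# Should match naive_model.predict_proba(new_loan) up to floating-point error.
```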
## 3. Discriminant Analysis 📈

### A Simple Example

```python
# --- Data preparation ---
predictors_lda = ['borrower_score', 'payment_inc_ratio']
X_lda = loan3000[predictors_lda]
y_lda = loan3000[outcome_cls_name]

# --- Fit the LDA model ---
loan_lda = LinearDiscriminantAnalysis()
loan_lda.fit(X_lda, y_lda)
print("\n--- LDA results ---")
print("LDA discriminant-function coefficients (scalings_):")
print(pd.DataFrame(loan_lda.scalings_, index=X_lda.columns))

pred_lda = pd.DataFrame(loan_lda.predict_proba(loan3000[predictors_lda]),
                        columns=loan_lda.classes_)
print("\nLDA predicted probabilities (first 5 rows):")
print(pred_lda.head())

# --- Plot the decision boundary ---
# The boundary passes through the midpoint of the two class means,
# perpendicular to the discriminant direction.
center = np.mean(loan_lda.means_, axis=0)
slope = -loan_lda.scalings_[0] / loan_lda.scalings_[1]
intercept = center[1] - center[0] * slope

# Endpoints of the boundary line at payment_inc_ratio = 0 and 20
x_0 = (0 - intercept) / slope
x_20 = (20 - intercept) / slope

lda_df = pd.concat([loan3000, pred_lda['default']], axis=1)
fig, ax = plt.subplots(figsize=(4, 4))
sns.scatterplot(x='borrower_score', y='payment_inc_ratio', hue='default',
                data=lda_df,
                palette=sns.diverging_palette(240, 10, n=9, as_cmap=True),
                ax=ax, legend=False)
ax.set_ylim(0, 20)
ax.set_xlim(0.15, 0.8)
ax.plot((x_0, x_20), (0, 20), linewidth=3, color='black')
ax.plot(*loan_lda.means_.transpose(), '--', color='grey')
plt.tight_layout()
plt.show()
```

## 4. Logistic Regression 📊

### The Logit Function and Odds

```python
p = np.arange(0.01, 1, 0.01)
df = pd.DataFrame({
    'p': p,
    'logit': np.log(p / (1 - p)),  # logit(p) = log(p / (1 - p))
    'odds': p / (1 - p),
})

fig, ax = plt.subplots(figsize=(3, 3))
ax.axhline(0, color='grey', linestyle='--')
ax.axvline(0.5, color='grey', linestyle='--')
ax.plot(df['p'], df['logit'])
ax.set_xlabel('Probability (p)')
ax.set_ylabel('logit(p)')
plt.tight_layout()
plt.show()
```

### Logistic Regression and the GLM

#### 4a. Using `scikit-learn` (with `pd.get_dummies`)

```python
# --- Data preparation (manual dummies) ---
predictors_lr = ['payment_inc_ratio', 'purpose_', 'home_', 'emp_len_',
                 'borrower_score']
X_lr = pd.get_dummies(loan_data[predictors_lr], prefix='', prefix_sep='',
                      drop_first=True, dtype=int)

# 1 = 'default', 0 = 'paid off'
y_numbers_lr = [1 if yi == 'default' else 0 for yi in loan_data[outcome_cls_name]]
# Keep the original categorical y for the evaluation section below
y_categorical_lr = loan_data[outcome_cls_name]

# --- Fit the sklearn model ---
# A huge C effectively disables the L2 penalty, mimicking unpenalized
# logistic regression.
logit_reg = LogisticRegression(penalty='l2', C=1e42, solver='liblinear')
logit_reg.fit(X_lr, y_numbers_lr)

print('\n--- sklearn.LogisticRegression results (predicting y=1) ---')
print(f"Intercept: {logit_reg.intercept_[0]:.4f}")
print(f"Predicted classes: {logit_reg.classes_}")  # [0 1]
print("\nCoefficients:")
print(pd.DataFrame({'coeff': logit_reg.coef_[0]}, index=X_lr.columns).head())
```

#### 4b. Using `statsmodels` (with `ModelSpec`)

```python
# --- Data preparation ---
design = ModelSpec(predictors_lr)
X_sm = design.fit_transform(loan_data)
# y_numbers_lr comes from 4a
print("\n--- statsmodels (automatic dummies via ModelSpec) ---")
# print(X_sm.head())  # disabled to keep the output short

# --- Fit the model ---
logit_reg_sm = sm.GLM(y_numbers_lr, X_sm, family=sm.families.Binomial())
logit_result = logit_reg_sm.fit()
print(logit_result.summary())

# Note: the statsmodels coefficients are log-odds for predicting "default" (y=1).
# Since the sklearn model in 4a is also set up to predict y=1, the coefficients
# (e.g. borrower_score) should have the same sign and a similar magnitude.
```
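Because both fits report coefficients on the log-odds scale, exponentiating them gives odds ratios: the multiplicative change in the odds of default for a one-unit increase in a predictor. A small sketch using the `statsmodels` result from 4b:

```python
# Odds ratio = exp(log-odds coefficient).
odds_ratios = np.exp(logit_result.params)
print(odds_ratios)
# Example reading: an odds ratio of 1.10 for payment_inc_ratio would mean each
# one-unit increase multiplies the odds of default by about 1.10.
```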
## 5. Evaluating Classification Models 🎯

### Confusion Matrix

```python
# 1. Get the 0/1 predictions
pred_numbers = logit_reg.predict(X_lr)  # X_lr from 4a

# 2. Map the 0/1 predictions back to the original classes
pred_categorical = ['default' if p == 1 else 'paid off' for p in pred_numbers]

# 3. Get the original categorical y
y_categorical = loan_data[outcome_cls_name]

# --- Using ISLP.confusion_table ---
conf_mat_islp = confusion_table(pred_categorical, y_categorical)
print("--- ISLP.confusion_table ---")
print(conf_mat_islp)
```

### Accuracy, Precision, Recall

```python
# 1. Accuracy via scikit-learn
print("--- sklearn.accuracy_score ---")
print(f"Accuracy: {accuracy_score(y_categorical, pred_categorical):.4f}")

# 2. Precision, recall, F-score via scikit-learn.
# average='binary' with pos_label='default' returns the metrics for the
# 'default' class only, as single floats.
# (Compare with average='weighted' or average=None for other summaries.)
precision, recall, fscore, support = precision_recall_fscore_support(
    y_categorical, pred_categorical,
    pos_label='default', average='binary'
)
print("\n--- sklearn.precision_recall_fscore_support (for 'default') ---")
# precision, recall, fscore are now scalars, so :.4f formatting works
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F-Score: {fscore:.4f}")
```

### ROC Curve

```python
# Predicted probability of y=1 ('default')
y_prob = logit_reg.predict_proba(X_lr)[:, 1]

# Compute the ROC curve
fpr, tpr, thresholds = roc_curve(y_categorical, y_prob, pos_label='default')
roc_df = pd.DataFrame({'recall': tpr, 'specificity': 1 - fpr})

ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1)
ax.set_xlim(1, 0)  # specificity decreases from left to right
ax.plot((1, 0), (0, 1), linestyle='--', color='grey')
ax.set_xlabel('Specificity')
ax.set_ylabel('Recall (TPR)')
ax.set_title('ROC Curve')
plt.tight_layout()
plt.show()
```

### AUC (Area Under the Curve)

```python
# --- Using sklearn.metrics.auc ---
auc_score = auc(fpr, tpr)
print(f"ROC AUC Score (calculated from fpr, tpr): {auc_score:.4f}")

# Plot the ROC curve with the AUC region filled in
ax = roc_df.plot(x='specificity', y='recall', figsize=(4, 4), legend=False)
ax.set_ylim(0, 1)
ax.set_xlim(1, 0)
ax.set_xlabel('Specificity')
ax.set_ylabel('Recall')
ax.fill_between(roc_df.specificity, 0, roc_df.recall, alpha=0.3)
ax.set_title(f'ROC Curve (AUC = {auc_score:.4f})')
plt.tight_layout()
plt.show()
```
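As a cross-check, `sklearn.metrics.roc_auc_score` computes the same area directly from labels and scores. Using the 0/1 labels from 4a avoids any ambiguity about which class is positive:

```python
from sklearn.metrics import roc_auc_score

# y_numbers_lr encodes 'default' as 1, matching y_prob = P(y=1).
print(f"roc_auc_score: {roc_auc_score(y_numbers_lr, y_prob):.4f}")
```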