"""Load the regression dataset and carve out a held-out test split."""
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold

# Silence library warnings for cleaner notebook output.
warnings.filterwarnings("ignore")

# Plot defaults: Times New Roman, and render minus signs correctly.
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['axes.unicode_minus'] = False

# 'Y' is the regression target; every other column is a feature.
df = pd.read_excel('2024-12-06公眾號Python機器學習AI.xlsx')
X = df.drop(['Y'], axis=1)
y = df['Y']

# Hold out 30% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
導入必要的庫,讀取一個包含特征X和目標變量Y的Excel數據集,將其分成訓練集和測試集,目標是構建回歸模型進行預測,其中特征X和目標變量Y是針對回歸任務設計的。
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

# Shared settings for every level-one learner.
_SEED = 42
_N_TREES = 100

# Level-one (base) learners: six tree ensembles, each with 100 estimators,
# a fixed seed, and per-library verbosity silenced.
base_learners = [
    ("RF", RandomForestRegressor(n_estimators=_N_TREES, random_state=_SEED)),
    ("XGB", XGBRegressor(n_estimators=_N_TREES, random_state=_SEED, verbosity=0)),
    ("LGBM", LGBMRegressor(n_estimators=_N_TREES, random_state=_SEED, verbose=-1)),
    ("GBM", GradientBoostingRegressor(n_estimators=_N_TREES, random_state=_SEED)),
    ("AdaBoost", AdaBoostRegressor(n_estimators=_N_TREES, random_state=_SEED)),
    ("CatBoost", CatBoostRegressor(n_estimators=_N_TREES, random_state=_SEED, verbose=0)),
]

# Level-two (meta) learner: linearly combines the base predictions.
meta_model = LinearRegression()

# Stacking ensemble: 5-fold CV produces out-of-fold base predictions
# on which the meta learner is trained.
stacking_regressor = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,
)
stacking_regressor.fit(X_train, y_train)
通過集成學習中的Stacking方法,結合多種回歸模型(包括隨機森林、XGBoost、LightGBM、Gradient Boosting、AdaBoost 和 CatBoost)作為一級學習器,利用線性回歸作為二級元學習器,學習各模型預測的最佳組合方式。最終通過5折交叉驗證優化模型性能,并在訓練數據上訓練Stacking回歸器,以提升整體的回歸預測效果。
如果將代碼用于分類模型,主要變化如下:
模型選擇:將所有的回歸模型替換為對應的分類模型,例如:
RandomForestRegressor→RandomForestClassifier
GradientBoostingRegressor→GradientBoostingClassifier
AdaBoostRegressor→AdaBoostClassifier
XGBRegressor→XGBClassifier
LGBMRegressor→LGBMClassifier
CatBoostRegressor→CatBoostClassifier
元學習器:將LinearRegression替換為適合分類任務的模型,如LogisticRegression或其他分類器;目標變量(y):確保y是分類標簽而非連續值;集成容器:使用StackingClassifier而不是StackingRegressor。
from sklearn import metrics

def _split_metrics(y_true, y_pred):
    """Return (MSE, RMSE, MAE, R2) for one data split."""
    mse = metrics.mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)
    return mse, rmse, mae, r2

# Ground-truth targets as plain arrays.
y_train_true = y_train.values
y_test_true = y_test.values

# Stacking-model predictions for both splits.
y_pred_train = stacking_regressor.predict(X_train)
y_pred_test = stacking_regressor.predict(X_test)

# Train-set and test-set metrics.
mse_train, rmse_train, mae_train, r2_train = _split_metrics(y_train_true, y_pred_train)
mse_test, rmse_test, mae_test, r2_test = _split_metrics(y_test_true, y_pred_test)

# Report both splits.
print("訓練集評價指標:")
print("均方誤差 (MSE):", mse_train)
print("均方根誤差 (RMSE):", rmse_train)
print("平均絕對誤差 (MAE):", mae_train)
print("擬合優度 (R-squared):", r2_train)
print("\n測試集評價指標:")
print("均方誤差 (MSE):", mse_test)
print("均方根誤差 (RMSE):", rmse_test)
print("平均絕對誤差 (MAE):", mae_test)
print("擬合優度 (R-squared):", r2_test)
全面評估模型在訓練集和測試集上的回歸性能,幫助判斷模型的擬合程度和泛化能力,從結果可以看出,模型在訓練集上的性能非常好(R²=0.96,誤差較小),但在測試集上性能有所下降(R²=0.83,誤差增大),這表明模型存在一定的過擬合,即在訓練數據上表現很好,但在未見數據上泛化能力稍弱。
不過,這里主要的目標是演示 Stacking 回歸器的實現,因此對模型的具體參數(如基學習器和元學習器的超參數)未進行調優,而是采用了默認配置。這種實現方式更側重于讓讀者理解Stacking的工作機制,而不是追求最佳的模型性能。如果要進一步改進,可以針對基學習器和元學習器的超參數進行優化,或引入正則化手段來減少過擬合問題。
通過散點圖和擬合線結合置信區間、直方圖,全面可視化Stacking回歸模型在訓練集和測試集上的預測表現,旨在直觀展示模型的擬合質量、預測誤差分布以及與真實值的匹配情況。
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, explained_variance_score
# Module-level accumulators: evaluate_model appends one metrics dict per model run.
train_results = []
test_results = []
# evaluate_model (below) stores train-set and test-set metrics into these two lists separately.
def evaluate_model(model_name, model, X_train, y_train, X_test, y_test, cv=5):
    """Fit one regressor and record its train/test metrics.

    Appends one metrics dict to the module-level ``test_results`` list
    (including cross-validated R² statistics) and one to ``train_results``.

    Parameters
    ----------
    model_name : str
        Label stored under the "Model" key of each result row.
    model : estimator
        Unfitted scikit-learn-compatible regressor; fitted in place.
    X_train, y_train, X_test, y_test : array-like
        Train/test features and targets.
    cv : int, default 5
        Number of folds for the cross-validated R² estimate.
    """
    # Cross-validated R² on the training data (shuffled folds, fixed seed).
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=kf, scoring='r2')
    mean_cv_r2 = np.mean(cv_scores)
    std_cv_r2 = np.std(cv_scores)
    # Refit on the full training split, then predict both splits.
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    # Test-set metrics. RMSE via np.sqrt: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so the previous call crashes on current releases.
    r2_test = r2_score(y_test, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    mae_test = mean_absolute_error(y_test, y_pred_test)
    # MAPE is undefined when any target equals zero; store NaN in that case.
    mape_test = np.mean(np.abs((y_test - y_pred_test) / y_test)) if np.all(y_test != 0) else np.nan
    ev_test = explained_variance_score(y_test, y_pred_test)
    test_results.append({
        "CV Mean R2": mean_cv_r2,
        "CV Std R2": std_cv_r2,
        "R2": r2_test,
        "RMSE": rmse_test,
        "MAE": mae_test,
        "MAPE": mape_test,
        "EV": ev_test,
        "Model": model_name
    })
    # Train-set metrics (no CV columns).
    r2_train = r2_score(y_train, y_pred_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    mae_train = mean_absolute_error(y_train, y_pred_train)
    mape_train = np.mean(np.abs((y_train - y_pred_train) / y_train)) if np.all(y_train != 0) else np.nan
    ev_train = explained_variance_score(y_train, y_pred_train)
    train_results.append({
        "R2": r2_train,
        "RMSE": rmse_train,
        "MAE": mae_train,
        "MAPE": mape_train,
        "EV": ev_train,
        "Model": model_name
    })
# Fit and score each base learner in isolation — the same estimators the
# stacking ensemble uses at level one, with default hyperparameters.
for _label, _estimator in [
    ("Random Forest", RandomForestRegressor(random_state=42)),
    ("XGBoost", XGBRegressor(random_state=42, verbosity=0)),
    ("LightGBM", LGBMRegressor(random_state=42, verbose=-1)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42)),
    ("AdaBoost", AdaBoostRegressor(random_state=42)),
    ("CatBoost", CatBoostRegressor(random_state=42, verbose=0)),
]:
    evaluate_model(_label, _estimator, X_train, y_train, X_test, y_test)

# Collect the accumulated rows into DataFrames
# (the bare expressions are notebook-style display lines).
train_results_df = pd.DataFrame(train_results)
train_results_df
test_results_df = pd.DataFrame(test_results)
test_results_df
采用與前面 Stacking 模型第一層基學習器相同的模型和默認參數(包括 Random Forest、XGBoost、LightGBM、Gradient Boosting、AdaBoost 和 CatBoost),分別在訓練集和測試集上評估單一模型的預測性能,通過計算 R^2、RMSE、MAE、MAPE 和解釋方差等指標,為單一模型提供基準性能,用于對比 Stacking 模型是否通過融合多模型的優勢顯著提升了預測精度和穩定性,接下來對這些評價指標進行一個簡單的可視化。
import seaborn as sns
def plot_model_performance(results_df, dataset_type="Train", save_path=None):
    """Draw a grouped bar chart comparing per-model performance metrics.

    Parameters
    ----------
    results_df : pandas.DataFrame
        One row per model: a "Model" column plus one column per metric.
    dataset_type : str, default "Train"
        Split label used in the plot title.
    save_path : str or None, default None
        If given, the figure is saved there as a PDF before being shown.
    """
    colors = sns.color_palette("Set2", len(results_df))
    # Long format: one (Model, Metric, Value) row per bar.
    long_format = results_df.melt(
        id_vars=["Model"],
        var_name="Metric",
        value_name="Value"
    )
    plt.figure(figsize=(12, 6))
    sns.barplot(data=long_format, x="Metric", y="Value", hue="Model", palette=colors)
    plt.title(f"{dataset_type} Set Performance Metrics", fontsize=16)
    plt.ylabel("Value", fontsize=12)
    plt.xlabel("Metrics", fontsize=12)
    plt.xticks(rotation=45, ha="right")
    # Leave ~10% headroom above the tallest bar instead of a fixed 1.2 cap,
    # which clipped scale-dependent metrics such as RMSE/MAE when they
    # exceeded 1.2. (DataFrame.max skips NaN, e.g. an undefined MAPE.)
    upper = max(1.2, 1.1 * long_format["Value"].max())
    plt.ylim(0, upper)
    plt.legend(title="Models", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)
    # Apply the layout BEFORE saving so the exported PDF matches the display;
    # previously tight_layout ran after savefig and never affected the file.
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, format="pdf", bbox_inches="tight", dpi=300)
    plt.show()
# Visualize the train-set metrics.
plot_model_performance(train_results_df, dataset_type="Train", save_path="train_metrics.pdf")
# Visualize the test-set metrics; drop the first two columns (the CV R²
# statistics) so only the per-split metrics plus "Model" are plotted.
plot_model_performance(test_results_df.iloc[:, 2:], dataset_type="Test", save_path="test_metrics.pdf")
通過繪制柱狀圖直觀對比單一模型之間在訓練集和測試集上的性能指標,在單一模型和Stacking模型相比,特別是測試集上的評價指標如R^2、RMSE、MAE 和 MAPE,總體來看,Stacking 模型的性能普遍優于單一模型,尤其在R^2和 RMSE等關鍵指標上表現更優,顯示出更強的泛化能力。然而,也有個別單一模型(如XGBoost或LightGBM)在測試集上的表現接近 Stacking,但Stacking依然稍勝一籌。