import numpy as np
import pandas as pd  # required by pd.read_excel below; it was used without being imported
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold

# Use the SimHei font for sans-serif text and disable the Unicode minus glyph.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

# Load the dataset from the Excel workbook.
df = pd.read_excel('california.xlsx')

# Separate the features (X) from the target column 'price' (y).
X = df.drop(['price'], axis=1)
y = df['price']

# Split into training and test sets (80% / 20%) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
使用pandas讀取Excel文件,加載數據集,將數據集中的特征和目標變量分開,特征為X,目標為y,使用train_test_split將數據集分為訓練集和測試集,比例為80%訓練集和20%測試集。
from sklearn.metrics import root_mean_squared_error
from catboost import CatBoostRegressor

# CatBoost model parameters.
params_cat = {
    'learning_rate': 0.02,  # step size of each boosting update; typical range 0.01 - 0.1
    'iterations': 1000,     # number of weak learners (decision trees)
    'depth': 6,             # tree depth, controls model complexity
    'eval_metric': 'RMSE',  # evaluation metric: root mean squared error
    'random_seed': 42,      # fixed seed so results are reproducible
    'verbose': 500,         # print training progress every 500 iterations
}

# Prepare the k-fold cross-validation splitter.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
best_score = np.inf
best_model = None

# Cross-validation: train one model per fold and keep the one with the lowest
# validation RMSE.
for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    model = CatBoostRegressor(**params_cat)
    # The validation fold is passed as eval_set, with early stopping after 100 rounds.
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        early_stopping_rounds=100,
    )

    # Predict on the validation fold and compute its RMSE.
    y_val_pred = model.predict(X_val_fold)
    score = root_mean_squared_error(y_val_fold, y_val_pred)
    scores.append(score)
    print(f'第 {fold + 1} 折 RMSE: {score}')

    # Remember the best-scoring model seen so far.
    if score < best_score:
        best_score = score
        best_model = model

print(f'最佳 RMSE: {best_score}')
采用KFold進行5折交叉驗證,確保模型的穩定性和泛化能力,使用CatBoost回歸模型訓練數據。CatBoost是一種基于決策樹的梯度提升算法,適用于處理分類和回歸任務,設置了一些模型參數,如學習率、迭代次數、決策樹深度等,通過交叉驗證,計算每一折的均方根誤差(RMSE),選取表現最好的模型作為最終模型。
模型評估
from sklearn import metrics

# Predict on the test set with the best model from cross-validation.
y_pred_four = best_model.predict(X_test)
y_pred_list = y_pred_four.tolist()

# Regression metrics on the test set.
mse = metrics.mean_squared_error(y_test, y_pred_list)
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred_list)
r2 = metrics.r2_score(y_test, y_pred_list)

print("均方誤差 (MSE):", mse)
print("均方根誤差 (RMSE):", rmse)
print("平均絕對誤差 (MAE):", mae)
# The label previously contained a stray "(yōu)" extraction artifact.
print("擬合優度 (R-squared):", r2)
使用最佳模型對測試集進行預測,計算評估指標,包括均方誤差(MSE)、均方根誤差(RMSE)、平均絕對誤差(MAE)和擬合優度(R-squared)。
shap解釋摘要圖
import shap

# Build the SHAP explainer for the selected model.
explainer = shap.TreeExplainer(best_model)

# SHAP values for the test set.
shap_values = explainer.shap_values(X_test)

# Feature labels shown on the plot.
labels = X_test.columns

# Font settings for the figure: serif family, Times New Roman, size 13.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': 'Times new Roman',
    'font.size': 13,
})

plt.figure()
shap.summary_plot(
    shap_values,
    features=X_test,
    feature_names=labels,
    plot_type="dot",
)
shap部分如何解釋,參考往期文章《SHAP全解析:機器學習、深度學習模型解釋保姆級教程》
shap依賴圖
# SHAP dependence plot for 'MedInc', using 'AveOccup' as the interaction feature.
shap.dependence_plot(
    'MedInc',
    shap_values=shap_values,
    features=X_test,
    interaction_index='AveOccup',
)
shap力圖
# Draw the SHAP explanation of a single sample (force plot).
sample_index = 7  # positional index of the test-set sample to explain
shap.force_plot(
    explainer.expected_value,
    shap_values=shap_values[sample_index],
    features=X_test.iloc[sample_index],
    matplotlib=True,
)
shap交互作用摘要圖
# SHAP interaction values for the test set, followed by their summary plot.
shap_interaction_values = explainer.shap_interaction_values(X_test)
shap.summary_plot(
    shap_interaction_values,
    features=X_test,
)
shap熱圖
# Create a shap.Explanation object from the first 500 test-set rows.
shap_explanation = shap.Explanation(
    values=shap_values[:500, :],
    base_values=explainer.expected_value,
    data=X_test.iloc[:500, :],
    feature_names=X_test.columns,
)

# Draw the heatmap.
shap.plots.heatmap(shap_explanation)
部分依賴圖PDP
from sklearn.inspection import PartialDependenceDisplay

# Feature(s) to plot; replace with the ones you want to inspect.
features = ['MedInc']

# best_model is the trained model and X_test the test set;
# kind='average' draws the partial dependence plot (PDP).
PartialDependenceDisplay.from_estimator(
    best_model,
    X_test,
    features=features,
    kind='average',
)
plt.grid(True, linestyle='--', alpha=0.7)
plt.title('average')
plt.show()
PDP(部分依賴圖)、ICE(個體條件期望)如何解釋參考往期文章PDP(部分依賴圖)、ICE(個體條件期望)解釋機(jī)器學(xué)習(xí)模型保姆級教程
個體條件期望ICE
# Individual conditional expectation (ICE) curves for the same feature:
# kind='individual' is requested instead of the average.
features = ['MedInc']
PartialDependenceDisplay.from_estimator(
    best_model,
    X_test,
    features=features,
    kind='individual',
)
plt.grid(True, linestyle='--', alpha=0.7)
plt.title('individual')
plt.show()
2D PDP
# Pick two features for the 2D PDP.
features = ['MedInc', 'AveOccup']

# Draw the 2D PDP on a dedicated axes, passing plot options through contour_kw.
fig, ax = plt.subplots(figsize=(10, 6))
PartialDependenceDisplay.from_estimator(
    best_model,
    X_test,
    features=[features],  # one entry holding the feature pair
    kind='average',
    grid_resolution=50,
    contour_kw={'cmap': 'viridis', 'alpha': 0.8},
    ax=ax,
)
plt.suptitle('2D Partial Dependence Plot for MedInc and AveOccup')
plt.show()
這里的所有模型解釋可視化都是針對測試集(或測試集上的某個特征、某個樣本)進行的,通過這些步驟來幫助訓練和評估模型的性能,為模型提供可解釋性,使得能夠理解模型如何利用輸入特征來進行預測,從而提高模型的透明度和信任度。
本文章轉載自微信公眾號@Python機器學習AI