import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from ngboost import NGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
# Configure matplotlib fonts so CJK labels and the minus sign render correctly.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

# Load the dataset from the Excel workbook.
# NOTE(review): the file name looks mis-encoded; confirm it matches the file on disk.
df = pd.read_excel('數(shù)據(jù).xlsx')

# Separate the feature matrix from the target column ('price').
X = df.drop(columns=['price'])
y = df['price']

# Hold out 20% of the rows as the final test set; the remaining 80% is later
# split into train/validation folds by K-fold cross-validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
在數據預處理中,我們從Excel文件中讀取數據,并劃分特征和目標變量。與往期代碼不同的是,這次我們沒有單獨劃分驗證集,因為K折交叉驗證會在輸入的訓練集中自動進行驗證劃分。這樣,整個訓練集都被充分利用,提高了模型評估的可靠性;同時我們的測試集保持不變,用于最終的模型評估。
模型定義
# Hyper-parameters unpacked into CatBoostRegressor(**params_cat).
params_cat = dict(
    learning_rate=0.02,
    iterations=1000,
    depth=6,
    eval_metric='RMSE',
    random_seed=42,
    verbose=100,
)

# Hyper-parameters unpacked into NGBRegressor(**params_ngb).
params_ngb = dict(
    learning_rate=0.02,
    n_estimators=1000,
    verbose=False,
    random_state=42,
    natural_gradient=True,
)
# Averaging ensemble over several regressors.
class AverageModel:
    """Fit several regressors and average their predictions.

    Args:
        models: Sequence of regressor instances exposing ``fit`` and
            ``predict``. NGBRegressor and CatBoostRegressor instances are
            given the validation set during fitting; any other estimator is
            fitted on the training data only.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y, X_val, y_val):
        """Fit every member model and return ``self``.

        Args:
            X: Training features.
            y: Training targets.
            X_val: Validation features, passed to the models that accept them.
            y_val: Validation targets, passed to the models that accept them.
        """
        for model in self.models:
            if isinstance(model, NGBRegressor):
                # NGBoost takes the validation set as X_val / Y_val keywords.
                model.fit(X, y, X_val=X_val, Y_val=y_val)
            elif isinstance(model, CatBoostRegressor):
                # CatBoost takes it as eval_set and keeps its best iteration.
                model.fit(X, y, eval_set=(X_val, y_val), use_best_model=True, verbose=False)
            else:
                # Previously such models were skipped and stayed unfitted,
                # which only surfaced later as a failure inside predict().
                model.fit(X, y)
        return self

    def predict(self, X):
        """Return the element-wise mean of all member predictions for ``X``.

        Raises:
            ValueError: If the ensemble contains no models.
        """
        if len(self.models) == 0:
            raise ValueError("AverageModel needs at least one model to predict")
        predictions = [model.predict(X) for model in self.models]
        return sum(predictions) / len(predictions)
定義CatBoost和NGBoost模型的參數,并通過一個名為AverageModel的類來訓練這些模型。該類在訓練時根據不同模型調用相應的fit方法,并在預測時平均所有模型的預測結果。當然也可以改進組合策略,在預測時通過不同的策略(如加權平均、投票等)綜合所有模型的預測結果。
# 5-fold cross-validation over the training split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
best_score = float('inf')
best_model = None

# Train one ensemble per fold and keep the one with the lowest validation RMSE.
for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Standardisation statistics come from this fold's training part only,
    # so nothing from the validation part leaks into the scaling.
    x_mean_fold = X_train_fold.mean()
    # A constant column has std 0; dividing by 1 instead maps it to 0, not NaN.
    x_std_fold = X_train_fold.std().replace(0, 1)
    y_mean_fold = y_train_fold.mean()
    y_std_fold = y_train_fold.std()

    # Standardise features and target for both parts with the same statistics.
    X_train_fold = (X_train_fold - x_mean_fold) / x_std_fold
    y_train_fold = (y_train_fold - y_mean_fold) / y_std_fold
    X_val_fold = (X_val_fold - x_mean_fold) / x_std_fold
    y_val_fold = (y_val_fold - y_mean_fold) / y_std_fold

    # Build and fit a fresh ensemble for this fold.
    current_model = AverageModel([NGBRegressor(**params_ngb), CatBoostRegressor(**params_cat)])
    current_model.fit(X_train_fold, y_train_fold, X_val_fold, y_val_fold)

    # Score on the original target scale (de-standardised). RMSE is computed
    # as sqrt(MSE): the `squared=False` argument was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    y_val_pred = current_model.predict(X_val_fold)
    score = np.sqrt(
        mean_squared_error(
            y_val_fold * y_std_fold + y_mean_fold,
            y_val_pred * y_std_fold + y_mean_fold,
        )
    )
    scores.append(score)
    print(f'Fold {fold + 1} RMSE: {score}')

    # Keep the ensemble with the lowest validation RMSE.
    # NOTE(review): best_model was fitted on data scaled with its own fold's
    # statistics; any later prediction should scale inputs and targets consistently.
    if score < best_score:
        best_score = score
        best_model = current_model

print(f'Best RMSE: {best_score}')
定義了5折交叉驗證,通過對每折的訓練集和驗證集進行標準化后訓練NGBRegressor和CatBoostRegressor組合模型,并在每折計算驗證集的RMSE來評估模型性能,最終選出得分最好的模型作為最佳模型。
模型預測
# Predict the held-out test set with the best cross-validated ensemble.
# Feature statistics come from the training split only.
x_mean_final = X_train.mean()
# A constant column has std 0; dividing by 1 instead maps it to 0, not NaN.
x_std_final = X_train.std().replace(0, 1)
X_test_standardized = (X_test - x_mean_final) / x_std_final

# Target statistics also come from the training split only. Using the full `y`
# (as before) includes the test targets and leaks them into the scaling.
# NOTE(review): best_model was fitted with the statistics of a single CV fold,
# so these training-split statistics are only an approximation of its scaling.
y_mean_final = y_train.mean()
y_std_final = y_train.std()
y_test_standardized = (y_test - y_mean_final) / y_std_final

y_test_pred_standardized = best_model.predict(X_test_standardized)
# Map the predictions back to the original target scale.
y_test_pred = y_test_pred_standardized * y_std_final + y_mean_final

# Actual and de-standardised predicted values side by side.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred})
使用訓練集的均值和標準差對測試集進行標準化,并用最佳模型預測測試集的目標值,再將預測結果反標準化以輸出實際值和預測值的比較。
詳細評價指標
from sklearn import metrics
import numpy as np
# Evaluate the test-set predictions on the original target scale.
y_pred_list = y_test_pred.tolist()  # alternatively: np.array(y_test_pred)
mse = metrics.mean_squared_error(y_test, y_pred_list)
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, y_pred_list)
r2 = metrics.r2_score(y_test, y_pred_list)

# Report MSE, RMSE, MAE and R-squared. The R-squared label previously
# contained stray "(yōu)" residue from a bad text conversion.
print("均方誤差 (MSE):", mse)
print("均方根誤差 (RMSE):", rmse)
print("平均絕對誤差 (MAE):", mae)
print("擬合優度 (R-squared):", r2)
可視化對比
# Scatter plot of predicted vs. actual values from the `results` data frame.
# The label strings previously contained stray pinyin residue such as "(yù)"
# from a bad text conversion; they are restored to clean text here.
plt.figure(figsize=(10, 6), dpi=300)
plt.scatter(
    results['Predicted'],
    results['Actual'],
    color='blue',
    edgecolor='k',
    s=50,
    alpha=0.6,
    label='預測值 vs 真實值',
)
plt.title('預測值與真實值對比圖', fontsize=16)
plt.xlabel('預測值', fontsize=14)
plt.ylabel('真實值', fontsize=14)

# Overall value range across both columns, used to draw the x = y reference line.
max_val = results.max().max()
min_val = results.min().min()
plt.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--', linewidth=2, label='x=y')

plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.show()
本文章轉載自微信公眾號@Python機器學習AI