import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
# Load the simulated dataset; the last column "目標變量" is the binary target.
df = pd.read_excel('模擬數據.xlsx')
features = df.drop(['目標變量'], axis=1)
target = df['目標變量']

# Hold out 40% of the rows as a test set; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.4, random_state=42
)
# 這里讀取數據,然后把數據分割為訓練集和測試集。
# Logistic 模型 ROC 曲線繪制
# --- Train a logistic-regression classifier and plot its ROC curve ---
# (Requires `import matplotlib.pyplot as plt` at the top of the file.)
model = LogisticRegression()
model.fit(X_train, y_train)

# Predicted probability of the positive class (column 1 of predict_proba);
# roc_curve needs scores, not hard 0/1 predictions.
y_score = model.predict_proba(X_test)[:, 1]

# ROC curve (false-positive rate vs. true-positive rate) and its AUC.
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, y_score)
roc_auc_logistic = auc(fpr_logistic, tpr_logistic)

# Plot the curve together with the y = x chance diagonal (AUC = 0.5).
plt.figure()
plt.plot(fpr_logistic, tpr_logistic, color='darkorange', lw=2,
         label='ROC curve (area = %0.2f)' % roc_auc_logistic)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom so the top of the curve stays visible
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
# 代碼訓練一個邏輯回歸模型,計算其ROC曲線及AUC值,并繪制ROC曲線圖,以評估模型的分類性能。
# 接下來詳細解讀這個ROC曲線圖:
# - 橫軸(假陽性率):取值從0到1,越接近0表示模型越不容易誤報;
# - 縱軸(真陽性率):取值從0到1,越接近1表示模型越能正確識別正類;
# - 曲線下方的面積(AUC)表示模型的整體性能,AUC值越大表示性能越好:
#   0.9~1 表示模型非常好;0.8~0.9 表示良好;0.7~0.8 表示一般;
# - 藍色虛線表示隨機猜測模型的ROC曲線(AUC = 0.5):
#   模型的ROC曲線在這條線上方,說明優于隨機猜測;在下方則劣于隨機猜測。
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb
# --- Train three more classifiers and compute their ROC curves / AUCs ---

# XGBoost. `use_label_encoder=False` silences the legacy label-encoder
# warning on xgboost >= 1.3 (the parameter is deprecated and ignored by
# newer releases); `eval_metric` must be set explicitly alongside it.
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_score_xgb = xgb_model.predict_proba(X_test)[:, 1]
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_score_xgb)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

# SVM. `probability=True` enables predict_proba via internal Platt-scaling
# cross-validation; `random_state` pins that randomized step so results are
# reproducible (consistent with random_state=42 used for the data split).
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)
y_score_svm = svm_model.predict_proba(X_test)[:, 1]
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

# Random forest. `random_state` fixes bootstrap/feature sampling so the
# fitted model (and hence the ROC curve) repeats across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_score_rf = rf_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_score_rf)
roc_auc_rf = auc(fpr_rf, tpr_rf)
# Overlay the ROC curves of all four models on a single figure so their
# classification performance can be compared directly.
curves = [
    (fpr_logistic, tpr_logistic, 'darkorange',
     'Logistic ROC curve (area = %0.2f)' % roc_auc_logistic),
    (fpr_xgb, tpr_xgb, 'green',
     'XGBoost ROC curve (area = %0.2f)' % roc_auc_xgb),
    (fpr_svm, tpr_svm, 'purple',
     'SVM ROC curve (area = %0.2f)' % roc_auc_svm),
    (fpr_rf, tpr_rf, 'red',
     'Random Forest ROC curve (area = %0.2f)' % roc_auc_rf),
]

plt.figure()
for fpr, tpr, line_color, curve_label in curves:
    plt.plot(fpr, tpr, color=line_color, lw=2, label=curve_label)

# Chance diagonal (a random classifier: AUC = 0.5).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()
# 代碼訓練了XGBoost、SVM和隨機森林模型,計算了各自的ROC曲線及AUC值,
# 并繪制了包含所有模型的ROC曲線圖,以比較它們的分類性能。
# 詳細評價指標
from sklearn.metrics import classification_report, confusion_matrix
# 計算并輸出分類報告的函數
def print_classification_report(model, X_test, y_test, model_name):
    """Print a labelled classification report and confusion matrix.

    Args:
        model: A fitted classifier exposing ``predict``.
        X_test: Test-set feature matrix.
        y_test: True labels corresponding to ``X_test``.
        model_name: Human-readable name used in the printed headers.
    """
    # NOTE(review): the body lines were flush-left in the pasted source,
    # which is a SyntaxError; indentation restored here.
    y_pred = model.predict(X_test)
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    print(f"{model_name} Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\n")


# Report precision/recall/F1 and the confusion matrix for every model.
print_classification_report(model, X_test, y_test, "Logistic Regression")
print_classification_report(xgb_model, X_test, y_test, "XGBoost")
print_classification_report(svm_model, X_test, y_test, "SVM")
print_classification_report(rf_model, X_test, y_test, "Random Forest")
# 最后輸出邏輯回歸、XGBoost、SVM和隨機森林模型的詳細評價指標。
# 本文轉載自微信公眾號 @Python機器學習AI