
如何高效爬取全球新聞網站 – 整合Scrapy、Selenium與Mediastack API實現自動化新聞采集
2. 迭代訓練弱分類器:
對於每一輪 $t$,執行以下步驟:
我們使用Kaggle上的Iris數據集來分類不同類型的鳶尾花。我們將從零開始用代碼實現Adaboost算法,包含數據預處理、模型訓練和圖形可視化。
在代碼中:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Load the Iris CSV and drop one species so the task becomes binary.
data = pd.read_csv("Iris.csv")
keep_rows = data["Species"] != "Iris-virginica"
data = data[keep_rows]

# Two sepal measurements are the features; labels are -1 for setosa, +1 otherwise.
feature_columns = ["SepalLengthCm", "SepalWidthCm"]
X = data[feature_columns].values
is_setosa = data["Species"] == "Iris-setosa"
y = np.where(is_setosa, -1, 1)

# Hold out 30% of the rows for testing, using a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Basic AdaBoost implementation
class WeakClassifier:
    """Decision stump: thresholds one feature and outputs -1 or +1."""

    def __init__(self):
        # Overwritten by train() as soon as a candidate split is accepted.
        self.threshold = None
        self.feature_index = None
        self.polarity = 1

    @staticmethod
    def _negative_side(column, threshold, polarity):
        """Return the boolean mask of entries that the stump labels -1."""
        return polarity * column < polarity * threshold

    def train(self, X, y, weights):
        """Pick the (feature, threshold, polarity) with the lowest weighted error.

        Every unique value of every feature is tried as a threshold, with both
        polarities. A candidate replaces the current best only when its error
        is strictly smaller, so the first minimum encountered wins ties.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: array of labels compared element-wise against the -1/+1 output.
            weights: per-sample weights; the error is the sum of the weights
                of the misclassified samples.
        """
        _, n_features = X.shape
        best_error = float('inf')

        for col in range(n_features):
            column = X[:, col]
            for cut in np.unique(column):
                for sign in (1, -1):
                    guess = np.ones(y.shape)
                    guess[self._negative_side(column, cut, sign)] = -1
                    weighted_error = np.sum(weights[y != guess])
                    if not weighted_error < best_error:
                        continue
                    best_error = weighted_error
                    self.feature_index, self.threshold, self.polarity = col, cut, sign

    def predict(self, X):
        """Return a float array of -1/+1 labels, one per row of X."""
        column = X[:, self.feature_index]
        mask = self._negative_side(column, self.threshold, self.polarity)
        return np.where(mask, -1.0, 1.0)
class AdaBoost:
    """Binary AdaBoost ensemble of ``WeakClassifier`` stumps.

    Labels are treated as -1 / +1: the weight update multiplies ``y`` by the
    stump output, and the final prediction is the sign of the alpha-weighted
    vote.

    Attributes set by ``train``:
        classifiers: the fitted weak classifiers, one per round.
        alphas: the vote weight of each weak classifier.
        errors: the weighted training error of each weak classifier.
        sample_weights_history: a copy of the normalized sample weights after
            each round.
    """

    def __init__(self, n_classifiers=10):
        self.n_classifiers = n_classifiers
        self.classifiers = []
        self.alphas = []
        self.errors = []
        self.sample_weights_history = []

    def train(self, X, y):
        """Fit ``n_classifiers`` weak classifiers on (X, y).

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: array of -1 / +1 labels, one per row of X.
        """
        n_samples, _ = X.shape

        # Start from a clean slate so that calling train() again does not mix
        # the previous ensemble and its history into the new one.
        self.classifiers = []
        self.alphas = []
        self.errors = []
        self.sample_weights_history = []

        eps = 1e-10  # keeps the log argument finite and positive
        weights = np.full(n_samples, 1 / n_samples)
        for _ in range(self.n_classifiers):
            classifier = WeakClassifier()
            classifier.train(X, y, weights)
            predictions = classifier.predict(X)

            # Weighted error of this round's classifier under current weights.
            error = np.dot(weights, predictions != y)
            # The numerator is floored as well, so an error of 1 yields a
            # large negative alpha instead of log(0) = -inf.
            alpha = 0.5 * np.log(max(1 - error, eps) / (error + eps))

            # For positive alpha, misclassified samples (y * prediction == -1)
            # are scaled up and correct ones scaled down; then renormalize.
            weights *= np.exp(-alpha * y * predictions)
            weights /= np.sum(weights)

            self.classifiers.append(classifier)
            self.alphas.append(alpha)
            self.errors.append(error)
            self.sample_weights_history.append(weights.copy())

    def predict(self, X):
        """Return the alpha-weighted majority vote as -1.0 / +1.0 per row of X.

        An exactly tied vote (weighted sum of 0) is resolved to +1.0 so that
        the result is always a valid class label.
        """
        clf_preds = [alpha * clf.predict(X) for clf, alpha in zip(self.classifiers, self.alphas)]
        score = np.sum(clf_preds, axis=0)
        return np.where(score >= 0, 1.0, -1.0)
# Train the model: fit a 10-round AdaBoost ensemble on the training split.
adaboost = AdaBoost(n_classifiers=10)
adaboost.train(X_train, y_train)
# Visualization 1: decision boundary
def plot_decision_boundary(X, y, model, ax):
    """Shade the model's predicted regions on ``ax`` and overlay the samples.

    Args:
        X: 2-D array whose first two columns are the plotted features.
        y: per-sample values used to colour the scatter points.
        model: object whose ``predict`` accepts an (n, 2) array of grid points.
        ax: axes-like object providing ``contourf`` and ``scatter``.
    """
    margin, step = 1, 0.1
    first, second = X[:, 0], X[:, 1]

    # Grid covering the data range, padded by `margin` on every side.
    grid_x = np.arange(first.min() - margin, first.max() + margin, step)
    grid_y = np.arange(second.min() - margin, second.max() + margin, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict once for every grid node, then fold back into the grid shape.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = model.predict(grid_points).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
    ax.scatter(first, second, c=y, cmap='coolwarm', edgecolor='k', s=20)
# Figure 1: decision regions of the trained ensemble with the test samples on top.
fig, ax = plt.subplots(1, 1, figsize=(10, 6))
plot_decision_boundary(X_test, y_test, adaboost, ax)
ax.set_title("Adaboost Decision Boundary")
plt.show()

# Visualization 2: sample weights recorded after each boosting round.
plt.figure(figsize=(10, 6))
for i, weights in enumerate(adaboost.sample_weights_history):
    sample_ids = range(1, len(weights) + 1)
    plt.plot(sample_ids, weights, label=f'Iteration {i + 1}')
plt.xlabel('Sample Index')
plt.ylabel('Sample Weight')
plt.title('Sample Weight Distribution Over Iterations')
plt.legend(loc='upper right')
plt.show()


def _plot_per_iteration(values, color, ylabel, title):
    """Draw one value per boosting round as a marked line chart and show it."""
    rounds = range(1, len(values) + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(rounds, values, marker='o', color=color)
    plt.xlabel('Iteration')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.show()


# Visualization 3: vote weight (alpha) of each weak classifier.
_plot_per_iteration(
    adaboost.alphas,
    'b',
    'Alpha (Classifier Weight)',
    'Weak Classifier Weights (Alpha) Over Iterations',
)

# Visualization 4: weighted error recorded for each round.
_plot_per_iteration(
    adaboost.errors,
    'r',
    'Error Rate',
    'Model Error Rate Over Iterations',
)
1. 決策邊界圖:展示了模型在特征空間中的分類邊界,用不同顏色表示模型的分類區域,幫助直觀了解模型如何劃分樣本空間。
2. 樣本權重分布圖:顯示了每一輪訓練后樣本的權重。隨著迭代增加,錯誤分類的樣本逐漸獲得更高的權重,而容易分類的樣本權重降低。這表明模型逐漸關注難以分類的樣本。
3. 分類器權重變化圖:展示每一輪弱分類器的權重 $\alpha$。權重較高的分類器對最終決策影響更大;權重的波動反映了各個弱分類器的相對重要性。
4. 模型誤差隨迭代次數變化圖:展示模型每輪的錯誤率(代碼中記錄的是每一輪弱分類器在當前樣本權重下的加權錯誤率)。通常情況下,錯誤率會逐漸下降,表明模型在逐步提高對數據的擬合度。
這些數據分析圖表提供了Adaboost訓練過程中模型學習與調整的細節,有助于大家更深入地理解Adaboost的工作原理。