亚洲午夜精品久久久久久app,久久99亚洲精品久久频,国产你懂的在线

Python 實現 LightGBM

我們將使用 Kaggle 數據集 “Bike Sharing Dataset”，手動實現一個簡化版的梯度提升樹模型（即 LightGBM 的核心思想，而非直接調用 LightGBM 庫），用于預測自行車共享的使用量。

首先，導入數據并對數據進行基本處理。我們會選擇少量特征進行簡化操作。

在訓練過程中，我們手動實現一個簡化的分裂算法：對每個特征的每個候選閾值，把樣本分成“小于等于閾值”和“大于閾值”兩部分，計算兩部分按樣本數加權的均方誤差（等價于殘差平方和除以樣本總數），并選擇誤差最小的特征和閾值作為最佳分裂點。

在每一輪迭代中，先計算當前預測值與真實值之間的殘差（即平方損失的負梯度），再訓練一棵新樹去擬合這些殘差，并按學習率把新樹的預測累加到模型輸出上，從而逐步改進模型。

我們可以生成以下 4 個分析圖表：

特征分布圖：以散點圖顯示各特征（溫度、濕度、風速）與自行車使用量之間的關系。

損失函數下降圖：展示模型迭代過程中損失函數的變化趨勢。

特征重要性圖：分析哪些特征在模型中最重要。

預測值與實際值比較圖：展示預測結果與實際值之間的差異。

完整代碼給到大家~

# Simplified gradient-boosted regression trees (the core idea behind LightGBM),
# trained on the Bike Sharing dataset. Running this file as a script trains the
# model and draws four diagnostic figures.
import numpy as np
import pandas as pd

# Input columns, with the legend label and colour used for each when plotting.
FEATURE_COLUMNS = ['temp', 'hum', 'windspeed']
FEATURE_LABELS = ['Temperature', 'Humidity', 'Windspeed']
FEATURE_COLORS = ['blue', 'green', 'red']
TARGET_COLUMN = 'cnt'


def mse(y_true, y_pred):
    """Return the mean squared error between ``y_true`` and ``y_pred``."""
    return np.mean((y_true - y_pred) ** 2)


def gradient(y_true, y_pred):
    """Return the residuals ``y_true - y_pred`` that the next tree is fitted to."""
    return y_true - y_pred


class SimpleTree:
    """Regression tree grown greedily by minimising the weighted child MSE.

    After ``fit`` a node is either a leaf (``feature is None`` and ``value``
    holds the mean target of its samples) or an internal node, where
    ``feature``/``threshold`` describe the split, ``left``/``right`` are the
    child ``SimpleTree`` nodes and ``gain`` is the reduction in summed squared
    error achieved by the split.
    """

    def __init__(self, max_depth=3, min_samples_split=10):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split

    def _best_split(self, X, y):
        """Return ``(feature, threshold, weighted_mse)`` for the best split.

        Returns ``(None, None, inf)`` when no threshold separates the samples
        into two non-empty groups.
        """
        n_samples, n_features = X.shape
        best_feature, best_threshold, best_mse = None, None, float('inf')
        for feature in range(n_features):
            column = X[:, feature]
            for threshold in np.unique(column):
                left = y[column <= threshold]
                right = y[column > threshold]
                # A one-sided split (e.g. threshold == column maximum) is not a
                # split at all; taking the mean of the empty side would only
                # yield NaN and a RuntimeWarning.
                if left.size == 0 or right.size == 0:
                    continue
                mse_val = (len(left) * mse(left, left.mean())
                           + len(right) * mse(right, right.mean())) / n_samples
                # Strict "<" keeps the first of several equally good splits.
                if mse_val < best_mse:
                    best_feature, best_threshold, best_mse = feature, threshold, mse_val
        return best_feature, best_threshold, best_mse

    def fit(self, X, y, depth=0):
        """Grow the (sub)tree rooted at this node and return ``self``.

        Args:
            X: 2-D array of shape (n_samples, n_features).
            y: 1-D array of targets aligned with the rows of ``X``.
            depth: Depth of this node; callers normally leave it at 0.
        """
        # Reset node state so that refitting an instance never leaves a stale
        # leaf value or stale children behind.
        self.feature = self.threshold = self.left = self.right = self.value = None
        self.gain = 0.0

        if depth < self.max_depth and len(y) >= self.min_samples_split:
            feature, threshold, child_mse = self._best_split(X, y)
            if feature is not None:
                n_samples = len(y)
                self.feature, self.threshold = feature, threshold
                # Parent SSE minus children SSE; clamp tiny negative rounding noise.
                self.gain = max(n_samples * (mse(y, y.mean()) - child_mse), 0.0)
                left_idx = X[:, feature] <= threshold
                right_idx = X[:, feature] > threshold
                self.left = SimpleTree(self.max_depth, self.min_samples_split).fit(
                    X[left_idx], y[left_idx], depth + 1)
                self.right = SimpleTree(self.max_depth, self.min_samples_split).fit(
                    X[right_idx], y[right_idx], depth + 1)
                return self

        self.value = y.mean()
        return self

    def predict(self, X):
        """Return a 1-D array with one prediction per row of ``X``."""
        if self.feature is None:
            return np.full(X.shape[0], self.value)
        mask = X[:, self.feature] <= self.threshold
        y_pred = np.empty(X.shape[0])
        y_pred[mask] = self.left.predict(X[mask])
        y_pred[~mask] = self.right.predict(X[~mask])
        return y_pred

    def add_split_gains(self, totals):
        """Add the gain of every split in this subtree to ``totals[feature]`` in place."""
        if self.feature is None:
            return
        totals[self.feature] += self.gain
        self.left.add_split_gains(totals)
        self.right.add_split_gains(totals)


class SimpleGBM:
    """Gradient boosting for squared error using ``SimpleTree`` base learners.

    Attributes set by ``fit``:
        trees: The fitted trees, in boosting order.
        train_loss_: Training MSE after each boosting iteration.
        feature_importances_: Per-feature share of the total split gain
            (sums to 1, or all zeros if no split reduced the error).
    """

    def __init__(self, n_estimators=10, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.train_loss_ = []
        self.feature_importances_ = None

    def fit(self, X, y):
        """Fit ``n_estimators`` trees to successive residuals and return ``self``."""
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        # Start from a clean slate: without this a second call to fit() would
        # keep the previous trees and predict() would sum both ensembles.
        self.trees = []
        self.train_loss_ = []
        gains = np.zeros(X.shape[1])

        y_pred = np.zeros(len(y))
        for _ in range(self.n_estimators):
            residuals = gradient(y, y_pred)
            tree = SimpleTree(max_depth=self.max_depth).fit(X, residuals)
            y_pred += self.learning_rate * tree.predict(X)
            self.trees.append(tree)
            # Recording the loss here replaces retraining one model per
            # iteration just to draw the loss curve.
            self.train_loss_.append(float(mse(y, y_pred)))
            tree.add_split_gains(gains)

        total_gain = gains.sum()
        self.feature_importances_ = gains / total_gain if total_gain > 0 else gains
        return self

    def predict(self, X):
        """Return the sum of the learning-rate-scaled predictions of all trees."""
        X = np.asarray(X)
        y_pred = np.zeros(X.shape[0])
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred


def load_data(path="bike_sharing.csv"):
    """Read the dataset and strip stray whitespace from the column names."""
    data = pd.read_csv(path)
    data.columns = data.columns.str.strip()
    return data


def plot_results(data, y, model, predictions):
    """Draw the four diagnostic figures for a fitted ``model``."""
    # Imported lazily so the model classes stay importable on machines
    # without matplotlib or a display backend.
    import matplotlib.pyplot as plt

    # Figure 1: each feature plotted against the usage count.
    plt.figure(figsize=(10, 5))
    for column, label, color in zip(FEATURE_COLUMNS, FEATURE_LABELS, FEATURE_COLORS):
        plt.scatter(data[column], data[TARGET_COLUMN], color=color, label=label, alpha=0.5)
    plt.title('Feature Distribution')
    plt.xlabel('Feature Values')
    plt.ylabel('Bicycle Usage Count')
    plt.legend()
    plt.grid()
    plt.show()

    # Figure 2: training loss after each boosting iteration.
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, len(model.train_loss_) + 1), model.train_loss_,
             color='purple', marker='o')
    plt.title('Loss Function Decrease')
    plt.xlabel('Iteration')
    plt.ylabel('Loss Value')
    plt.grid()
    plt.show()

    # Figure 3: share of the total split gain contributed by each feature.
    plt.figure(figsize=(10, 5))
    plt.bar(FEATURE_LABELS, model.feature_importances_, color=FEATURE_COLORS)
    plt.title('Feature Importance')
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.grid()
    plt.show()

    # Figure 4: predictions against the actual values, sample by sample.
    plt.figure(figsize=(10, 5))
    plt.plot(y, label='Actual Value', color='black')
    plt.plot(predictions, label='Predicted Value', color='orange')
    plt.title('Predicted vs Actual Values')
    plt.xlabel('Sample Points')
    plt.ylabel('Bicycle Usage Count')
    plt.legend()
    plt.grid()
    plt.show()


def main():
    """Load the data, train the model and show the figures."""
    data = load_data()
    X = data[FEATURE_COLUMNS].values
    y = data[TARGET_COLUMN].values

    model = SimpleGBM(n_estimators=10, learning_rate=0.1, max_depth=3)
    model.fit(X, y)
    predictions = model.predict(X)

    plot_results(data, y, model, predictions)


if __name__ == "__main__":
    main()