import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # `pd` is used by read_excel below and by the rest of the script

# Use a CJK-capable font so the Chinese titles and legends render.
plt.rcParams['font.sans-serif'] = 'SimHei'
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus
# sign, which avoids missing-glyph boxes when a CJK font is active.
plt.rcParams['axes.unicode_minus'] = False

# Raw observations. Later code reads the 'Year', 'Day' and 'Temperature'
# columns from this frame.
df = pd.read_excel('data.xlsx')

2.2 數據預處理

2.2.1 數據轉換及缺失檢測

# Combine the year and day-of-year columns into a timestamp
# (`%j` parses the day of the year) and use it as the index.
df['Date'] = pd.to_datetime(
    df['Year'].astype(str) + '-' + df['Day'].astype(str),
    format='%Y-%j',
)
df.set_index('Date', inplace=True)
df.drop(['Year', 'Day'], axis=1, inplace=True)

# Full daily calendar the data is expected to cover.
start_date = pd.Timestamp('1990-01-01')
end_date = pd.Timestamp('2023-03-01')
date_range = pd.date_range(start=start_date, end=end_date, freq='D')

# Calendar days that do not appear in the DataFrame index.
missing_dates = date_range[~date_range.isin(df.index)]
print("Missing Dates:")
# Show the result itself; an empty index means the series has no gaps.
print(missing_dates)

代碼將DataFrame中的“Year”和“Day”列合并成日期,并設置為DataFrame的索引,然后生成一個完整的逐日時間范圍,檢查該范圍內的每個日期是否都出現在DataFrame的索引中,從而找出時間序列中缺失的日期。

2.2.2 數據劃分

# Share of rows given to each subset. `test_ratio` is informational only:
# the test subset is simply whatever remains after train and validation.
train_ratio, val_ratio, test_ratio = 0.7, 0.1, 0.2

# Row positions where the training and validation subsets end.
n_rows = len(df)
train_split = int(train_ratio * n_rows)
val_split = int((train_ratio + val_ratio) * n_rows)

# Positional, unshuffled split so the row order of `df` is preserved.
train_set, val_set, test_set = (
    df.iloc[:train_split],
    df.iloc[train_split:val_split],
    df.iloc[val_split:],
)

# One stacked panel per subset: (data, line colour, panel title).
plt.figure(figsize=(15, 10))
split_panels = (
    (train_set, 'g', 'train Temperature時序圖'),
    (val_set, 'b', 'val Temperature時序圖'),
    (test_set, 'r', 'test Temperature時序圖'),
)
for position, (subset, colour, caption) in enumerate(split_panels, start=1):
    plt.subplot(3, 1, position)
    plt.plot(subset, color=colour, alpha=0.3)
    plt.title(caption)

# Applies to the current axes, i.e. the last (test) panel.
plt.xticks(rotation=45)
plt.show()

數據集按照指定的比例劃分為訓練集、驗證集和測試集,并繪制它們的時序圖,訓練集用于訓練模型,驗證集用于調整模型超參數和評估性能,測試集用于評估模型在未知數據上的性能。

2.2.3 歸一化數據

from sklearn.preprocessing import MinMaxScaler

def normalize_dataframe(train_set, val_set, test_set):
    """Min-max scale the three splits using training-set statistics only.

    The scaler is fitted on ``train_set`` alone and then applied to all three
    frames, so no information from the validation or test data enters the
    scaling parameters.

    Args:
        train_set: DataFrame used to fit the scaler.
        val_set: Validation DataFrame with the same columns.
        test_set: Test DataFrame with the same columns.

    Returns:
        Tuple ``(train, val, test)`` of new DataFrames holding the scaled
        values, each keeping the columns and index of its input frame.
    """
    scaler = MinMaxScaler()
    scaler.fit(train_set)  # fit on the training split only

    def _scale(frame):
        # Re-wrap the transformed array so column labels and index survive.
        return pd.DataFrame(
            scaler.transform(frame), columns=frame.columns, index=frame.index
        )

    return _scale(train_set), _scale(val_set), _scale(test_set)

# Scale every split with the scaler fitted on the training split.
train, val, test = normalize_dataframe(train_set, val_set, test_set)

# One stacked panel per scaled subset: (data, line colour, panel title).
plt.figure(figsize=(15, 10))
scaled_panels = (
    (train, 'g', 'train Temperature歸一化時序圖'),
    (val, 'b', 'val Temperature歸一化時序圖'),
    (test, 'r', 'test Temperature歸一化時序圖'),
)
for position, (scaled, colour, caption) in enumerate(scaled_panels, start=1):
    plt.subplot(3, 1, position)
    plt.plot(scaled, color=colour, alpha=0.3)
    plt.title(caption)

# Applies to the current axes, i.e. the last (test) panel.
plt.xticks(rotation=45)
plt.show()

將訓練集、驗證集和測試集進行歸一化,并繪制歸一化后的時序圖,這里歸一化采用訓練集統計指標避免出現數據泄露。

2.2.4 時間窗口劃分

def prepare_data(data, win_size):
    """Cut a series into sliding input windows and one-step-ahead targets.

    Args:
        data: Sequence or array of observations, indexed along its first axis.
        win_size: Number of consecutive observations in each input window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X[i]`` holds
        ``data[i:i + win_size]`` with a trailing singleton axis appended and
        ``y[i]`` is ``data[i + win_size]``. For 1-D input of length ``n`` the
        shapes are ``(n - win_size, win_size, 1)`` and ``(n - win_size,)``.
        If ``data`` has no more than ``win_size`` elements, both arrays are
        empty and ``X`` still has the window axis: ``(0, win_size, 1)``.

    Raises:
        ValueError: If ``win_size`` is negative.
    """
    if win_size < 0:
        raise ValueError("win_size must be non-negative")

    data = np.asarray(data)
    n_samples = max(len(data) - win_size, 0)

    # Row i of `positions` is [i, i + 1, ..., i + win_size - 1], so a single
    # indexing operation gathers every window and keeps the window axis even
    # when there are zero samples.
    positions = np.arange(n_samples)[:, None] + np.arange(win_size)[None, :]
    X = data[positions]

    # Target for window i is the observation right after it. Copy so the
    # result does not alias the caller's array.
    y = data[win_size:win_size + n_samples].copy()

    X = np.expand_dims(X, axis=-1)
    return X, y

# Number of past observations fed to the model for each prediction.
win_size = 30

# Window the scaled 'Temperature' column of each split independently, so no
# window straddles a split boundary.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    prepare_data(split['Temperature'].values, win_size)
    for split in (train, val, test)
]

# Report the resulting array shapes for each split.
for caption, features, targets in (
    ("訓練集形狀:", X_train, y_train),
    ("驗證集形狀:", X_val, y_val),
    ("測試集形狀:", X_test, y_test),
):
    print(caption, features.shape, targets.shape)

這里的劃分方式為單特征單步預測,時間窗口長度為30。

2.3 BiLSTM模型構建

2.3.1 BiLSTM模型編譯訓練

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense

# Input window dimensions taken from the training tensor.
timesteps, n_features = X_train.shape[1], X_train.shape[2]

# Bidirectional LSTM encoder followed by progressively narrower dense layers
# and a single linear output unit for the one-step-ahead value.
model_bilstm = Sequential()
model_bilstm.add(
    Bidirectional(LSTM(128, activation='relu'), input_shape=(timesteps, n_features))
)
for width in (64, 32, 16):
    model_bilstm.add(Dense(width, activation='relu'))
model_bilstm.add(Dense(1))

model_bilstm.compile(optimizer='adam', loss='mse')
history = model_bilstm.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
)

# Training and validation loss per epoch.
plt.figure()
for curve, colour in (('loss', 'b'), ('val_loss', 'g')):
    plt.plot(history.history[curve], c=colour, label=curve)
plt.legend()
plt.show()
model_bilstm.summary()

2.3.2 BiLSTM模型評價

from sklearn import metrics
from sklearn.metrics import r2_score  # goodness of fit (R^2)

# Predictions on the test windows; one row per sample.
y_pred = model_bilstm.predict(X_test)
# Flatten once to 1-D so the predictions line up with y_test.
flat_pred = np.ravel(y_pred)

# All metrics are computed on the normalised scale, because y_test was built
# from the scaled test split.
mse = metrics.mean_squared_error(y_test, flat_pred)   # mean squared error
rmse = np.sqrt(mse)                                   # root mean squared error
mae = metrics.mean_absolute_error(y_test, flat_pred)  # mean absolute error
r2 = r2_score(y_test, flat_pred)

for caption, value in (
    ("均方誤差 (MSE):", mse),
    ("均方根誤差 (RMSE):", rmse),
    ("平均絕對誤差 (MAE):", mae),
    ("擬合優度:", r2),
):
    print(caption, value)

2.3.3 BiLSTM模型向后預測及可視化

# Number of future time steps to forecast recursively.
steps = 10

# Seed window: the last test window shifted forward by one step and completed
# with the model's own prediction for the final test target.
# np.append flattens its inputs, so this assumes a single input feature.
last_output = model_bilstm.predict(X_test)[-1]
window = np.append(X_test[-1][1:], last_output)

predicted = []
for _ in range(steps):
    # Shape the current window as a batch of one sample for the model.
    input_data = window.reshape(1, X_test.shape[1], X_test.shape[2])
    next_output = model_bilstm.predict(input_data)
    predicted.append(next_output[0][0])
    last_output = next_output[0]
    # Slide the window: drop the oldest value and append the new prediction,
    # so each forecast builds on all previous forecasts rather than on the
    # same fixed test window.
    window = np.append(window[1:], last_output)

# Undo the min-max scaling with the training-set extrema of the target column.
# Taking them from the 'Temperature' column guarantees scalars; np.max on a
# whole DataFrame may give a per-column Series or a scalar depending on the
# pandas version.
df_max = train_set['Temperature'].max()
df_min = train_set['Temperature'].min()

series_1 = np.array(predicted) * (df_max - df_min) + df_min
test_pred = y_pred * (df_max - df_min) + df_min

# Align the predictions with the data instead of hard-coded dates: the test
# predictions cover the last len(y_pred) test timestamps (the first window
# has no prediction), and the forecast continues daily after the last
# observation.
pred_index = test_set.index[len(test_set) - len(y_pred):]
future_index = pd.date_range(
    start=test_set.index[-1] + pd.Timedelta(days=1),
    periods=len(series_1),
    freq='D',
)

plt.figure(figsize=(15, 4), dpi=300)
for panel in (1, 2, 3):
    plt.subplot(3, 1, panel)
    if panel == 1:
        # Only the first panel shows the whole history.
        plt.plot(train_set, color='c', label='訓練集')
        plt.plot(val_set, color='r', label='驗證集')
    plt.plot(test_set, color='b', label='測試集')
    plt.plot(pred_index, test_pred, color='y', label='測試集預測')
    plt.plot(future_index, series_1, color='magenta', linestyle='-.', label='未來預測')
    if panel == 3:
        # Zoom in: from 2022 up to the end of the future forecast.
        plt.xlim(pd.Timestamp('2022-01-01'), future_index[-1])
    plt.legend()
plt.show()

2.4 CNN-BiLSTM模型構建

2.4.1 CNN-BiLSTM模型編譯訓練

from tensorflow.keras.layers import Conv1D, MaxPooling1D, Reshape, Flatten
# Input window dimensions taken from the training tensor.
timesteps, n_features = X_train.shape[1], X_train.shape[2]
lstm_units = 128

model_cnn_bilstm = Sequential()
model_cnn_bilstm.add(
    Bidirectional(LSTM(lstm_units, activation='relu'), input_shape=(timesteps, n_features))
)
# Reshape the 2-D recurrent output to 3-D so Conv1D can consume it; the
# length is twice the LSTM width (2 * 128 = 256).
model_cnn_bilstm.add(Reshape((2 * lstm_units, 1)))
model_cnn_bilstm.add(Conv1D(filters=64, kernel_size=7, activation='relu'))
model_cnn_bilstm.add(MaxPooling1D(pool_size=2))
# Flatten the pooled feature maps into one vector per sample.
model_cnn_bilstm.add(Flatten())
for width in (32, 16):
    model_cnn_bilstm.add(Dense(width, activation='relu'))
model_cnn_bilstm.add(Dense(1))

model_cnn_bilstm.compile(optimizer='adam', loss='mse')
history = model_cnn_bilstm.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
)

# Training and validation loss per epoch.
plt.figure()
for curve, colour in (('loss', 'b'), ('val_loss', 'g')):
    plt.plot(history.history[curve], c=colour, label=curve)
plt.legend()
plt.show()
model_cnn_bilstm.summary()

2.4.2 CNN-BiLSTM模型評價

from sklearn.metrics import r2_score

# Predictions on the test windows; one row per sample.
y_pred = model_cnn_bilstm.predict(X_test)
# Flatten once to 1-D so the predictions line up with y_test.
flat_pred = np.ravel(y_pred)

# All metrics are computed on the normalised scale, because y_test was built
# from the scaled test split.
mse = metrics.mean_squared_error(y_test, flat_pred)
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_test, flat_pred)
r2 = r2_score(y_test, flat_pred)

for caption, value in (
    ("均方誤差 (MSE):", mse),
    ("均方根誤差 (RMSE):", rmse),
    ("平均絕對誤差 (MAE):", mae),
    ("擬合優度:", r2),
):
    print(caption, value)

2.4.3 CNN-BiLSTM模型向后預測及可視化

# Number of future time steps to forecast recursively.
steps = 10

# Seed window: the last test window shifted forward by one step and completed
# with the model's own prediction for the final test target.
# np.append flattens its inputs, so this assumes a single input feature.
last_output = model_cnn_bilstm.predict(X_test)[-1]
window = np.append(X_test[-1][1:], last_output)

predicted = []
for _ in range(steps):
    # Shape the current window as a batch of one sample for the model.
    input_data = window.reshape(1, X_test.shape[1], X_test.shape[2])
    next_output = model_cnn_bilstm.predict(input_data)
    predicted.append(next_output[0][0])
    last_output = next_output[0]
    # Slide the window: drop the oldest value and append the new prediction,
    # so each forecast builds on all previous forecasts rather than on the
    # same fixed test window.
    window = np.append(window[1:], last_output)

# Undo the min-max scaling with the training-set extrema of the target column.
# They are recomputed here as scalars so this section does not depend on how
# an earlier section derived them.
df_max = train_set['Temperature'].max()
df_min = train_set['Temperature'].min()

series_2 = np.array(predicted) * (df_max - df_min) + df_min
test_pred = y_pred * (df_max - df_min) + df_min

# Align the predictions with the data instead of hard-coded dates: the test
# predictions cover the last len(y_pred) test timestamps (the first window
# has no prediction), and the forecast continues daily after the last
# observation.
pred_index = test_set.index[len(test_set) - len(y_pred):]
future_index = pd.date_range(
    start=test_set.index[-1] + pd.Timedelta(days=1),
    periods=len(series_2),
    freq='D',
)

plt.figure(figsize=(15, 4), dpi=300)
for panel in (1, 2, 3):
    plt.subplot(3, 1, panel)
    if panel == 1:
        # Only the first panel shows the whole history.
        plt.plot(train_set, color='c', label='訓練集')
        plt.plot(val_set, color='r', label='驗證集')
    plt.plot(test_set, color='b', label='測試集')
    plt.plot(pred_index, test_pred, color='y', label='測試集預測')
    plt.plot(future_index, series_2, color='magenta', linestyle='-.', label='未來預測')
    if panel == 3:
        # Zoom in: from 2022 up to the end of the future forecast.
        plt.xlim(pd.Timestamp('2022-01-01'), future_index[-1])
    plt.legend()
plt.show()

文章轉自微信公眾號@Python機器學習AI

上一篇:

探討EMD數據泄露問題的時序預測模型:EMD-CNN-LSTM實現與分析

下一篇:

時間窗口劃分:時序預測模型的多種形式解析

我們有何不同?

API服務商零注冊

多API并行試用

數據驅動選型,提升決策效率

查看全部API→
??

熱門場景實測,選對API

#AI文本生成大模型API

對比大模型API的內容創意新穎性、情感共鳴力、商業轉化潛力

25個渠道
一鍵對比試用API 限時免費

#AI深度推理大模型API

對比大模型API的邏輯推理準確性、分析深度、可視化建議合理性

10個渠道
一鍵對比試用API 限時免費