已知每年各月对应的传染病感染人数,在运用LSTM对传染病进行预测时,训练集是2018年6月到2021年12月每个月感染人数的数据,测试集是2022年1月到2024年4月每个月感染人数的数据。对数据先后进行了一阶差分和归一化处理,结果显示测试集的预测值与真实值趋势相似,但是两者的误差非常大,想问问这是什么原因,该怎么解决?有哪些调整参数或者优化模型的方法?
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from keras.regularizers import L2
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from keras.callbacks import EarlyStopping
# 数据的差分转换
def difference(data_set, interval=1):
diff = list()
for i in range(interval, len(data_set)):
value = data_set[i] - data_set[i - interval]
diff.append(value)
return pd.Series(diff), data_set[0]
# 对预测的数据进行逆差分转换
def invert_difference(last_ob, yhat, interval=1):
return yhat + last_ob
# 将数据转换为监督学习集,移位后产生的NaN值补0
def timeseries_to_supervised(data, lag=1):
df = pd.DataFrame(data)
columns = [df.shift(i) for i in range(1, lag + 1)]
columns.append(df)
df = pd.concat(columns, axis=1)
df.fillna(0, inplace=True)
return df
# 归一化处理将数据缩放到[-1,1]之间
def scale(train, test):
scaler = MinMaxScaler(feature_range=(-1 , 1))
scaler = scaler.fit(train)
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)
return scaler, train_scaled, test_scaled
# 将预测值进行逆缩放,使用之前训练好的缩放器,x为一维数组,y为实数
def invert_scale(scaler, X, y):
new_row = [x for x in X] + [y]
array = np.array(new_row)
array = array.reshape(1, len(array))
invert = scaler.inverse_transform(array)
return invert[0, -1]
# 构建一个LSTM模型
def fit_lstm(train, batch_size, nb_epoch, neurons, dropout=0.2, reg_lambda=0.01):
X, y = train[:, 0:-1], train[:, -1]
X = X.reshape(X.shape[0], 1, X.shape[1])
model = Sequential()
model.add(LSTM(neurons, batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True, return_sequences=True))
model.add(Dropout(dropout))
model.add(LSTM(neurons, stateful=True, return_sequences=True))
model.add(Dropout(dropout))
model.add(LSTM(neurons, stateful=True, kernel_regularizer=L2(reg_lambda))) # 添加L2正则化
model.add(Dropout(dropout))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='RMSProp')
# Early stopping适时停止训练
early_stopping = EarlyStopping(monitor='loss', patience=0, verbose=1)
for i in range(nb_epoch):
model.fit(X, y, epochs=1, batch_size=batch_size, verbose=1, shuffle=False, callbacks=[early_stopping])
model.reset_states()
return model
# 开始单步预测
def forecast_lstm(model, batch_size, X):
X = np.array(X)
X = X.reshape(1, 1, len(X))
yhat = model.predict(X, batch_size=batch_size)
return yhat[0, 0]
# 读取数据
excel_file_path = r"D:\pyproject\LSTM\nums.xlsx"
data = pd.read_excel(excel_file_path, engine='openpyxl', parse_dates=['时间'])
print(data.columns)
# 将时间数据转换为字符串类型,并按照年月格式进行解析
data['时间'] = data['时间'].dt.strftime('%Y年%m月')
data['时间'] = pd.to_datetime(data['时间'])
# 将原数据转换为二维数组形式
raw_values = data['发病数'].values
# 将数据进行差分转换
diff_values, first_value = difference(raw_values, 1)
diff_values = diff_values.values
# 将序列形式的数据转换为监督学习集形式
supervised = timeseries_to_supervised(diff_values, 1)
supervised_values = supervised.values
# 将数据集分割为训练集和测试集
testNum = 25
train, test = supervised_values[:43], supervised_values[-testNum:]
# 将训练集和测试集都缩放到[-1,1]之间
scaler, train_scaled, test_scaled = scale(train, test)
# 构建一个LSTM模型并训练
lstm_model = fit_lstm(train_scaled, 1, 300, 10, 0.2, reg_lambda=0.01)
# 遍历测试集,对数据进行单步预测
predictions = list()
for i in range(len(test_scaled)):
X, y = test_scaled[i, 0:-1], test_scaled[i, -1]
yhat = forecast_lstm(lstm_model, 1, X)
yhat = invert_scale(scaler, X, yhat)
last_ob = raw_values[-len(test_scaled) + i - 1] if i > 0 else first_value
yhat = invert_difference(last_ob, yhat, 1)
predictions.append(yhat)
# 获取测试集的时间索引
test_index = data.index[-testNum:]
true_values = data['发病数'].tail(testNum).values
# 打印预测值和原始数据
print("\n预测值:", predictions)
print("原始数据(测试集):", true_values)
# 计算评估指标
rmse = np.sqrt(mean_squared_error(true_values, predictions))
mae = mean_absolute_error(true_values, predictions)
r2 = r2_score(true_values, predictions)
print("Test RMSE:", rmse)
print("Test MAE:", mae)
print("Test R²:", r2)
#设置,使matplotlib能正常输出中文且符号显示不异常
import matplotlib
matplotlib.rc("font",family='SimSun')
plt.rcParams['axes.unicode_minus']=False
# 绘制预测值与实际值的图
plt.figure(figsize=(12, 6))
plt.plot(data.loc[test_index, '时间'], true_values, label='true') # 使用时间作为横坐标
plt.plot(data.loc[test_index, '时间'], predictions, label='predict') # 使用时间作为横坐标
plt.legend()
plt.title('附件二真实值与预测值对比')
plt.xlabel('time')
plt.ylabel('number')
plt.xticks(rotation=45)
plt.show()
# 检查预测值
print("预测值:", predictions)
以下是真实值和预测值的大小以及对比图
预测值: [55187.49152672291, -2051.1909727454176, 261.3303700089464, 9856.500291466713, 6470.358180373908, -87.71566048264413, -1116.5496807396403, 233.91898119449706, 2487.400110274554, 1839.2509351968774, -2154.191992402076, -1689.770840406417, -852.153939962386, 602.5681008100521, 5806.498772382737, 32219.26567184925, 48518.604459524155, 19874.70840358734, 13059.032410323622, 11653.124070167543, 3900.5170504450807, 1104.4910699129116, 106.93952131271476, -582.3792816400517, 223.9498740434658]
原始数据(测试集): [ 3309 5276 14545 10833 3941 2570 3575 5487 4509 204 381 960
2188 7199 33454 49632 20900 14025 12586 4823 2036 1064 414 1271
8105]
