ValueError: Booster.get_score() results in empty. This maybe caused by having all trees as decision dumps.
import os
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
import matplotlib.pyplot as plt
import numpy as np
import docx
# Batch workflow: for every .xlsx workbook under ``dir_path`` tune an XGBoost
# regressor on the first sheet, fit and score it on each middle sheet, predict
# on the last sheet, and write a Word report, a prediction workbook and a
# feature-importance image into ``pre_path``.
dir_path = 'F:/Test/Test/Test/Test/Test'
pre_path = 'F:/Test/Test/Test/Test/Test/Pre'

# Column layout shared by all sheets: positional columns 6..20 are the
# features and positional column 21 is the target.
FEATURE_COLUMNS = slice(6, 21)
TARGET_COLUMN = 21
# Fraction of the date-sorted rows used for training; the rest is the test set.
TRAIN_FRACTION = 0.75


def find_excel_files(root, exclude_dir=None):
    """Return the paths of all ``.xlsx`` files below *root*, in walk order.

    Args:
        root: Directory that is walked recursively.
        exclude_dir: Optional directory that is not descended into. The
            output directory lives below the input directory, so without
            this the prediction workbooks written by an earlier run would
            be picked up as inputs on the next run.

    Returns:
        A list of file paths. Names starting with ``~$`` (lock files Excel
        creates next to an open workbook) are skipped because they are not
        readable workbooks.
    """
    excluded = None
    if exclude_dir is not None:
        excluded = os.path.normcase(os.path.abspath(exclude_dir))

    found = []
    for dp, dn, filenames in os.walk(root):
        if excluded is not None:
            # Prune in place so os.walk never enters the excluded directory.
            dn[:] = [
                d for d in dn
                if os.path.normcase(os.path.abspath(os.path.join(dp, d))) != excluded
            ]
        found.extend(
            os.path.join(dp, f)
            for f in filenames
            if f.endswith('.xlsx') and not f.startswith('~$')
        )
    return found


def split_train_test(df):
    """Sort *df* by the '日期' column and split it into train/test parts.

    Args:
        df: Sheet contents; must contain a '日期' column and at least 22
            columns so that the feature and target positions exist.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` where the first
        ``int(TRAIN_FRACTION * len(df))`` rows form the training part.
    """
    df = df.sort_values(by='日期')
    cutoff = int(TRAIN_FRACTION * len(df))
    train_df, test_df = df[:cutoff], df[cutoff:]
    return (
        train_df.iloc[:, FEATURE_COLUMNS],
        train_df.iloc[:, TARGET_COLUMN],
        test_df.iloc[:, FEATURE_COLUMNS],
        test_df.iloc[:, TARGET_COLUMN],
    )


def format_predictions(y_p):
    """Format one or many predicted values with five decimals, comma separated."""
    return ", ".join(f"{float(value):.5f}" for value in np.ravel(y_p))


def tune_model(x_train, y_train):
    """Grid-search the XGBoost hyper-parameters and return the best estimator."""
    params = {
        'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
        'max_depth': [3, 4, 5, 6, 7],
        'n_estimators': [50, 100, 150, 200, 250],
        'gamma': [0, 0.01, 0.05, 0.1, 0.2],
    }
    base_model = xgb.XGBRegressor(objective='reg:squarederror', booster='gbtree')
    grid_search = GridSearchCV(base_model, params, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(x_train, y_train)
    return grid_search.best_estimator_


def process_workbook(file, result_doc):
    """Train, evaluate and predict for one workbook and record the results.

    The first sheet is used for hyper-parameter tuning, every sheet between
    the first and the last is used to fit the tuned estimator again and to
    score it on that sheet's test split, and the last sheet supplies the
    rows to predict.

    Args:
        file: Path of the workbook to process.
        result_doc: python-docx document that receives the report paragraphs.
    """
    xgb_model = None
    # Reset per workbook so values never leak from the previous file.
    mse = mae = r2 = None
    y_p = None

    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(file) as xls:
        last_index = len(xls.sheet_names) - 1
        for sheet_index, sheet_name in enumerate(xls.sheet_names):
            df = pd.read_excel(xls, sheet_name=sheet_name)
            if sheet_index == 0:
                # Hyper-parameter tuning.
                x_train, y_train, _, _ = split_train_test(df)
                xgb_model = tune_model(x_train, y_train)
            elif sheet_index < last_index:
                x_train, y_train, x_test, y_test = split_train_test(df)
                xgb_model.fit(x_train, y_train)
                y_pred = xgb_model.predict(x_test)
                mse = mean_squared_error(y_test, y_pred)
                mae = mean_absolute_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
            else:
                # Prediction sheet: only the feature columns are needed, so
                # the target column is deliberately not touched here.
                df = df.sort_values(by='日期')
                x_p = df.iloc[:, FEATURE_COLUMNS]
                y_p = xgb_model.predict(x_p)

    # Save the results to the report document.
    result_doc.add_paragraph(f"Excel file: {file}")
    if mse is None:
        # No middle sheet existed, so there is nothing to score.
        result_doc.add_paragraph("RMSE: N/A, MAE: N/A, R^2: N/A")
    else:
        result_doc.add_paragraph(f"RMSE: {np.sqrt(mse):.5f}, MAE: {mae:.5f}, R^2: {r2:.5f}")
    if y_p is None:
        # Single-sheet workbook: the only sheet was used for tuning.
        result_doc.add_paragraph("预测收益率为: rp: N/A")
    else:
        result_doc.add_paragraph(f"预测收益率为: rp: {format_predictions(y_p)}")
    result_doc.add_paragraph("\n")

    # Save the predicted returns to an Excel file, one numeric value per row.
    if y_p is not None:
        pd.DataFrame({'预测收益率': np.ravel(y_p)}).to_excel(
            os.path.join(pre_path, f'pr_{os.path.basename(file)}.xlsx'), index=False)

    # Plot the feature importance and save it as an image.
    # plot_importance raises "ValueError: Booster.get_score() results in
    # empty" when the fitted trees contain no splits at all, so check the
    # scores first and skip the plot instead of aborting the whole batch.
    # NOTE(review): an all-stump model usually points at the training data of
    # the last fitted sheet (e.g. constant target or very few rows) - confirm.
    if xgb_model.get_booster().get_score(importance_type='weight'):
        plot_importance(xgb_model)
        plt.savefig(os.path.join(pre_path, 'image', f'{os.path.basename(file)}.png'))
        plt.close()
    else:
        print(f"Skipped feature-importance plot for {file}: the fitted model has no splits.")


def main():
    """Process every workbook under ``dir_path`` and save the combined report."""
    # savefig does not create missing directories, so make sure they exist.
    os.makedirs(os.path.join(pre_path, 'image'), exist_ok=True)

    # Create the result document.
    result_doc = docx.Document()
    for file in find_excel_files(dir_path, exclude_dir=pre_path):
        process_workbook(file, result_doc)

    # Save the result document.
    result_doc.save(os.path.join(pre_path, 'result.docx'))
    print("程序结束")


if __name__ == "__main__":
    main()
请帮我看看这个问题应该怎么解决?