我有一个python随机森林代码,读取excel文件进行随机森林以及算100次决定系数的平均数R2,最后用shap来解释模型。但是我遇到了一个问题
在运行上面的数据时,可以运行。
但是同一段代码,在运行下面的数据时,会报错,这是为什么?明明数据都差不多。
以下是我的python代码:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
from tqdm import tqdm
# 假设您已经加载了数据
file_path = r"C:\Users\lenovo\Desktop\U2_R11.xlsx" # 替换为Excel文件路径
data = pd.read_excel(file_path)
# 准备特征和目标变量
X = data.iloc[:, :-1] # 提取所有特征,假设最后一列是目标变量
y = data.iloc[:, -1] # 提取目标变量
# 测试次数
num_iterations = 100
r2_scores = []
shap_values_list=[]
pbar = tqdm(total=num_iterations)
for i in range(num_iterations):
# 拆分数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
# 训练模型
model = RandomForestRegressor(n_estimators=100, random_state=i)
model.fit(X_train, y_train)
# 在测试集上进行预测
y_pred = model.predict(X_test)
# 计算 R² 值
r2 = r2_score(y_test, y_pred)
r2_scores.append(r2) # 将 R² 值添加到列表中
# 使用 SHAP 计算 SHAP 值
explainer = shap.Explainer(model, X_train)
shap_values = explainer(X)
shap_values_list.append(shap_values.values)
pbar.update(1)
pbar.close()
# 计算平均 R² 值和标准差
mean_r2 = np.mean(r2_scores)
std_r2 = np.std(r2_scores)
print("平均决定系数R² 经过 {} 次迭代值: {:.4f}".format(num_iterations, mean_r2))
print("R² 方差得分: {:.4f}".format(std_r2))
# 计算所有 SHAP 值的平均值
mean_shap_values = np.mean(shap_values_list, axis=0)
# 使用 SHAP 汇总可视化柱状图
plt.figure(figsize=(10, 8))
plt.title("SHAP Values Summary")
shap.summary_plot(mean_shap_values, X, plot_type="bar")
plt.show()
以下是报错内容:
4%|▍ | 4/100 [00:00<00:05, 18.92it/s]Traceback (most recent call last):
File "C:\Users\lenovo\Desktop\因果代码\因果模型\刑事侦察题目\RF.py", line 41, in
shap_values = explainer(X)
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\shap\explainers_tree.py", line 233, in call
v = self.shap_values(X, y=y, from_call=True, check_additivity=check_additivity, approximate=self.approximate)
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\shap\explainers_tree.py", line 446, in shap_values
self.assert_additivity(out, self.model.predict(X))
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\shap\explainers_tree.py", line 579, in assert_additivity
check_sum(self.expected_value + phi.sum(-1), model_output)
File "C:\Users\lenovo\AppData\Local\Programs\Python\Python39\lib\site-packages\shap\explainers_tree.py", line 573, in check_sum
raise ExplainerError(err_msg)
shap.utils._exceptions.ExplainerError: Additivity check failed in TreeExplainer! Please ensure the data matrix you passed to the explainer is the same shape that the model was trained on. If your data shape is correct then please report this on GitHub. This check failed because for one of the samples the sum of the SHAP values was 7.147069, while the model output was 7.223052. If this difference is acceptable you can set check_additivity=False to disable this check.
5%|▌ | 5/100 [00:00<00:07, 12.35it/s]