数据集是我们实验室实测得到的。我用随机森林回归模型计算出的特征重要性,与理论上的重要性顺序不一致;但同一份数据集在 MATLAB 中得到的顺序是正确的。不知道是我的代码写得有问题,还是随机森林回归器选错了。模型在测试集上的准确率(即 model.score 返回的 R²)非常高。希望有人能提出解决思路或帮忙解决,有偿。
这是我写的 Python 代码:
import shap
import pandas as pd
import numpy as np
import tqdm as notebook_tqdm
from sklearn.model_selection import train_test_split
shap.initjs() # notebook环境下,加载用于可视化的JS代码
np.set_printoptions(threshold=np.inf)
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification
# Load the measured data set. Column 0 is used as the regression target and
# every remaining column as a predictor.
data_url = "data"
raw_df = pd.read_csv(data_url)
print(raw_df.head())
print(raw_df.columns)
X = raw_df.values[:, 1:]
y = raw_df.values[:, 0:1]
# Display names used when reporting importances, one per column of X.
# NOTE(review): these names are hard-coded; they must be in exactly the same
# order as raw_df.columns[1:], otherwise every importance is attributed to the
# wrong variable -- confirm against the printed column list above.
X_columns = ['C', 'T', 'P', 'η', 'ρ', 'M', 'PKa', 'μ']
print(X.shape)
# Hold out 30 % of the rows for testing. The seed is fixed so that the split,
# and therefore the importance ranking, is reproducible from run to run.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print(X.shape)
# Feature scaling: the predictors are in different units, so their numeric
# ranges differ greatly.
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
sc = StandardScaler().fit(X)
X, X_test = sc.transform(X), sc.transform(X_test)
# Build the random-forest regressor: 1000 trees, squared-error split
# criterion, fixed seed, n_jobs=-1 to use all available processors.
model = RandomForestRegressor(
    n_estimators=1000,
    criterion='squared_error',
    random_state=0,
    n_jobs=-1,
)
# Train on the training split; the target is converted to a flat float vector.
model.fit(X, y.astype('double').ravel())
# Evaluate on the held-out split. For a regressor, score() returns the
# coefficient of determination R^2 (best value 1.0, can be negative), not a
# classification accuracy, so it is reported as R^2 rather than as a percentage.
# The variable keeps its original name so later code that reads it still works.
accuracy = model.score(X_test, y_test.astype('double').ravel())
print("R^2 on test set: %.4f" % accuracy)
# Impurity-based importance (mean decrease in impurity, MDI), computed from
# the training data while the trees are grown.
importance = model.feature_importances_
# Print the raw importance vector, in column order.
print(importance)
# Feature indices sorted from most to least important.
indices = np.argsort(importance)[::-1]
for rank, idx in enumerate(indices, start=1):
    print("%2d) %-*s %f" % (rank, 30, X_columns[idx], importance[idx]))

# Permutation importance on the held-out split: the drop in R^2 when one
# feature column is shuffled. MDI and permutation importance measure different
# things and can rank features differently. The MATLAB TreeBagger run uses
# 'OOBPredictorImportance', which per the MATLAB documentation is a
# permutation-based out-of-bag measure, so this ranking is the one to compare
# against it.
from sklearn.inspection import permutation_importance
perm = permutation_importance(
    model,
    X_test,
    y_test.astype('double').ravel(),
    n_repeats=10,
    random_state=0,
    n_jobs=-1,
)
perm_indices = np.argsort(perm.importances_mean)[::-1]
print("Permutation importance (mean decrease in R^2 on the test set):")
for rank, idx in enumerate(perm_indices, start=1):
    print("%2d) %-*s %f +/- %f" % (
        rank, 30, X_columns[idx],
        perm.importances_mean[idx], perm.importances_std[idx]))
# SHAP analysis of the fitted forest.
explainer = shap.Explainer(model)
# Compute SHAP values for every row of the feature matrix X.
shap_values = explainer.shap_values(X)
# Both plots label the features with the same display names.
plot_args = dict(feature_names=X_columns)
# Default summary plot, followed by the bar-chart variant.
shap.summary_plot(shap_values, X, **plot_args)
shap.summary_plot(shap_values, X, plot_type="bar", **plot_args)
补充一下代码执行的结果:
实际情况下,特征重要性的顺序应该是 P > T > C。
这是 MATLAB 中 TreeBagger 使用的参数:
% Build and train the random forest.
nTree = 50;
% One name-value pair per line; p and t are the predictor and response data
% defined elsewhere.
KgAv = TreeBagger(nTree, p, t, ...
    'Method', 'regression', ...
    'OOBPrediction', 'on', ...
    'PredictorSelection', 'curvature', ...
    'OOBPredictorImportance', 'On', ...
    'MinLeafSize', 1);
这是 MATLAB 输出的特征重要性: