我从kaggle上找了一段代码,是python下进行部分依赖图绘制的。但是不知道为什么就老是报错module pdpbox.pdp has no attribute pdp_isolate
代码如下
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns #for plotting
from sklearn.ensemble import RandomForestClassifier #for the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz #plot tree
from sklearn.metrics import roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.model_selection import train_test_split #for data splitting
import eli5 #for purmutation importance
from eli5.sklearn import PermutationImportance
import shap #for SHAP values
np.random.seed(123) #ensure reproducibility
pd.options.mode.chained_assignment = None #hide any pandas warnings
from pdpbox import pdp, info_plots #for partial plots
df= pd.read_csv("./测试用填补后的数据 12.5.csv",encoding = "GB2312")
ind_col = [col for col in df.columns if col != '是否高血压']
dep_col = '是否高血压'
X = df[ind_col]
y = df[dep_col]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1)
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
y_pred_quant = model.predict_proba(X_test)[:, 1]
y_pred_bin = model.predict(X_test)
base_features = df.columns.values.tolist()
base_features.remove('是否高血压')
feat_name = '年龄'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=base_features, feature=feat_name)
pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
然后就会提示module pdpbox.pdp has no attribute pdp_isolate,我在网上找了半天也没有答案。
我把代码精简了一下,也已经重新卸载和安装了pdpbox,现在代码是这样
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier #for the model
from sklearn.model_selection import train_test_split #for data splitting
from pdpbox import pdp, info_plots
np.random.seed(123) #ensure reproducibility
df= pd.read_csv("./测试用填补后的数据 12.5.csv",encoding = "GB2312")
ind_col = [col for col in df.columns if col != '是否高血压']
dep_col = '是否高血压'
X = df[ind_col]
y = df[dep_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1)
model = RandomForestClassifier()
model.fit(X_train, y_train)
base_features = df.columns.values.tolist()
base_features.remove('是否高血压')
feat_name = '年龄'
pdp_dist = pdp.pdp_isolate(model=model, dataset=X_test, model_features=base_features, feature=feat_name)
pdp.pdp_plot(pdp_dist, feat_name)
plt.show()
然后还是提示module 'pdpbox.pdp' has no attribute 'pdp_isolate'