问题遇到的现象和发生背景
使用joblib加载已保存的模型并对数据进行预测时,程序在 raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") 处报错:
sklearn.exceptions.NotFittedError: Estimator not fitted, call fit before exploiting the model.
用代码块功能插入代码,请勿粘贴截图
import numpy as np
import pandas as pd
import os
import collections
import lightgbm as lgb
import graphviz
import joblib
from lightgbm import LGBMClassifier
from tools import globalTool as gt
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, KFold
class model:
    """LightGBM multi-class mineral classifier: training, persistence and prediction.

    The constructor reads a labelled CSV (label column ``oreClass``), splits it
    into train/test arrays and remembers the feature column names so that
    prediction inputs can be laid out in the same order.
    """

    def __init__(self, f_path):
        """Load the labelled CSV at ``f_path`` and prepare the train/test split."""
        print(f_path)
        self.data = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.model_path = ''  # path of the most recently saved model (set by train)
        self.row = 0
        self.col = 0
        self.data_columns = []  # feature column names
        self.in_data(f_path)  # initialise the attributes above from the CSV

    def in_data(self, f_path):
        """Read the CSV, separate the ``oreClass`` label and split the data."""
        self.data = pd.read_csv(f_path)
        y_data = self.data['oreClass']
        x_data = self.data.drop('oreClass', axis=1)
        self.data_columns = x_data.columns
        self.data_split(x_data, y_data)

    def data_split(self, x_data, y_data):
        """Split into 60% train / 40% test with a fixed seed and store as arrays."""
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=0.4, random_state=2020)
        self.x_train = np.asarray(x_train)
        self.x_test = np.asarray(x_test)
        self.y_train = np.asarray(y_train)
        self.y_test = np.asarray(y_test)

    def train(self):
        """Train with 5-fold cross-validation, evaluate by majority vote and save.

        Each fold refits the same estimator from scratch, predicts the held-out
        test set, and the per-sample majority vote over the five folds is scored.
        The estimator written to disk is the one fitted on the last fold.
        """
        x_train, y_train = self.x_train, self.y_train
        x_test, y_test = self.x_test, self.y_test
        kfolder = KFold(n_splits=5, shuffle=True, random_state=2020)  # 5-fold CV
        param = {'boosting_type': 'gbdt',
                 'num_leaves': 20,
                 'objective': 'multiclass',
                 'max_depth': 3,
                 'learning_rate': 0.1,
                 'num_class': 45,  # 45 mineral classes in total
                 }
        model_lgb = LGBMClassifier(**param)

        fold_preds = []
        for train_index, vali_index in kfolder.split(x_train, y_train):
            # Early stopping / logging go through callbacks: the ``verbose`` and
            # ``early_stopping_rounds`` fit arguments are deprecated in
            # LightGBM 3.3 and no longer accepted by 4.x.
            model_lgb = model_lgb.fit(
                x_train[train_index], y_train[train_index],
                eval_set=[(x_train[vali_index], y_train[vali_index])],
                callbacks=[lgb.early_stopping(stopping_rounds=15),
                           lgb.log_evaluation(period=1)],
            )
            fold_preds.append(
                model_lgb.predict(x_test, num_iteration=model_lgb.best_iteration_).ravel())

        # Rows = test samples, columns = the prediction of each fold's model.
        pred_cb = np.asarray(fold_preds, dtype=np.int64).T
        result_true = y_test
        # Majority vote across folds (np.bincount needs non-negative int labels).
        result_pred = [np.argmax(np.bincount(votes)) for votes in pred_cb]

        # Evaluation metrics
        acc_score = accuracy_score(result_true, result_pred)
        kappa = cohen_kappa_score(result_true, result_pred)
        macro_f1 = f1_score(result_true, result_pred, average='macro')
        precision = precision_score(result_true, result_pred, average='macro')
        recall = recall_score(result_true, result_pred, average='macro')
        print('AC score: {:.3f} Kappa:{} macro_f1:{} precision:{} recall:{}'
              .format(acc_score, kappa, macro_f1, precision, recall))

        now = gt.timeTitle()
        self.evaluate(result_true, result_pred, './分析结果/性能评价/' + now + '.csv')

        model_dir = './model/saveModel_lgb'
        os.makedirs(model_dir, exist_ok=True)  # joblib.dump does not create directories
        self.model_path = (model_dir + '/lgb_model' + now + 'ac'
                           + '{:.3f}'.format(acc_score) + '.pkl')
        joblib.dump(model_lgb, self.model_path)  # persist the fitted estimator

    @staticmethod
    def _accuracy_from_name(file_name):
        """Return the accuracy encoded as ``...ac<score>.pkl``, or -1.0 if absent."""
        stem = file_name[:-len('.pkl')]
        _, marker, acc_text = stem.rpartition('ac')
        if not marker:
            return -1.0
        try:
            return float(acc_text)
        except ValueError:
            return -1.0

    @classmethod
    def _select_model_file(cls, f_path):
        """Return the name of the ``.pkl`` file in ``f_path`` with the best accuracy.

        Only the directory itself is inspected (no recursion). Ties, and files
        whose name carries no accuracy, fall back to the largest file name.

        Raises:
            FileNotFoundError: if the directory holds no ``.pkl`` file.
        """
        candidates = [name for name in os.listdir(f_path)
                      if name.endswith('.pkl')
                      and os.path.isfile(os.path.join(f_path, name))]
        if not candidates:
            raise FileNotFoundError('no .pkl model file found in ' + str(f_path))
        return max(candidates, key=lambda name: (cls._accuracy_from_name(name), name))

    @staticmethod
    def _ensure_fitted_flag(estimator):
        """Flag an unpickled estimator as fitted when it holds a trained booster.

        NOTE(review): LightGBM >= 3.3.0 appears to decide "is fitted" from the
        ``fitted_`` attribute, which pickles written by older releases do not
        contain; predicting with such a pickle then raises LGBMNotFittedError
        even though the booster is trained. Confirm against the installed
        version. Retraining and re-saving with the current LightGBM is the
        cleaner long-term fix.
        """
        if getattr(estimator, '_Booster', None) is None:
            return  # genuinely unfitted (or not a LightGBM wrapper): leave untouched
        if not getattr(estimator, 'fitted_', False):
            estimator.fitted_ = True

    def loadMod(self, f_path):
        """Load the highest-accuracy saved model from directory ``f_path``.

        Returns:
            The estimator restored by ``joblib.load``.

        Raises:
            FileNotFoundError: if ``f_path`` does not exist or has no ``.pkl`` file.
        """
        m_path = self._select_model_file(f_path)
        print('lgb_model : ', m_path)
        # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
        model_lgb = joblib.load(os.path.join(f_path, m_path))
        self._ensure_fitted_flag(model_lgb)
        return model_lgb

    def classify(self, data, f_path='D:/Personality/paper/矿物识别/model/saveModel_lgb'):
        """Predict one class label per row of ``data`` with the best saved model.

        Args:
            data: 2-D array-like of feature rows, columns ordered like the
                training features.
            f_path: directory holding the saved ``.pkl`` models.

        Returns:
            1-D array of predicted labels.
        """
        model_lgb = self.loadMod(f_path)
        data = pd.DataFrame(data)
        return model_lgb.predict(data).ravel()

    def getResult(self, f_path='D:/Personality/paper/矿物识别/datasource/classData'):
        """Assemble per-element CSV grids into a feature table and classify it.

        Every file directly inside ``f_path`` is read as a header-less CSV grid;
        the element name is taken from the text after the last ``_`` in the file
        name. Each grid is flattened into one feature column.

        Returns:
            1-D array with one predicted label per grid cell.

        Raises:
            FileNotFoundError: if ``f_path`` contains no files.
        """
        # One column per training feature, in training order.
        Data = pd.DataFrame(columns=self.data_columns, dtype=float)
        files = next(os.walk(f_path), (None, None, []))[2]  # top-level files only
        if not files:
            raise FileNotFoundError('no element files found in ' + str(f_path))

        for file in files:
            elem = file.split('_')[-1].split('.')[0]  # element this file represents
            if elem not in self.data_columns:
                # An extra column would change the feature count expected by the model.
                print('skip file (not a training feature): ', file)
                continue
            dtmp = pd.read_csv(os.path.join(f_path, file), header=None)
            self.row, self.col = dtmp.shape  # remember the grid shape
            Data[elem] = np.asarray(dtmp).ravel()  # flatten the grid into one column

        Data = np.array(Data)
        result = self.classify(Data)
        print(result, result.shape)
        return result

    # ---- per-class performance ---- #
    def evaluate(self, y_true, y_pred, save_path):
        """Write per-class precision, recall and F1 to ``save_path`` as CSV.

        Args:
            y_true: true labels.
            y_pred: predicted labels, same length as ``y_true``.
            save_path: output CSV path; its directory is created if missing.
        """
        labels = np.unique(list(y_true) + list(y_pred))  # np.unique returns sorted labels
        print("当前种类数:", len(labels))
        print('类别序号:', list(labels))

        # Passing ``labels`` keeps the metric rows aligned with the label column.
        p_class, r_class, f_class, _ = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred, labels=labels)
        print(len(p_class), len(r_class), len(f_class))

        dt_CSV = pd.DataFrame()
        dt_CSV['oreClass'] = list(labels)
        dt_CSV['name'] = [gt.no_remap(label) for label in labels]
        dt_CSV['precision'], dt_CSV['recall'], dt_CSV['f1'] = p_class, r_class, f_class

        os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
        dt_CSV.to_csv(save_path, index=False, encoding='utf_8_sig')
# Build the classifier wrapper from the labelled training CSV (this also fixes
# the feature column order), then classify the element grids found in the
# default classData directory.
model_m = model(
    f_path="D:/Personality/paper/矿物识别/datasource/fullData/data20201101_160248.csv"
)
re = model_m.getResult()
运行结果及报错内容
我的解答思路和尝试过的方法
在网上查找资料时,发现我的预测数据中存在NaN,于是将NaN替换为0,但问题依旧。还尝试将lightgbm从3.3.2降级到3.3.0,问题仍然存在。报错一直提示预测前需要先调用fit进行拟合,但我加载的是已经训练完成并保存为pkl格式的模型,按理说不应该出现这个问题。