圈圈456 2022-10-15 14:33 采纳率: 90.9%
浏览 267
已结题

LightGBM 用 joblib 加载 pkl 模型后进行预测,报错提示需要先调用 fit(拟合)?

问题遇到的现象和发生背景

用 joblib 加载已保存的模型并对数据进行预测时,在 raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.") 处抛出如下异常:
sklearn.exceptions.NotFittedError: Estimator not fitted, call fit before exploiting the model.

用代码块功能插入代码,请勿粘贴截图
import numpy as np
import pandas as pd
import os
import collections
import lightgbm as lgb
import graphviz
import joblib
from lightgbm import LGBMClassifier
from tools import globalTool as gt
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, KFold

class model:
    """LightGBM multi-class classifier wrapper: load data, train, save, predict.

    The constructor reads a labelled CSV (label column ``oreClass``) and
    splits it 60/40 into train/test arrays. ``train`` fits an
    ``LGBMClassifier`` with 5-fold cross-validation and pickles it with
    joblib; ``getResult`` / ``classify`` load a pickled model and predict.

    Attributes:
        data: the full DataFrame read from the training CSV.
        x_train, x_test, y_train, y_test: NumPy arrays from the split.
        model_path: path of the model written by the last ``train`` call
            ('' until ``train`` has run).
        row, col: shape of the last per-element CSV read by ``getResult``.
        data_columns: feature column names, in training order.
    """

    def __init__(self, f_path):
        """Read the labelled CSV at ``f_path`` and prepare the train/test split."""
        print(f_path)
        self.data = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.model_path = ''
        self.row = 0
        self.col = 0
        self.data_columns = []  # feature column names
        self.in_data(f_path)  # populate the data-dependent attributes

    def in_data(self, f_path):
        """Load the CSV, separate the ``oreClass`` label and split the data."""
        self.data = pd.read_csv(f_path)
        y_data = self.data['oreClass']
        x_data = self.data.drop('oreClass', axis=1)
        # NOTE: if feature columns are ever dropped here (e.g. with
        # gt.lgb_dropList), getResult must drop the same columns, otherwise
        # the feature layout at predict time no longer matches training.
        self.data_columns = x_data.columns

        self.data_split(x_data, y_data)

    def data_split(self, x_data, y_data):
        """Split into 60% train / 40% test (fixed seed) and store as arrays."""
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=0.4, random_state=2020)

        self.x_train = np.asarray(x_train)
        self.x_test = np.asarray(x_test)
        self.y_train = np.asarray(y_train)
        self.y_test = np.asarray(y_test)

    def train(self):
        """Train with 5-fold CV, report metrics on the test set, save the model.

        Each fold refits the same estimator and predicts the held-out test
        set; the final test prediction is the per-sample majority vote over
        the five folds. Metrics are printed and written to a CSV by
        ``evaluate``; the estimator is pickled with joblib and its path is
        stored in ``self.model_path``.

        NOTE(review): the pickled estimator is the one fitted on the LAST
        fold only, while the accuracy embedded in the file name comes from
        the 5-fold vote, so the two do not describe exactly the same model.
        """
        x_train, y_train = self.x_train, self.y_train
        x_test, y_test = self.x_test, self.y_test
        folds = KFold(n_splits=5, shuffle=True, random_state=2020)  # 5-fold CV

        param = {'boosting_type': 'gbdt',
                 'num_leaves': 20,
                 'objective': 'multiclass',
                 'max_depth': 3,
                 'learning_rate': 0.1,
                 'num_class': 45,  # 45 mineral classes in total
                 }  # model hyper-parameters
        model_lgb = LGBMClassifier(**param)

        fold_preds = []  # one 1-D array of test-set predictions per fold
        for train_index, vali_index in folds.split(x_train, y_train):
            # The fit(verbose=..., early_stopping_rounds=...) keywords are
            # deprecated in LightGBM 3.3 and removed in 4.0; callbacks are
            # the supported equivalent. Fresh callback objects are created
            # for every fold because they keep per-run state.
            callbacks = [lgb.early_stopping(stopping_rounds=15),
                         lgb.log_evaluation(period=1)]
            model_lgb = model_lgb.fit(
                x_train[train_index], y_train[train_index],
                eval_set=[(x_train[vali_index], y_train[vali_index])],
                callbacks=callbacks)

            fold_preds.append(
                model_lgb.predict(x_test, num_iteration=model_lgb.best_iteration_).ravel())

        # Rows = test samples, columns = folds; majority vote per sample.
        votes = np.column_stack(fold_preds)
        result_true = y_test
        result_pred = [np.argmax(np.bincount(row)) for row in votes]

        # Evaluation metrics
        acc_score = accuracy_score(result_true, result_pred)
        kappa = cohen_kappa_score(result_true, result_pred)
        macro_f1 = f1_score(result_true, result_pred, average='macro')
        precision = precision_score(result_true, result_pred, average='macro')
        recall = recall_score(result_true, result_pred, average='macro')

        print('AC score: {:.3f} Kappa:{} macro_f1:{} precision:{} recall:{}'
              .format(acc_score, kappa, macro_f1, precision, recall))

        now = gt.timeTitle()
        self.evaluate(result_true, result_pred, './分析结果/性能评价/' + now + '.csv')

        # Create the output folder up front so a finished training run is
        # not lost to a missing directory.
        model_dir = './model/saveModel_lgb/'
        os.makedirs(model_dir, exist_ok=True)
        self.model_path = model_dir + 'lgb_model' + now + 'ac' + '{:.3f}'.format(acc_score) + '.pkl'
        joblib.dump(model_lgb, self.model_path)  # persist the model

    def loadMod(self, f_path):
        """Load a pickled model from directory ``f_path``.

        Only ``.pkl`` files directly inside ``f_path`` are considered (no
        recursion into sub-directories); the lexicographically greatest file
        name is loaded.

        NOTE(review): with the ``lgb_model<time>ac<acc>.pkl`` naming used by
        ``train`` this ordering is driven by the time part first, so it is
        presumably the newest model rather than the most accurate one.
        NOTE(review): a pickle should be loaded with the same LightGBM
        version that wrote it; a version mismatch is a plausible cause of
        "Estimator not fitted" errors on an otherwise trained model.

        Raises:
            FileNotFoundError: ``f_path`` cannot be listed or holds no
                ``.pkl`` file.
        """
        # next(os.walk(...)) yields only the top level; the default covers a
        # missing/unreadable directory, for which os.walk yields nothing.
        _, _, files = next(os.walk(f_path), (f_path, [], []))
        candidates = [name for name in files if name.endswith('.pkl')]
        if not candidates:
            raise FileNotFoundError('no .pkl model file found in: ' + str(f_path))
        m_path = max(candidates)

        print('lgb_model : ', m_path)
        model_lgb = joblib.load(os.path.join(f_path, m_path))
        return model_lgb

    def classify(self, data, f_path='D:/Personality/paper/矿物识别/model/saveModel_lgb'):
        """Predict a class label for every row of ``data``.

        Args:
            data: 2-D array-like of features, columns in training order.
            f_path: directory searched by ``loadMod`` for the model file.

        Returns:
            1-D array of predicted labels.
        """
        model_lgb = self.loadMod(f_path)

        data = pd.DataFrame(data)
        result = model_lgb.predict(data).ravel()
        return result

    def getResult(self, f_path='D:/Personality/paper/矿物识别/datasource/classData'):
        """Classify the samples stored as one CSV per element in ``f_path``.

        Each file is named ``..._<element>.<ext>``; its values are flattened
        into the feature column ``<element>``. ``self.row`` / ``self.col``
        keep the shape of the last file read.

        Returns:
            1-D array of predicted labels (also printed with its shape).
        """
        # Pre-declaring the columns keeps the features in training order.
        Data = pd.DataFrame(columns=self.data_columns, dtype=float)

        # Files directly inside f_path (empty list if it cannot be listed).
        files = next(os.walk(f_path), (f_path, [], []))[2]

        for file in files:
            elem = file.split('_')[-1].split('.')[0]  # element this file holds
            dtmp = pd.read_csv(f_path + '/' + file, header=None)
            self.row, self.col = dtmp.shape  # remember the grid shape
            Data[elem] = np.asarray(dtmp).ravel()  # flatten into one column

        Data = np.array(Data)  # DataFrame -> ndarray
        result = self.classify(Data)  # predict a label per sample
        print(result, result.shape)
        return result

    # ---- per-class performance ----#
    def evaluate(self, y_true, y_pred, save_path):
        """Write per-class precision/recall/F1 to the CSV ``save_path``.

        The report has one row per label occurring in ``y_true`` or
        ``y_pred``, with columns ``oreClass``, ``name`` (from
        ``gt.no_remap``), ``precision``, ``recall`` and ``f1``. The parent
        directory of ``save_path`` is created if needed.
        """
        ordered = sorted(np.unique(list(y_true) + list(y_pred)))
        print("当前种类数:", len(ordered))
        print('类别序号:', ordered)

        dt_CSV = pd.DataFrame()
        dt_CSV['oreClass'] = ordered
        dt_CSV['name'] = [gt.no_remap(i) for i in ordered]

        p_class, r_class, f_class, _ = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred)
        print(len(p_class), len(r_class), len(f_class))
        dt_CSV['precision'], dt_CSV['recall'], dt_CSV['f1'] = p_class, r_class, f_class

        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        dt_CSV.to_csv(save_path, index=False, encoding='utf_8_sig')




# Load the labelled dataset; the constructor also builds the train/test split.
model_m = model(
    "D:/Personality/paper/矿物识别/datasource/fullData/data20201101_160248.csv"
)

# Predict on the CSVs in getResult's default folder (getResult prints the labels).
re = model_m.getResult()


运行结果及报错内容

img

我的解答思路和尝试过的方法

在网上查找资料后,发现我的预测数据中存在 NaN,于是将 NaN 全部替换为 0,但报错依旧。我还尝试把 lightgbm 从 3.3.2 降级到 3.3.0,问题仍然存在。报错一直提示预测前要先调用 fit,但我加载的是已经训练完成并保存为 pkl 格式的模型,按理说不应该出现这个问题。

  • 写回答

1条回答 默认 最新

  • herosunly Python领域优质创作者 2022-10-17 09:57
    关注

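    得看你模型保存的方式:如果是通过 save_model 函数保存的模型,读取方法就得是 model_lgb = lgb.Booster(model_file=model_file_path);如果是用 joblib.dump 保存的 sklearn 接口模型(如题目中的代码),则应使用 joblib.load 读取,并保证保存和加载时使用的 lightgbm 版本一致。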

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?

问题事件

  • 已结题 (查看结题原因) 10月18日
  • 已采纳回答 10月18日
  • 创建了问题 10月15日

悬赏问题

  • ¥15 PADS Logic 原理图
  • ¥15 PADS Logic 图标
  • ¥15 电脑和power bi环境都是英文如何将日期层次结构转换成英文
  • ¥20 气象站点数据求取中~
  • ¥15 如何获取APP内弹出的网址链接
  • ¥15 wifi 图标不见了 不知道怎么办 上不了网 变成小地球了