圈圈456 2022-10-15 14:33 采纳率: 90.9%
浏览 240
已结题

用joblib加载LightGBM保存的pkl模型后预测,为什么提示"需要先调用fit拟合"?

问题遇到的现象和发生背景

joblib调用模型,预测数据时出现raise LGBMNotFittedError("Estimator not fitted, call fit before exploiting the model.")
sklearn.exceptions.NotFittedError: Estimator not fitted, call fit before exploiting the model.

用代码块功能插入代码,请勿粘贴截图
import numpy as np
import pandas as pd
import os
import collections
import lightgbm as lgb
import graphviz
import joblib
from lightgbm import LGBMClassifier
from tools import globalTool as gt
from sklearn.metrics import accuracy_score, confusion_matrix, cohen_kappa_score, f1_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, KFold

class model:
    """LightGBM classifier wrapper for the 'oreClass' (mineral class) data set.

    Constructing an instance reads the labelled CSV at ``f_path`` and keeps a
    60/40 train/test split as NumPy arrays. ``train`` fits the classifier and
    pickles it; ``getResult`` classifies per-element CSV rasters with a
    previously pickled model.
    """

    def __init__(self, f_path):
        """Initialise the attributes and load the labelled CSV at ``f_path``."""
        print(f_path)
        self.data = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.model_path = ''
        self.row = 0
        self.col = 0
        self.data_columns = []  # feature column names, filled in by in_data()
        self.in_data(f_path)  # populates data, data_columns and the split

    def in_data(self, f_path):
        """Read the CSV, separate the 'oreClass' label column and split the data."""
        self.data = pd.read_csv(f_path)
        y_data = self.data['oreClass']
        x_data = self.data.drop('oreClass', axis=1)
        # NOTE: if feature pruning (gt.lgb_dropList) is ever enabled here, the
        # same pruning must be applied to the frame built in getResult().
        self.data_columns = x_data.columns

        self.data_split(x_data, y_data)

    def data_split(self, x_data, y_data):
        """Store a reproducible 60/40 train/test split as NumPy arrays."""
        x_train, x_test, y_train, y_test = train_test_split(
            x_data, y_data, test_size=0.4, random_state=2020)

        self.x_train = np.asarray(x_train)
        self.x_test = np.asarray(x_test)
        self.y_train = np.asarray(y_train)
        self.y_test = np.asarray(y_test)

    def train(self):
        """Fit with 5-fold CV, score a majority vote on the test split, save the model.

        For every fold the classifier is refitted on the fold's training part
        with early stopping on its validation part and then predicts the
        held-out test split. The per-fold predictions are combined by majority
        vote, the vote is scored, the per-class report is written through
        ``evaluate`` and the estimator is pickled with joblib.

        NOTE: the pickled estimator is the one fitted on the LAST fold only,
        while the accuracy embedded in its file name is that of the 5-fold vote.
        """
        x_train, y_train = self.x_train, self.y_train
        x_test, y_test = self.x_test, self.y_test
        kfolder = KFold(n_splits=5, shuffle=True, random_state=2020)

        param = {'boosting_type': 'gbdt',
                 'num_leaves': 20,
                 'objective': 'multiclass',
                 'max_depth': 3,
                 'learning_rate': 0.1,
                 'num_class': 45,  # 45 mineral classes in total
                 }
        model_lgb = LGBMClassifier(**param)

        fold_preds = []  # one 1-D array of test-split predictions per fold
        for train_index, vali_index in kfolder.split(x_train, y_train):
            # The callbacks replace fit()'s `early_stopping_rounds=15` and
            # `verbose=1` keyword arguments, which LightGBM 3.3 deprecates and
            # 4.0 removes. They are created per fold so no state is shared.
            callbacks = [lgb.early_stopping(stopping_rounds=15),
                         lgb.log_evaluation(period=1)]
            model_lgb = model_lgb.fit(
                x_train[train_index], y_train[train_index],
                eval_set=[(x_train[vali_index], y_train[vali_index])],
                callbacks=callbacks)

            fold_pred = model_lgb.predict(x_test, num_iteration=model_lgb.best_iteration_)
            fold_preds.append(np.asarray(fold_pred).ravel())

        # Rows = test samples, columns = folds; np.bincount needs integer labels.
        votes = np.stack(fold_preds, axis=1).astype(np.int64)
        result_true = y_test
        result_pred = [np.argmax(np.bincount(row)) for row in votes]  # majority vote

        # Overall metrics of the voted prediction.
        acc_score = accuracy_score(result_true, result_pred)
        kappa = cohen_kappa_score(result_true, result_pred)
        macro_f1 = f1_score(result_true, result_pred, average='macro')
        precision = precision_score(result_true, result_pred, average='macro')
        recall = recall_score(result_true, result_pred, average='macro')

        print('AC score: {:.3f} Kappa:{} macro_f1:{} precision:{} recall:{}'
              .format(acc_score, kappa, macro_f1, precision, recall))

        now = gt.timeTitle()  # presumably a timestamp string -- defined in tools.globalTool
        eval_dir = './分析结果/性能评价/'
        model_dir = './model/saveModel_lgb/'
        # Make sure the output folders exist so the writes below cannot fail
        # merely because this is the first run.
        os.makedirs(eval_dir, exist_ok=True)
        os.makedirs(model_dir, exist_ok=True)

        self.evaluate(result_true, result_pred, eval_dir + now + '.csv')
        joblib.dump(model_lgb,
                    model_dir + 'lgb_model' + now + 'ac' + '{:.3f}'.format(acc_score) + '.pkl')

    @staticmethod
    def _accuracy_from_name(name):
        """Return the accuracy encoded as '...ac<score>.pkl' in ``name``, or -1.0."""
        stem = os.path.splitext(name)[0]
        _, sep, score = stem.rpartition('ac')
        if not sep:
            return -1.0
        try:
            return float(score)
        except ValueError:
            return -1.0

    def loadMod(self, f_path):
        """Load the pickled model with the highest recorded accuracy from ``f_path``.

        Only ``.pkl`` files directly inside ``f_path`` are considered. They are
        ranked by the accuracy that ``train`` embeds in the file name; ties and
        names without a parsable accuracy fall back to the lexicographically
        largest name.

        Raises:
            FileNotFoundError: if ``f_path`` is missing or holds no ``.pkl`` file.
        """
        # Top-level files only: walking sub-directories would pick names that
        # do not live directly under f_path.
        top_level_files = next(os.walk(f_path), (f_path, [], []))[2]
        candidates = [name for name in top_level_files if name.endswith('.pkl')]
        if not candidates:
            raise FileNotFoundError('no .pkl model file found in ' + str(f_path))

        m_path = max(candidates, key=lambda name: (self._accuracy_from_name(name), name))

        print('lgb_model : ', m_path)
        # SECURITY: joblib.load unpickles arbitrary objects -- only point this
        # at model files you created yourself.
        model_lgb = joblib.load(os.path.join(f_path, m_path))
        return model_lgb

    def classify(self, data):
        """Predict a class label for every row of ``data`` with the best saved model.

        Returns the flattened 1-D array produced by the model's ``predict``.
        The loaded object must be a *fitted* estimator, otherwise LightGBM
        raises ``LGBMNotFittedError`` on ``predict``.
        """
        # NOTE: train() saves to the relative './model/saveModel_lgb/' folder;
        # this absolute path only matches when the script runs from the
        # project directory.
        f_path = 'D:/Personality/paper/矿物识别/model/saveModel_lgb'
        model_lgb = self.loadMod(f_path)

        data = pd.DataFrame(data)
        result = model_lgb.predict(data).ravel()
        return result

    def getResult(self, f_path='D:/Personality/paper/矿物识别/datasource/classData'):
        """Classify the per-element CSV rasters found directly in ``f_path``.

        Every file is read without a header, flattened to one column and stored
        under the element name taken from its file name (text after the last
        '_' and before the first '.'). ``self.row`` / ``self.col`` keep the
        shape of the last raster read so the flat result can be reshaped.

        Returns:
            The 1-D array of predicted classes, one entry per raster cell.
        """
        # Pre-declare the training columns so the feature order matches training.
        Data = pd.DataFrame(columns=self.data_columns, dtype=float)

        # Files of the top-level folder only (first item yielded by os.walk).
        files = next(os.walk(f_path), (f_path, [], []))[2]

        for file in files:
            elem = file.split('_')[-1].split('.')[0]  # element this file represents
            dtmp = pd.read_csv(os.path.join(f_path, file), header=None)
            self.row, self.col = dtmp.shape
            Data[elem] = np.asarray(dtmp).ravel()  # raster flattened to one column

        result = self.classify(np.array(Data))
        print(result, result.shape)
        return result  # previously dropped, so callers always received None

    # ---- per-class performance report ----#
    def evaluate(self, y_true, y_pred, save_path):
        """Write per-class precision / recall / F1 to ``save_path`` as CSV.

        The rows cover every label that occurs in ``y_true`` or ``y_pred``, in
        ascending order, which is the order in which
        ``precision_recall_fscore_support`` reports per-class scores when no
        ``labels`` argument is given.
        """
        ordered_labels = sorted(np.unique(list(y_true) + list(y_pred)))
        print("当前种类数:", len(ordered_labels))
        print('类别序号:', ordered_labels)

        dt_CSV = pd.DataFrame()
        dt_CSV['oreClass'] = ordered_labels
        # gt.no_remap presumably maps a class id to its mineral name -- defined
        # in tools.globalTool, confirm there.
        dt_CSV['name'] = [gt.no_remap(label) for label in ordered_labels]

        p_class, r_class, f_class, support_micro = precision_recall_fscore_support(
            y_true=y_true, y_pred=y_pred)
        print(len(p_class), len(r_class), len(f_class))
        dt_CSV['precision'], dt_CSV['recall'], dt_CSV['f1'] = p_class, r_class, f_class

        dt_CSV.to_csv(save_path, index=False, encoding='utf_8_sig')




# Script entry: build the wrapper from the labelled training CSV (the
# constructor reads the file and prepares the train/test split).
model_m = model(
    f_path="D:/Personality/paper/矿物识别/datasource/fullData/data20201101_160248.csv",
)

# Classify the rasters found in getResult's default classData folder.
re = model_m.getResult()


运行结果及报错内容

img

我的解答思路和尝试过的方法

在网上查找时发现我的预测数据中存在NaN,于是将NaN替换为0,但报错依旧。还尝试把lightgbm从3.3.2降级到3.3.0,问题仍然存在。报错一直提示预测前要先调用fit进行拟合,但我加载的是已经训练完成并保存为pkl格式的模型,按理说不应该出现这个问题。

  • 写回答

1条回答 默认 最新

  • herosunly Python领域优质创作者 2022-10-17 09:57
    关注

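    得看你模型保存的方式:如果是通过save_model函数保存的模型,读取方法就得是model_lgb = lgb.Booster(model_file=model_file_path)(路径必须通过model_file关键字传入,因为Booster的第一个位置参数是params);如果是用joblib.dump保存的LGBMClassifier对象,则继续用joblib.load读取,并确认保存的是已经fit过的模型。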

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?

问题事件

  • 已结题 (查看结题原因) 10月18日
  • 已采纳回答 10月18日
  • 创建了问题 10月15日

悬赏问题

  • ¥15 nginx中的CORS策略应该如何配置
  • ¥30 信号与系统实验:采样定理分析
  • ¥100 我想找人帮我写Python 的股票分析代码,有意请加mathtao
  • ¥20 Vite 打包的 Vue3 组件库,图标无法显示
  • ¥15 php 同步电商平台多个店铺增量订单和订单状态
  • ¥15 关于logstash转发日志时发生的部分内容丢失问题
  • ¥17 pro*C预编译“闪回查询”报错SCN不能识别
  • ¥15 微信会员卡接入微信支付商户号收款
  • ¥15 如何获取烟草零售终端数据
  • ¥15 数学建模招标中位数问题