m0_73677170 2023-08-31 14:39 采纳率: 0%
浏览 3

划分的训练集和训练时的训练集不一样

# 分离特征和目标变量
y = '23h结果'
X = [x for x in data.columns if x not in [y, '23h结果']]
#划分数据
X_train, X_test, y_train, y_test = train_test_split(data[X], data[y], test_size=0.2,
                                                    random_state=1)
# 定义评价指标
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    return mae, mse, rmse, r2


# 定义参数范围
params = {
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'num_leaves': list(range(10, 81, 10)),
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': list(range(50, 401, 50)),
    'subsample': [0.5, 0.7, 0.9],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]}

# 初始化所有特征的按重要性降序排序列表
feature_importance = X_train.columns.values.tolist()
# 计算所有特征的初始评价指标
params = {
    'objective': 'regression',
    'metric': ['mae', 'mse', 'rmse', 'r2']
}

lgb_train = lgb.Dataset(data[X], data[y])
eval_results = {}

gbm = lgb.train(params,
                lgb_train,
                num_boost_round=50,
                evals_result=eval_results,
                verbose_eval=True,
                early_stopping_rounds=5,
                valid_sets=[lgb_train])

# 初始化特征评价指标列表
mae_list = [mean_absolute_error(y_test, gbm.predict(X_test))]
mse_list = [mean_squared_error(y_test, gbm.predict(X_test))]
rmse_list = [np.sqrt(mse_list[0])]
r2_list = [r2_score(y_test, gbm.predict(X_test))]
feature_importance = pd.DataFrame({'feature': X, 'importance': gbm.feature_importance()})
n_selected_features = len(X)

# 循环递归特征消除
while n_selected_features > 1:
    # 训练模型并在测试集上进行评估
    lgb_train = lgb.Dataset(data[X], data[y])
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=50,
                    evals_result=eval_results,
                    verbose_eval=False,
                    early_stopping_rounds=5,
                    valid_sets=[lgb_train])

    # 更新测试集
    X_test = data[X].loc[y_test.index]
    y_test = data[y].loc[y_test.index]

    # 记录评价指标
    mae_list.append(mean_absolute_error(y_test, gbm.predict(X_test)))
    mse_list.append(mean_squared_error(y_test, gbm.predict(X_test)))
    rmse_list.append(np.sqrt(mse_list[-1]))
    r2_list.append(r2_score(y_test, gbm.predict(X_test)))

    # 输出每次迭代的评价指标
    print(f'n_features = {n_selected_features}, '
          f'MAE = {mae_list[-1]:.4f}, '
          f'MSE = {mse_list[-1]:.4f}, '
          f'RMSE = {rmse_list[-1]:.4f}, '
          f'r2_score = {r2_list[-1]:.4f}')

    # 取出最不重要的特征,并更新X和n_selected_features
    least_important_feature = feature_importance.loc[feature_importance['importance'].idxmin(), 'feature']
    X.remove(least_important_feature)
    n_selected_features -= 1
    feature_importance = feature_importance[feature_importance['feature'] != least_important_feature]
    least_important_features = []

我在进行递归特征消除时,运行如下代码后,在运行中出现了如下信息:[LightGBM] [Info] Number of data points in the train set: 450, number of used features: 18,是代表我用450例数据进行了训练集吗?可是我将450例数据分成了训练集和测试集,理论上不应该时390例数据作为训练集吗?是上述代码有什么问题吗?

  • 写回答

2条回答 默认 最新

  • m0_73677170 2023-08-31 14:49
    关注
    评论

报告相同问题?

问题事件

  • 创建了问题 8月31日