我使用机器学习模型(LightGBM)对数据按年份(共 6 年)进行 6 折交叉验证,但各年份的拟合精度(R²)并不是逐年上升的;另外,我没有把预测值输出保存下来,因此无法通过散点图判断问题出在哪里。希望大家能指出我的代码哪里出现了问题。
from sklearn.model_selection import KFold
#st
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib as plt
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from IPython.display import clear_output as clear
print('Loading data...')
# Input data: the training CSV, the held-out 2015 CSV, and the text file
# that collects the evaluation results.
path1 = r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\删除异常值.csv"
test_file = r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\删除异常值_2015.csv"
txt_path = r"D:\data\daqiyaogan\LightGBM数据文件\单独测试\2015.txt"

# Header row of the results file.
content_ls = [
    'learning_rate', 'max_depth', 'num_leaves',
    'RMSE', 'MAE', 'R2',
    'RMSE_2015', 'MAE_2015', 'R2_2015',
]

# Start from a fresh results file that contains only the header line.
if os.path.exists(txt_path):
    os.remove(txt_path)
with open(txt_path, 'w') as f:
    print(*content_ls, sep=',', file=f)

# Columns that are not used as model inputs (the target is among them).
non_feature_cols = ['time', 'pm2_5', 'site', 'pm10_mean']

# Training table: target vector Y and feature matrix X.
df = pd.read_csv(path1)
Y = df['pm10_mean'].values
X = df.drop(columns=non_feature_cols).values

# Random 90/10 split of the training table with a fixed seed.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.1, random_state=33
)

# Separate 2015 table, prepared with the same column selection.
test_df = pd.read_csv(test_file)
test_x = test_df.drop(columns=non_feature_cols).values
test_y = test_df['pm10_mean'].values
print('Starting training...')
# Model training: hyper-parameters for the LightGBM regressor.
max_depth = 10
num_leaves = 490
learning_rate = 0.1
n_estimators = 850
min_child_samples = 1
min_child_weight = 0.001
feature_fraction = 1
lambda_l1 = 0
# NOTE(review): per the LightGBM parameter docs, drop_rate and max_drop are
# only used with boosting_type='dart'; the regressor below keeps the default
# boosting type, so they presumably have no effect here.
drop_rate = 0.1
lambda_l2 = 0
subsample = 1
max_drop = 100
print('正在验证learning_rate为{},max_depth为{},num_leaves为{}的STLightBDT模型'.format(round(learning_rate,3), max_depth, num_leaves))
gbm = lgb.LGBMRegressor(
    num_leaves=num_leaves,
    learning_rate=learning_rate,
    max_depth=max_depth,
    min_child_samples=min_child_samples,
    min_child_weight=min_child_weight,
    feature_fraction=feature_fraction,
    extra_tree=False,
    first_metric_only=True,
    drop_rate=drop_rate,
    lambda_l1=lambda_l1,
    lambda_l2=lambda_l2,
    subsample=subsample,
    max_drop=max_drop,
    n_estimators=n_estimators,
)
# Early stopping goes through the callback API: the `early_stopping_rounds`
# keyword of fit() was removed in LightGBM 4.0 and raises TypeError there.
# first_metric_only=True keeps the stopping decision on the 'l1' metric only.
# NOTE(review): the split used for early stopping (x_test, y_test) is the
# same split that is scored afterwards, so that score is optimistic.
gbm.fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric='l1',
    callbacks=[lgb.early_stopping(stopping_rounds=20, first_metric_only=True)],
)
# Clear the notebook output produced during training.
clear()
print('Starting predicting...')
def calc(real, pred):
    """Return (RMSE, MAE, R2) of *pred* against *real*, each rounded to 3 decimals."""
    scores = (
        sqrt(mean_squared_error(real, pred)),
        mean_absolute_error(real, pred),
        r2_score(real, pred),
    )
    return tuple(round(score, 3) for score in scores)
# Model prediction and evaluation.
# Score the 10% hold-out split under its own names so these metrics are not
# overwritten by the 2015 metrics computed below.
val_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration_)
val_rmse, val_mae, val_r2 = calc(y_test, val_pred)
print('样本集中RMSE={}, MAE={}, R2={}'.format(val_rmse, val_mae, val_r2))

# Score the separate 2015 test file.
y_pred = gbm.predict(test_x, num_iteration=gbm.best_iteration_)
rmse, mae, r2 = calc(test_y, y_pred)
print('2015测试集中RMSE={}, MAE={}, R2={}'.format(rmse, mae, r2,))

# Append one result row to the results file, in the column order of its
# header (content_ls); previously only the header was ever written.
result_row = [
    round(learning_rate, 3), max_depth, num_leaves,
    val_rmse, val_mae, val_r2,
    rmse, mae, r2,
]
with open(txt_path, 'a') as f:
    f.write(','.join(str(value) for value in result_row) + '\n')

# Save observed/predicted pairs next to the results file so they can be
# inspected later, e.g. in a scatter plot.
pred_path = os.path.splitext(txt_path)[0] + '_pred.csv'
pd.DataFrame({'observed': test_y, 'predicted': y_pred}).to_csv(pred_path, index=False)
我每次修改数据文件,对每个年份各运行一次上面的代码,但得到的结果(R²)并不是逐年上升的,这与我的预期不符。运行结果如下:
2015测试集中RMSE=21.148, MAE=14.754, R2=0.902
2017测试集中RMSE=18.222, MAE=12.74, R2=0.916
2018测试集中RMSE=18.281, MAE=12.317, R2=0.908
2019测试集中RMSE=16.447, MAE=11.404, R2=0.903
2020测试集中RMSE=15.908, MAE=11.174, R2=0.889