import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVR
from sklearn.preprocessing import scale
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
import xgboost as xgb
from xgboost import plot_importance
from sklearn.datasets import load_boston
# Widen the pandas / NumPy display limits so that printed frames and arrays
# are shown in full instead of being abbreviated with "...".
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
np.set_printoptions(threshold=10000)
# Load the three input tables from the current working directory.
train_csv, test_csv, sample_submission_csv = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# print(train_csv.head(5))
# Preliminary inspection: shapes and per-column missing-value counts of all
# three tables, then the column dtypes and the value frequencies of 'loss'
# in the training table.
_frames = (train_csv, test_csv, sample_submission_csv)
print(*(piece for frame in _frames for piece in (frame.shape, '\n')))
print(*(piece for frame in _frames for piece in (frame.isnull().sum(), '\n')))
print(train_csv.dtypes)
print(train_csv['loss'].value_counts())
# print(train_csv.describe())
# Disabled exploratory plots (correlation heatmap and per-column histograms):
# plt.figure(figsize=(12, 10))
# ax = sns.heatmap(train_csv.corr())
# fig = plt.figure(figsize=(30, 25))
# ax = fig.gca()
# train_csv.hist(ax=ax)
# plt.show()
# Initial feature selection: drop the 'id' column from both tables.
train_csv_feature = train_csv.drop('id', axis=1)
test_csv_feature = test_csv.drop('id', axis=1)
# print(train_csv_feature.head(5))
# Split the labelled data into a training part and a held-out part (80/20).
# NOTE: despite its name, test_data holds the target column ('loss') as a
# one-column DataFrame, not test rows; train_data holds the feature columns.
train_data = train_csv_feature.drop('loss', axis=1)
test_data = train_csv_feature.iloc[:, train_csv_feature.columns == 'loss']
X_train, X_test, y_train, y_test = train_test_split(train_data, test_data, test_size=0.2)
# Convert the splits to plain NumPy arrays before handing them to XGBoost.
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
print(X_train.shape, '\n', X_test.shape, '\n', y_train.shape, '\n', y_test.shape)
print(y_train)
# `silent` is a deprecated XGBoost argument (ignored or rejected by current
# releases); `verbosity=0` is its replacement for suppressing log output.
# NOTE(review): confirm that the 'loss' values satisfy the label constraints
# of the 'reg:gamma' objective.
model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=160, verbosity=0, objective='reg:gamma')
model.fit(X_train, y_train)
# Predict on the held-out split.
ans = model.predict(X_test)
# Plot the feature importances of the fitted model.
plot_importance(model)
plt.show()
# Data: