问题遇到的现象和发生背景
本人正在练习项目——对银行还款进行测试,其中的预测结果已经出来,但是想把结果进行可视化的时候遇到了问题,经过一系列排查,应该是 plt.contourf里面的那个ravel()部分出了问题
(请直接跳转到代码最底下部分的:Visulising the Traning Set Result)
问题相关代码
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
# 经典而分类问题
app_train = pd.read_csv('/Users/iven/Desktop/Python机器学习实战/第十一章:银行客户还款可能性预测/application_train.csv')
# 展示缺失值
def missing_value_table(df):
mis_val = df.isnull().sum() # 计算所有缺失值
mis_val_percent = 100 * df.isnull().sum() / len(df) # %比
mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
# 做成表格显示出来
mis_val_rename = mis_val_table.rename(columns={0:'Missing valyes',1:'% of total values'})
# 剔除完整的并排序(sort_value) 升序
mis_val_rename = mis_val_rename[mis_val_rename.iloc[:, 1] != 0].sort_values('% of total values', ascending = False)
return mis_val_rename
missing_value_table(app_train)[:10]
'''
axis=0代表往跨行(down),而axis=1代表跨列(across)
使用0值表示沿着每一列或行标签/索引值向下执行方法
使用1值表示沿着每一行或者列标签横向执行对应的方法
'''
# Object 类型数据处理.
# 特征个数大于2时候,一般用One-Hot去处理,而小于2的时候 label-eco
app_train.dtypes.value_counts()
app_train.select_dtypes('object').apply(pd.Series.nunique,axis=0) # Pandas nunique() 用于获取唯一值的统计次数。
le = LabelEncoder()
for col in app_train:
if app_train[col].dtype == 'object':
if len(list(app_train[col].unique()))<=2:
le.fit(app_train[col])
app_train[col]=le.transform(app_train[col])
app_train = pd.get_dummies(app_train)
app_train.shape
# EDA分析 特征分析
train_labels = app_train['TARGET']
app_train['DAYS_BIRTH'][:5] # 贷款的人从出生到现在"活"了多少天
# 因此,我们需要转换成年
(app_train['DAYS_BIRTH']/-365).describe()
(app_train['DAYS_EMPLOYED']).describe()
app_train['DAYS_EMPLOYED'].plot.hist()
plt.show()
app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243
app_train['DAYS_EMPLOYED'].replace({365243:np.nan},inplace=True)
app_train['DAYS_EMPLOYED'].plot.hist()
plt.show()
correlations = app_train.corr()['TARGET'].sort_values()
correlations.head()
correlations.tail()
# 但是对于年龄,它是负数
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_train['TARGET'].corr(app_train['DAYS_BIRTH'])
# 变负了
plt.figure(figsize = (12,6))
plt.style.use('fivethirtyeight') # 图表风格 去SeaBorn看就ok
plt.hist(app_train['DAYS_BIRTH']/365,edgecolor='k',bins=25)
plt.show()
plt.figure(figsize=(16,8))
#KDEPLOT
sns.kdeplot(app_train.loc[app_train['TARGET']==0,'DAYS_BIRTH']/365,label='target==0')
sns.kdeplot(app_train.loc[app_train['TARGET']==1,'DAYS_BIRTH']/365,label='target==1')
plt.show()
# 能用KDEPLot显示就先用,因为这是连续的,更加直观
# 不还钱的人都是30岁左右的人
age_data = app_train[['TARGET','DAYS_BIRTH']]
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH']/365
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'],bins=np.linspace(20,70,num=11)) # 设置年龄区间
age_data.head()
age_groups = age_data.groupby('YEARS_BINNED').mean()
plt.figure(figsize=(16,16))
plt.bar(age_groups.index.astype(str),100*age_groups['TARGET'])
plt.xticks(rotation=30) # 坐标轴,多少度
plt.show()
ext_data = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
plt.figure(figsize=(20,8))
sns.heatmap(ext_data_corrs,cmap = plt.cm.RdYlBu_r, linewidths = .5, annot=True)
plt.show()
# http://seaborn.pydata.org/generated/seaborn.heatmap.html
plt.figure(figsize=(16,10))
for i,source in enumerate(['EXT_SOURCE_3','EXT_SOURCE_2','EXT_SOURCE_1']):
# 指定好子图的位置
plt.subplot(3,1,i+1) # 3行1列,位置i=0 i+1
# kdeplot
sns.kdeplot(app_train.loc[app_train['TARGET']==0,source]/365,label='target==0')
sns.kdeplot(app_train.loc[app_train['TARGET']==1,source]/365,label='target==1')
plt.title('D of %s' % source)
plt.tight_layout(h_pad=2.5) # 布局 间隙
plt.show()
# 特征工程(多项式回归) X次方越大,越准确
poly_features = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer
# 特征工程之前 - 缺失值填充
imputer = SimpleImputer(strategy='median') # 类似于拟合器,遇到缺失值就用中位数来填补
poly_target = poly_features['TARGET']
poly_features.drop(columns=['TARGET'],inplace =True) # 除了Target列的其他
poly_features = imputer.fit_transform(poly_features) # 拟合
poly_transformer = PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features)
poly_features = poly_transformer.transform(poly_features)
# poly_features.shape 从4个特征变成了35个
poly_transformer.get_feature_names(input_features=['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])[:20]
# 将当前得到的部分特征跟总体组合在一起
poly_features = pd.DataFrame(
poly_features,
columns = poly_transformer.get_feature_names(input_features=['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])
)
# 与之前100个指标组合在一起
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR'] # ID是不会改变的 根据ID去传进去
app_train_poly = app_train.merge(poly_features, on='SK_ID_CURR', how='left')
# 根据实际情况来创建特征
# 例如对时间特征,可以分出来什么特征呢? 数据挖掘——90%时间都在和小特征打交道,不要忽视任何一个小的特征
# 建模其实没啥花时间
app_train_domain = app_train.copy() # 不要乱改,防止改乱 类似于创建副本
app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL'] # 信用额度与工资比值
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL'] # 没(每年)还款年金和工资的比值
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT'] # 还款总月份
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH'] # 上班时间和年龄的比值
# 这是加到了最初的表,和上面的特征工程没啥关系,这是DIY的,所以新的列数为248而不是279
plt.figure(figsize=(16, 20))
for i, feature in enumerate(
['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']):
plt.subplot(4, 1, i + 1) # 定义子图的位置数量等
sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 0, feature], label='target == 0')
sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 1, feature], label='target == 1')
plt.title('Distribution of %s by Target Value' % feature)
plt.xlabel('%s' % feature);
plt.ylabel('Density');
plt.tight_layout(h_pad=2.5)
plt.show()
'''
pad:调整边框边距
w_pad:调整横宽边距
h_pad:调整纵宽边距
'''
# 数据预处理:特征好了后,检查下整合没啥问题就建模了
Y = app_train['TARGET']
X = app_train.drop(columns = ['TARGET'])
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
imputer = SimpleImputer(strategy='median')
std = StandardScaler()
# 填充
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
# 标准化
std.fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train,Y_train)
# 用混淆矩阵
predictions = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, predictions)
# confusion_metrix = 70787/70787+6091 = 92%
# roc_auc_score
predictions_2 = classifier.predict_proba(X_test)[:,1]
from sklearn.metrics import roc_auc_score
test_auc = roc_auc_score(Y_test,predictions_2)
# test_auc = 0.7434
# Visualising the Trainning ser results
from matplotlib.colors import ListedColormap # 给不同的点上不同的颜色
X_set, Y_set = X_train, Y_train
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1, # -1 / +1 都能更方便我们看生成好的图像
stop = X_set[:,0].max()+1,
step = 0.01), # 0.01 看显示屏的参数来设定
np.arange(start = X_set[:,1].min()-1,
stop = X_set[:,1].max()+1,
step = 0.01))
plt.contourf(
x1, x2, classifier.predict(
np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
alpha = 0.75,
cmap = ListedColormap(('red', 'green'))
)
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)): # 画出实际存在的点
plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Training Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, Y_set = X_test, Y_test
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1,
stop = X_set[:,0].max()+1,
step = 0.01),
np.arange(start = X_set[:,1].min()-1,
stop = X_set[:,1].max()+1,
step = 0.01))
plt.contourf(
x1, x2, classifier.predict(
np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
alpha = 0.75,
cmap = ListedColormap(('red', 'green'))
)
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)):
plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Test Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
运行结果及报错内容
Traceback (most recent call last):
File "<input>", line 210, in <module>
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 447, in predict
scores = self.decision_function(X)
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 429, in decision_function
X = self._validate_data(X, accept_sparse="csr", reset=False)
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/base.py", line 600, in _validate_data
self._check_n_features(X, reset=reset)
File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/base.py", line 400, in _check_n_features
raise ValueError(
ValueError: X has 2 features, but LogisticRegression is expecting 243 features as input.
我的解答思路和尝试过的方法:
大概只看到这个比较类似,但不知道怎么修改自己的代码:
https://blog.csdn.net/qq_45128278/article/details/120609776
我想要达到的结果
但是同样的代码我取另一份没那么多维度的数据集来操作的时候,就能成功画出这幅图: