颜值 > 实力 2022-07-29 13:43 采纳率: 50%
浏览 525
已结题

X has 2 features per sample;

问题遇到的现象和发生背景

本人正在练习项目——对银行还款进行测试,其中的预测结果已经出来,但是想把结果进行可视化的时候遇到了问题,经过一系列排查,应该是 plt.contourf里面的那个ravel()部分出了问题
(请直接跳转到代码最底下部分的:Visulising the Traning Set Result)

问题相关代码
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

# 经典而分类问题
app_train = pd.read_csv('/Users/iven/Desktop/Python机器学习实战/第十一章:银行客户还款可能性预测/application_train.csv')

# 展示缺失值
def missing_value_table(df):
    mis_val = df.isnull().sum() # 计算所有缺失值
    mis_val_percent = 100 * df.isnull().sum() / len(df) # %比
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    # 做成表格显示出来
    mis_val_rename = mis_val_table.rename(columns={0:'Missing valyes',1:'% of total values'})
    # 剔除完整的并排序(sort_value) 升序
    mis_val_rename = mis_val_rename[mis_val_rename.iloc[:, 1] != 0].sort_values('% of total values', ascending = False)
    return mis_val_rename
missing_value_table(app_train)[:10]
'''
axis=0代表往跨行(down),而axis=1代表跨列(across)
使用0值表示沿着每一列或行标签/索引值向下执行方法
使用1值表示沿着每一行或者列标签横向执行对应的方法
'''

# Object 类型数据处理.
# 特征个数大于2时候,一般用One-Hot去处理,而小于2的时候 label-eco
app_train.dtypes.value_counts()
app_train.select_dtypes('object').apply(pd.Series.nunique,axis=0) #  Pandas nunique() 用于获取唯一值的统计次数。
le = LabelEncoder()
for col in app_train:
    if app_train[col].dtype == 'object':
        if len(list(app_train[col].unique()))<=2:
            le.fit(app_train[col])
            app_train[col]=le.transform(app_train[col])
app_train = pd.get_dummies(app_train)
app_train.shape

# EDA分析 特征分析
train_labels = app_train['TARGET']
app_train['DAYS_BIRTH'][:5] # 贷款的人从出生到现在"活"了多少天
# 因此,我们需要转换成年
(app_train['DAYS_BIRTH']/-365).describe()
(app_train['DAYS_EMPLOYED']).describe()

app_train['DAYS_EMPLOYED'].plot.hist()
plt.show()

app_train['DAYS_EMPLOYED_ANOM'] = app_train['DAYS_EMPLOYED'] == 365243
app_train['DAYS_EMPLOYED'].replace({365243:np.nan},inplace=True)
app_train['DAYS_EMPLOYED'].plot.hist()
plt.show()

correlations = app_train.corr()['TARGET'].sort_values()
correlations.head()
correlations.tail()
# 但是对于年龄,它是负数
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_train['TARGET'].corr(app_train['DAYS_BIRTH'])
# 变负了
plt.figure(figsize = (12,6))
plt.style.use('fivethirtyeight') # 图表风格 去SeaBorn看就ok
plt.hist(app_train['DAYS_BIRTH']/365,edgecolor='k',bins=25)
plt.show()

plt.figure(figsize=(16,8))
#KDEPLOT
sns.kdeplot(app_train.loc[app_train['TARGET']==0,'DAYS_BIRTH']/365,label='target==0')
sns.kdeplot(app_train.loc[app_train['TARGET']==1,'DAYS_BIRTH']/365,label='target==1')
plt.show()
# 能用KDEPLot显示就先用,因为这是连续的,更加直观
# 不还钱的人都是30岁左右的人

age_data = app_train[['TARGET','DAYS_BIRTH']]
age_data['YEARS_BIRTH'] = age_data['DAYS_BIRTH']/365
age_data['YEARS_BINNED'] = pd.cut(age_data['YEARS_BIRTH'],bins=np.linspace(20,70,num=11)) # 设置年龄区间
age_data.head()

age_groups = age_data.groupby('YEARS_BINNED').mean()

plt.figure(figsize=(16,16))
plt.bar(age_groups.index.astype(str),100*age_groups['TARGET'])
plt.xticks(rotation=30) # 坐标轴,多少度
plt.show()

ext_data = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
ext_data_corrs = ext_data.corr()
plt.figure(figsize=(20,8))
sns.heatmap(ext_data_corrs,cmap = plt.cm.RdYlBu_r, linewidths = .5, annot=True)
plt.show()
# http://seaborn.pydata.org/generated/seaborn.heatmap.html

plt.figure(figsize=(16,10))
for i,source in enumerate(['EXT_SOURCE_3','EXT_SOURCE_2','EXT_SOURCE_1']):
    # 指定好子图的位置
    plt.subplot(3,1,i+1) # 3行1列,位置i=0 i+1
    # kdeplot
    sns.kdeplot(app_train.loc[app_train['TARGET']==0,source]/365,label='target==0')
    sns.kdeplot(app_train.loc[app_train['TARGET']==1,source]/365,label='target==1')
    plt.title('D of %s' % source)
plt.tight_layout(h_pad=2.5) # 布局 间隙
plt.show()

# 特征工程(多项式回归) X次方越大,越准确
poly_features = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

# 特征工程之前 - 缺失值填充
imputer = SimpleImputer(strategy='median') # 类似于拟合器,遇到缺失值就用中位数来填补
poly_target = poly_features['TARGET']
poly_features.drop(columns=['TARGET'],inplace =True) # 除了Target列的其他
poly_features = imputer.fit_transform(poly_features) # 拟合

poly_transformer = PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features)
poly_features = poly_transformer.transform(poly_features)
# poly_features.shape 从4个特征变成了35个

poly_transformer.get_feature_names(input_features=['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])[:20]
# 将当前得到的部分特征跟总体组合在一起
poly_features = pd.DataFrame(
    poly_features,
    columns = poly_transformer.get_feature_names(input_features=['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH'])
)

# 与之前100个指标组合在一起
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR'] # ID是不会改变的 根据ID去传进去
app_train_poly = app_train.merge(poly_features, on='SK_ID_CURR', how='left')

# 根据实际情况来创建特征
# 例如对时间特征,可以分出来什么特征呢? 数据挖掘——90%时间都在和小特征打交道,不要忽视任何一个小的特征
# 建模其实没啥花时间
app_train_domain = app_train.copy() # 不要乱改,防止改乱 类似于创建副本

app_train_domain['CREDIT_INCOME_PERCENT'] = app_train_domain['AMT_CREDIT'] / app_train_domain['AMT_INCOME_TOTAL'] # 信用额度与工资比值
app_train_domain['ANNUITY_INCOME_PERCENT'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_INCOME_TOTAL'] # 没(每年)还款年金和工资的比值
app_train_domain['CREDIT_TERM'] = app_train_domain['AMT_ANNUITY'] / app_train_domain['AMT_CREDIT'] # 还款总月份
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['DAYS_EMPLOYED'] / app_train_domain['DAYS_BIRTH'] # 上班时间和年龄的比值
# 这是加到了最初的表,和上面的特征工程没啥关系,这是DIY的,所以新的列数为248而不是279

plt.figure(figsize=(16, 20))
for i, feature in enumerate(
        ['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']):
    plt.subplot(4, 1, i + 1) # 定义子图的位置数量等
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 0, feature], label='target == 0')
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 1, feature], label='target == 1')

    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature);
    plt.ylabel('Density');

plt.tight_layout(h_pad=2.5)
plt.show()
'''
pad:调整边框边距
w_pad:调整横宽边距
h_pad:调整纵宽边距
'''

# 数据预处理:特征好了后,检查下整合没啥问题就建模了
Y = app_train['TARGET']
X = app_train.drop(columns = ['TARGET'])

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
imputer = SimpleImputer(strategy='median')
std = StandardScaler()
# 填充
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)
# 标准化
std.fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train,Y_train)

# 用混淆矩阵
predictions = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, predictions)
# confusion_metrix = 70787/70787+6091 = 92%

# roc_auc_score
predictions_2 = classifier.predict_proba(X_test)[:,1]
from sklearn.metrics import roc_auc_score
test_auc = roc_auc_score(Y_test,predictions_2)
# test_auc = 0.7434

# Visualising the Trainning ser results
from matplotlib.colors import ListedColormap # 给不同的点上不同的颜色
X_set, Y_set = X_train, Y_train
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1, # -1 / +1 都能更方便我们看生成好的图像
                               stop = X_set[:,0].max()+1,
                               step = 0.01), # 0.01 看显示屏的参数来设定
                     np.arange(start = X_set[:,1].min()-1,
                               stop = X_set[:,1].max()+1,
                               step = 0.01))
plt.contourf(
    x1, x2, classifier.predict(
        np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
    alpha = 0.75,
    cmap = ListedColormap(('red', 'green'))
    )
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)): # 画出实际存在的点
    plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
                c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Training Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, Y_set = X_test, Y_test
x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1,
                               stop = X_set[:,0].max()+1,
                               step = 0.01),
                     np.arange(start = X_set[:,1].min()-1,
                               stop = X_set[:,1].max()+1,
                               step = 0.01))
plt.contourf(
    x1, x2, classifier.predict(
        np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
    alpha = 0.75,
    cmap = ListedColormap(('red', 'green'))
    )
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
for i, j in enumerate(np.unique(Y_set)):
    plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
                c = ListedColormap(('orange','blue'))(i),label=j)
plt.title('Classifier (Test Set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

运行结果及报错内容
Traceback (most recent call last):
  File "<input>", line 210, in <module>
  File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 447, in predict
    scores = self.decision_function(X)
  File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/linear_model/_base.py", line 429, in decision_function
    X = self._validate_data(X, accept_sparse="csr", reset=False)
  File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/base.py", line 600, in _validate_data
    self._check_n_features(X, reset=reset)
  File "/Users/iven/PycharmProjects/pythonProject/venv/lib/python3.8/site-packages/sklearn/base.py", line 400, in _check_n_features
    raise ValueError(
ValueError: X has 2 features, but LogisticRegression is expecting 243 features as input.
我的解答思路和尝试过的方法:

大概只看到这个比较类似,但不知道怎么修改自己的代码:
https://blog.csdn.net/qq_45128278/article/details/120609776

我想要达到的结果

但是同样的代码我取另一份没那么多维度的数据集来操作的时候,就能成功画出这幅图:

img

  • 写回答

1条回答 默认 最新

  • herosunly Python领域优质创作者 2022-07-29 16:12
    关注

    plt.contourf绘制的图是基于其中某两个特征的,需要重新构建分类器,并且选择数据集其中的某两个特征,代码以前两个特征为例,即代码中的0: 2,PS:由于代码太长,我就不一一复制了,从195行开始哈:

    predictions_2 = classifier.predict_proba(X_test)[:,1]
    from sklearn.metrics import roc_auc_score
    test_auc = roc_auc_score(Y_test,predictions_2)
    # test_auc = 0.7434
    
    # 为了避免和之前的分类器重复,所以新起了个名字
    classifier_new = LogisticRegression(random_state = 0)
    classifier_new.fit(X_train[:, 0: 2], Y_train) # 0: 2表示的是前两个特征
    
    from matplotlib.colors import ListedColormap # 给不同的点上不同的颜色
    X_set, Y_set = X_train[:, 0: 2], Y_train # 0: 2表示的是前两个特征
    x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1, # -1 / +1 都能更方便我们看生成好的图像
                                   stop = X_set[:,0].max()+1,
                                   step = 0.01), # 0.01 看显示屏的参数来设定
                         np.arange(start = X_set[:,1].min()-1,
                                   stop = X_set[:,1].max()+1,
                                   step = 0.01))
    plt.contourf(
        x1, x2, classifier_new.predict( # 这一行修改了
            np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
        alpha = 0.75,
        cmap = ListedColormap(('red', 'green'))
        )
    plt.xlim(x1.min(), x1.max())
    plt.ylim(x2.min(), x2.max())
    for i, j in enumerate(np.unique(Y_set)): # 画出实际存在的点
        plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
                    c = ListedColormap(('orange','blue'))(i),label=j)
    plt.title('Classifier (Training Set)')
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')
    plt.legend()
    plt.show()
     
    # Visualising the Test set results
    from matplotlib.colors import ListedColormap
    X_set, Y_set = X_test[:, 0: 2], Y_test # 0:2表示的是前两个特征
    x1, x2 = np.meshgrid(np.arange(start = X_set[:,0].min()-1,
                                   stop = X_set[:,0].max()+1,
                                   step = 0.01),
                         np.arange(start = X_set[:,1].min()-1,
                                   stop = X_set[:,1].max()+1,
                                   step = 0.01))
    plt.contourf(
        x1, x2, classifier_new.predict(
            np.array([x1.ravel(),x2.ravel()]).T).reshape(x1.shape),
        alpha = 0.75,
        cmap = ListedColormap(('red', 'green'))
        )
    plt.xlim(x1.min(), x1.max())
    plt.ylim(x2.min(), x2.max())
    for i, j in enumerate(np.unique(Y_set)):
        plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
                    c = ListedColormap(('orange','blue'))(i),label=j)
    plt.title('Classifier (Test Set)')
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')
    plt.legend()
    plt.show()
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论

报告相同问题?

问题事件

  • 系统已结题 8月7日
  • 已采纳回答 7月30日
  • 创建了问题 7月29日

悬赏问题

  • ¥15 请分析一下这个电路设计的优点🙏
  • ¥15 求视频摘要youtube和ovp数据集
  • ¥15 怎么改成输入一个要删除的数后现实剩余的数再输入一个删除的数再现实剩余的数用yes表示继续no结束程序
  • ¥15 在启动roslaunch时出现如下问题
  • ¥15 汇编语言实现加减法计算器的功能
  • ¥20 关于多单片机模块化的一些问题
  • ¥30 seata使用出现报错,其他服务找不到seata
  • ¥35 引用csv数据文件(4列1800行),通过高斯-赛德尔法拟合曲线,在选取(每五十点取1点)数据,求该数据点的曲率中心。
  • ¥20 程序只发送0X01,串口助手显示不正确,配置看了没有问题115200-8-1-no,如何解决?
  • ¥15 Google speech command 数据集获取