qq_36392286 2021-06-21 20:51 采纳率: 50%
浏览 66
已采纳

为什么LogisticRegression.fit报错“Expected 2D array, got 1D array”(传入的训练数据变成了1D)

  

 

报错信息: 

ValueError: Expected 2D array, got 1D array instead

 

# 信用卡交易数据异常检测问题
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Interactive plotting mode.
plt.ion()

# Load the credit-card transactions and show the class distribution.
data = pd.read_csv("../data/creditcard.csv")
print(data.head())
count_classes = pd.value_counts(data["Class"], sort=True).sort_index()
print("------------------------------------------------------------------")
print(count_classes)  # class 0 (normal): 284315  |  class 1 (fraud): 492
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# The bare string below is the author's note: the classes are heavily
# imbalanced; under-sampling shrinks both classes to the smaller size,
# over-sampling grows both to the larger size.
'''
    样本数据不均衡,即一个数据极大,而另一个数据极小
    下采样策略,使两种样本同样少
    过采样策略,使两种样本同样多
'''

# Standardize Amount with StandardScaler and store it as normAmount.
data["normAmount"] = StandardScaler().fit_transform(data["Amount"].values.reshape(-1, 1))
data = data.drop(["Time", "Amount"], axis=1)
print("------------------------------------------------------------------")
print(data.head())
print("------------------------------------------------------------------")
# Features are every column except Class; the label frame keeps only Class.
x = data.iloc[:, data.columns != "Class"]
y = data.iloc[:, data.columns == "Class"]
# Fraud rows (Class == 1)
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
# Normal rows (Class == 0)
normal_indices = data[data.Class == 0].index

# Under-sampling: randomly draw, without replacement, as many normal rows as there are fraud rows
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Combine the fraud indices and the sampled normal indices
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# NOTE(review): these are index labels passed to .iloc; they coincide with
# positions only while data keeps the default 0..n-1 index from read_csv.
under_sample_data = data.iloc[under_sample_indices, :]

x_undersample = under_sample_data.iloc[:, under_sample_data.columns != "Class"]
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == "Class"]

print("Percentage of normal transactions: ",
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ",
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))

# test_size=0.3: 30% of the rows form the test set, 70% the training set; random_state=0 makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print("------------------------------------------------------------------")
print("Number transactions train dataset: ", len(x_train))
print("Number of transactions test dataset: ", len(x_test))
print("Total number of transactions: ", len(x_train) + len(x_test))
print("------------------------------------------------------------------")

# Same 70/30 split, applied to the under-sampled data.
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undertrainsample = train_test_split(x_undersample,
                                                                                                         y_undersample,
                                                                                                         test_size=0.3,
                                                                                                         random_state=0)
print("Number transactions train dataset: ", len(x_train_undersample))
print("Number of transactions test dataset: ", len(x_test_undersample))
print("Total number of transactions: ", len(x_train_undersample) + len(x_test_undersample))
print("------------------------------------------------------------------")

# 召回率Recall = TP/(FN+TP)
from sklearn.linear_model import LogisticRegression
# KFold——做几倍的交叉验证——即将原始数据集切分数据集,cross_val_score交叉验证评估结果
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report

# 逻辑回归模型

def printing_Kfold_scores(x_train_data, y_train_data):
    """Search a fixed list of C values for the best mean recall with 5-fold splits.

    For each C the function is meant to fit an L1-penalised LogisticRegression
    per fold, score recall on the held-out part, print the scores and return
    the C with the highest mean recall.

    NOTE: this is the version from the question and it does not run to
    completion; it raises ValueError at the ``lr.fit`` call (see the comments
    inside the inner loop).

    Args:
        x_train_data: Feature DataFrame (indexed with ``.iloc`` below).
        y_train_data: Label DataFrame (indexed with ``.iloc[..., :]`` below).

    Returns:
        The ``C_parameter`` value of the row with the highest mean recall.
    """
    fold = KFold(5, shuffle=False)  # split the data into five folds
    c_param_range = [0.01, 0.1, 1, 10, 100]  # candidate values for the C (penalty) parameter

    # NOTE(review): range(len(c_param_range), 2) is range(5, 2), i.e. empty.
    results_table = pd.DataFrame(index=range(len(c_param_range), 2), columns=["C_parameter", "Mean recall score"])
    results_table["C_parameter"] = c_param_range

    j = 0
    for c_param in c_param_range:
        print("------------------------------------------------------------------")
        print("C_parameter: ", c_param)
        print("------------------------------------------------------------------")
        print("")

        recall_accs = []
        for iteration, indices in fold.split(y_train_data):
            # KFold.split yields (train index array, test index array), so here
            # `iteration` holds the training indices and `indices` the test indices.
            # Logistic regression: C sets the penalty strength; penalty may be
            # 'l1' (absolute-value penalty) or 'l2' (squared penalty).
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            print("test--------------------------------------------------------------")
            # indices[0] is a single integer, so .iloc[indices[0], :] selects ONE
            # row and .values is a 1-D array of shape (29,).
            print(x_train_data.iloc[indices[0], :].values)
            print(y_train_data.iloc[indices[0], :].values.ravel())
            print("------------------------------------------------------------------")
            # Fit the model on the training data. This call raises the
            # ValueError because the feature argument is the 1-D single row.
            lr.fit(x_train_data.iloc[indices[0], :].values, y_train_data.iloc[indices[0], :].values.ravel())
            # The problem is on line 107 of the original file (the lr.fit call above):
            # ValueError: Expected 2D array, got 1D array instead:
            # array=[-1.86375555  3.44264398 -4.46825973  2.80533626 -2.11841248 -2.33228489
            # -4.2612372   1.70168184 -1.43939588 -6.99990663  6.31620968 -8.670818
            # 0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
            # 0.36481048  0.360924    0.66792657 -0.51624236 -0.01221781  0.0706137
            # 0.05850447  0.30488284  0.41801247  0.20885828 -0.34923131]. (the data of x_train_data.iloc[indices[0], :])
            # Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

            # With the model fitted, predict on the validation part (index 1)
            y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)

            # Compute the recall
            recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print("Iteration ", iteration, " : recall score = ", recall_acc)

        # NOTE(review): DataFrame.ix was removed in pandas 1.0; .loc is the replacement.
        results_table.ix[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print('Mean recall score ', np.mean(recall_accs))
        print('')

    # Row with the highest mean recall determines the returned C.
    best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']

    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c


# Run the C-parameter search on the under-sampled training split.
best_c = printing_Kfold_scores(x_train_undersample, y_train_undersample)
   Time        V1        V2        V3  ...       V27       V28  Amount  Class
0   0.0 -1.359807 -0.072781  2.536347  ...  0.133558 -0.021053  149.62      0
1   0.0  1.191857  0.266151  0.166480  ... -0.008983  0.014724    2.69      0
2   1.0 -1.358354 -1.340163  1.773209  ... -0.055353 -0.059752  378.66      0
3   1.0 -0.966272 -0.185226  1.792993  ...  0.062723  0.061458  123.50      0
4   2.0 -1.158233  0.877737  1.548718  ...  0.219422  0.215153   69.99      0

[5 rows x 31 columns]
------------------------------------------------------------------
0    284315
1       492
Name: Class, dtype: int64
------------------------------------------------------------------
         V1        V2        V3  ...       V28  Class  normAmount
0 -1.359807 -0.072781  2.536347  ... -0.021053      0    0.244964
1  1.191857  0.266151  0.166480  ...  0.014724      0   -0.342475
2 -1.358354 -1.340163  1.773209  ... -0.059752      0    1.160686
3 -0.966272 -0.185226  1.792993  ...  0.061458      0    0.140534
4 -1.158233  0.877737  1.548718  ...  0.215153      0   -0.073403

[5 rows x 30 columns]
------------------------------------------------------------------
Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  984
------------------------------------------------------------------
Number transactions train dataset:  199364
Number of transactions test dataset:  85443
Total number of transactions:  284807
------------------------------------------------------------------
Number transactions train dataset:  688
Number of transactions test dataset:  296
Total number of transactions:  984
------------------------------------------------------------------
------------------------------------------------------------------
C_parameter:  0.01
------------------------------------------------------------------

test--------------------------------------------------------------
[-1.86375555  3.44264398 -4.46825973  2.80533626 -2.11841248 -2.33228489
 -4.2612372   1.70168184 -1.43939588 -6.99990663  6.31620968 -8.670818
  0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
  0.36481048  0.360924    0.66792657 -0.51624236 -0.01221781  0.0706137
  0.05850447  0.30488284  0.41801247  0.20885828 -0.34923131]
[1]
------------------------------------------------------------------
Traceback (most recent call last):
  File "F:/code/deeplearning/DeepLearning/logistic_regression/learning12_transaction_data_anomaly_detection.py", line 140, in <module>
    best_c = printing_Kfold_scores(x_train_undersample, y_train_undersample)
  File "F:/code/deeplearning/DeepLearning/logistic_regression/learning12_transaction_data_anomaly_detection.py", line 107, in printing_Kfold_scores
    lr.fit(x_train_data.iloc[indices[0], :].values, y_train_data.iloc[indices[0], :].values.ravel())
  File "C:\Users\lhw\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1346, in fit
    accept_large_sparse=solver != 'liblinear')
  File "C:\Users\lhw\python\lib\site-packages\sklearn\base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 878, in check_X_y
    estimator=estimator)
  File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 698, in check_array
    "if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[-1.86375555  3.44264398 -4.46825973  2.80533626 -2.11841248 -2.33228489
 -4.2612372   1.70168184 -1.43939588 -6.99990663  6.31620968 -8.670818
  0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
  0.36481048  0.360924    0.66792657 -0.51624236 -0.01221781  0.0706137
  0.05850447  0.30488284  0.41801247  0.20885828 -0.34923131].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

Process finished with exit code 1

 打印x_train_data.iloc[indices[0], :].values显示如下

print(x_train_data.iloc[indices[0], :].values)
print(y_train_data.iloc[indices[0], :].values.ravel())
print(type(x_train_data.iloc[indices[0], :].values))
print(x_train_data.iloc[indices[0], :].values.shape)

[-1.86375555  3.44264398 -4.46825973  2.80533626 -2.11841248 -2.33228489
 -4.2612372   1.70168184 -1.43939588 -6.99990663  6.31620968 -8.670818
  0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
  0.36481048  0.360924    0.66792657 -0.51624236 -0.01221781  0.0706137
  0.05850447  0.30488284  0.41801247  0.20885828 -0.34923131]
[1]
<class 'numpy.ndarray'>
(29,)

照理是符合参数要求的,但是一直报错,想不明白为什么,萌新求助!

 

感谢社区大佬指点!使我明白了我错在哪里

 

终于修改完毕

事实上在这一步就有错:(一直没发现copy的时候copy错了,真是糊涂妈妈给糊涂开门,糊涂到家了(* ̄︿ ̄))

for iteration, indices in fold.split(y_train_data)

应该是

for iteration, indices in fold.split(x_train_data)

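然后对于fit函数的参数也需要进行修改:新版的KFold.split()每次迭代直接返回(训练集索引数组, 验证集索引数组),而原代码里的indices[0]只取到了一个整数下标,于是iloc只选出了一行,得到形状为(29,)的一维数组,这才是报错的直接原因。此前我一直没理解这一点,在看过这篇文章后才明白了新旧方法间的差异: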

https://stackoverflow.com/questions/48641290/typeerror-kfold-object-is-not-iterable

以下是修改后的代码

# 信用卡交易数据异常检测问题
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Interactive plotting mode.
plt.ion()

# Load the credit-card transactions and show the class distribution.
data = pd.read_csv("../data/creditcard.csv")
print(data.head())
# Series.value_counts replaces the deprecated top-level pd.value_counts.
count_classes = data["Class"].value_counts(sort=True).sort_index()
print("------------------------------------------------------------------")
print(count_classes)  # class 0 (normal): 284315  |  class 1 (fraud): 492
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# The classes are heavily imbalanced: one is huge, the other tiny.
#   - under-sampling: shrink both classes to the size of the smaller one
#   - over-sampling:  grow both classes to the size of the larger one

# Standardize Amount with StandardScaler and store it as normAmount.
data["normAmount"] = StandardScaler().fit_transform(data["Amount"].values.reshape(-1, 1))
data = data.drop(["Time", "Amount"], axis=1)
print("------------------------------------------------------------------")
print(data.head())
print("------------------------------------------------------------------")
# Features are every column except Class; the label frame keeps only Class.
x = data.iloc[:, data.columns != "Class"]
y = data.iloc[:, data.columns == "Class"]

# Index labels of the fraud rows (Class == 1) and the normal rows (Class == 0).
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)
normal_indices = data[data.Class == 0].index

# Under-sampling: randomly draw, without replacement, as many normal rows as
# there are fraud rows. np.random.choice already returns an ndarray.
# The draw is unseeded, so the sampled rows differ between runs.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# Combine both groups. The values are index *labels*, so select with .loc;
# .iloc would only agree while the frame keeps its default 0..n-1 index.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]

x_undersample = under_sample_data.iloc[:, under_sample_data.columns != "Class"]
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == "Class"]

print("Percentage of normal transactions: ",
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ",
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))

# test_size=0.3: 30% of the rows form the test set, 70% the training set;
# random_state=0 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print("------------------------------------------------------------------")
print("Number transactions train dataset: ", len(x_train))
print("Number of transactions test dataset: ", len(x_test))
print("Total number of transactions: ", len(x_train) + len(x_test))
print("------------------------------------------------------------------")

# Same 70/30 split, applied to the under-sampled data.
x_train_undersample, x_test_undersample, y_train_undersample, y_test_undertrainsample = train_test_split(
    x_undersample,
    y_undersample,
    test_size=0.3,
    random_state=0,
)
print("Number transactions train dataset: ", len(x_train_undersample))
print("Number of transactions test dataset: ", len(x_test_undersample))
print("Total number of transactions: ", len(x_train_undersample) + len(x_test_undersample))
print("------------------------------------------------------------------")

# 召回率Recall = TP/(FN+TP)
from sklearn.linear_model import LogisticRegression
# KFold——做几倍的交叉验证——即将原始数据集切分数据集,cross_val_score交叉验证评估结果
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report

# 逻辑回归模型


def printing_Kfold_scores(x_train_data, y_train_data):
    """Select the LogisticRegression ``C`` with the highest mean recall under 5-fold CV.

    For every candidate ``C`` an L1-penalised ``LogisticRegression``
    (liblinear solver) is fitted on the training part of each fold and scored
    with ``recall_score`` on the held-out part. The per-fold scores, the
    per-``C`` mean and a summary table are printed.

    Args:
        x_train_data: Feature matrix, one row per sample (DataFrame or 2-D array).
        y_train_data: Labels aligned with ``x_train_data``; a single-column
            DataFrame, a Series or a 1-D array.

    Returns:
        The candidate ``C`` whose mean recall over the folds is highest
        (the first such candidate on ties).
    """
    fold = KFold(5, shuffle=False)  # five consecutive folds, no shuffling
    c_param_range = [0.01, 0.1, 1, 10, 100]  # candidate values for C

    # Convert once to plain arrays: fit() and predict() then receive the same
    # kind of input, and the fold positions can be applied directly.
    features = np.asarray(x_train_data)
    labels = np.asarray(y_train_data).ravel()

    mean_recalls = []
    for c_param in c_param_range:
        print("------------------------------------------------------------------")
        print("C_parameter: ", c_param)
        print("------------------------------------------------------------------")
        print("")

        recall_accs = []
        # split() yields (training positions, validation positions) per fold.
        for iteration, (train_idx, valid_idx) in enumerate(fold.split(features), start=1):
            # C sets the penalty strength; 'l1' is the absolute-value penalty.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
            lr.fit(features[train_idx], labels[train_idx])

            # Score the fitted model on the held-out part of this fold.
            y_pred_undersample = lr.predict(features[valid_idx])
            recall_acc = recall_score(labels[valid_idx], y_pred_undersample)
            recall_accs.append(recall_acc)
            print("Iteration ", iteration, " : recall score = ", recall_acc)

        mean_recall = np.mean(recall_accs)
        mean_recalls.append(mean_recall)
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    # Build the table from complete columns instead of filling an empty frame.
    results_table = pd.DataFrame({
        "C_parameter": c_param_range,
        "Mean recall score": mean_recalls,
    })

    print("+++++++++++++++++++++++++++++++ recall score list +++++++++++++++++++++++++++++++")
    print(results_table)
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    best_row = results_table['Mean recall score'].idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']

    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')

    return best_c


# Run the C-parameter search on the under-sampled training split.
best_c = printing_Kfold_scores(x_train_undersample, y_train_undersample)
  • 写回答

3条回答 默认 最新

  • 关注

    在新版的sklearn中,所有的数据都应该是二维矩阵,哪怕它只是单独一行或一列(比如前面做预测时,仅仅只用了一个样本数据),所以需要使用.reshape(1,-1)进行转换

    具体可以参考:使用sklearn报错ValueError: Expected 2D array, got 1D array instead - 简书 (jianshu.com)

    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论
查看更多回答(2条)

报告相同问题?

悬赏问题

  • ¥15 树莓派与pix飞控通信
  • ¥15 自动转发微信群信息到另外一个微信群
  • ¥15 outlook无法配置成功
  • ¥30 这是哪个作者做的宝宝起名网站
  • ¥60 版本过低apk如何修改可以兼容新的安卓系统
  • ¥25 由IPR导致的DRIVER_POWER_STATE_FAILURE蓝屏
  • ¥50 有数据,怎么建立模型求影响全要素生产率的因素
  • ¥50 有数据,怎么用matlab求全要素生产率
  • ¥15 TI的insta-spin例程
  • ¥15 完成下列问题完成下列问题