报错信息:
ValueError: Expected 2D array, got 1D array instead
# 信用卡交易数据异常检测问题
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
plt.ion()

# Load the raw transactions and inspect the class balance.
data = pd.read_csv("../data/creditcard.csv")
print(data.head())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
count_classes = data["Class"].value_counts(sort=True).sort_index()
print("------------------------------------------------------------------")
print(count_classes)  # Class 0 (normal): 284315 | Class 1 (fraud): 492
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# The classes are heavily imbalanced. Under-sampling shrinks the majority
# class down to the minority size; over-sampling grows the minority class
# instead. (The string below is the author's original note, kept verbatim.)
'''
样本数据不均衡,即一个数据极大,而另一个数据极小
下采样策略,使两种样本同样少
过采样策略,使两种样本同样多
'''

# Standardise Amount into a new column, then drop the raw Time/Amount columns.
data["normAmount"] = StandardScaler().fit_transform(data["Amount"].values.reshape(-1, 1))
data = data.drop(["Time", "Amount"], axis=1)
print("------------------------------------------------------------------")
print(data.head())
print("------------------------------------------------------------------")

# Features are every column except Class. The label is kept as a one-column
# DataFrame because printing_Kfold_scores indexes it with .iloc[rows, :].
x = data.loc[:, data.columns != "Class"]
y = data.loc[:, data.columns == "Class"]

# Fraud rows (Class == 1).
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
# Normal rows (Class == 0).
normal_indices = data[data.Class == 0].index

# Under-sample: draw as many normal rows as there are fraud rows, without
# replacement. No seed is set in this script, so the draw differs per run.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# Combine both groups. These are index *labels* (taken from .index), so the
# rows are selected with .loc rather than positional .iloc.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]
x_undersample = under_sample_data.loc[:, under_sample_data.columns != "Class"]
y_undersample = under_sample_data.loc[:, under_sample_data.columns == "Class"]
print("Percentage of normal transactions: ",
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ",
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))

# test_size=0.3 keeps 30% of the rows for testing and 70% for training;
# random_state=0 makes the split itself reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print("------------------------------------------------------------------")
print("Number transactions train dataset: ", len(x_train))
print("Number of transactions test dataset: ", len(x_test))
print("Total number of transactions: ", len(x_train) + len(x_test))
print("------------------------------------------------------------------")

# Same 70/30 split on the balanced, under-sampled data.
(x_train_undersample, x_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(x_undersample,
                                                             y_undersample,
                                                             test_size=0.3,
                                                             random_state=0)
# Backward-compatible alias for the original (misspelled) variable name.
y_test_undertrainsample = y_test_undersample
print("Number transactions train dataset: ", len(x_train_undersample))
print("Number of transactions test dataset: ", len(x_test_undersample))
print("Total number of transactions: ", len(x_train_undersample) + len(x_test_undersample))
print("------------------------------------------------------------------")
# 召回率Recall = TP/(FN+TP)
from sklearn.linear_model import LogisticRegression
# KFold——做几倍的交叉验证——即将原始数据集切分数据集,cross_val_score交叉验证评估结果
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
# 逻辑回归模型
# Question version of the C-parameter search (indentation was lost in the paste).
# For each candidate C it is meant to run 5-fold cross-validation with an
# L1-penalised logistic regression, record the mean recall, and return the C
# with the highest mean recall. As written it fails with the ValueError quoted
# in the comments below; the code is kept as posted so the traceback still matches.
def printing_Kfold_scores(x_train_data, y_train_data):
fold = KFold(5, shuffle=False) # split the data into five folds
c_param_range = [0.01, 0.1, 1, 10, 100] # candidate values for LogisticRegression's C
# NOTE(review): range(len(c_param_range), 2) is range(5, 2), i.e. empty; the
# table only gets its rows from the column assignment on the next line.
results_table = pd.DataFrame(index=range(len(c_param_range), 2), columns=["C_parameter", "Mean recall score"])
results_table["C_parameter"] = c_param_range
j = 0
for c_param in c_param_range:
print("------------------------------------------------------------------")
print("C_parameter: ", c_param)
print("------------------------------------------------------------------")
print("")
recall_accs = []
for iteration, indices in fold.split(y_train_data):
# KFold.split yields (train indices, validation indices) pairs, so here
# `iteration` is the training index array and `indices` the validation index array.
# Logistic regression: C sets the regularisation strength; penalty may be
# 'l1' (absolute-value penalty) or 'l2' (squared penalty).
lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')
print("test--------------------------------------------------------------")
print(x_train_data.iloc[indices[0], :].values)
print(y_train_data.iloc[indices[0], :].values.ravel())
print("------------------------------------------------------------------")
# Fit the model for this C on the training data.
# NOTE(review): indices[0] is a single integer (the first validation index),
# so .iloc[indices[0], :] selects one row and .values is 1-D, shape (29,).
lr.fit(x_train_data.iloc[indices[0], :].values, y_train_data.iloc[indices[0], :].values.ravel())
# The fit call above (line 107 of the original script) is where it fails:
# ValueError: Expected 2D array, got 1D array instead:
# array=[-1.86375555 3.44264398 -4.46825973 2.80533626 -2.11841248 -2.33228489
# -4.2612372 1.70168184 -1.43939588 -6.99990663 6.31620968 -8.670818
# 0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
# 0.36481048 0.360924 0.66792657 -0.51624236 -0.01221781 0.0706137
# 0.05850447 0.30488284 0.41801247 0.20885828 -0.34923131]. (the data of x_train_data.iloc[indices[0], :])
# Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
# With the model fitted, predict on the validation set (index 1).
y_pred_undersample = lr.predict(x_train_data.iloc[indices[1], :].values)
# Compute the recall for this fold.
recall_acc = recall_score(y_train_data.iloc[indices[1], :].values, y_pred_undersample)
recall_accs.append(recall_acc)
print("Iteration ", iteration, " : recall score = ", recall_acc)
# NOTE(review): DataFrame.ix was removed in pandas 1.0; .loc is the replacement.
results_table.ix[j, 'Mean recall score'] = np.mean(recall_accs)
j += 1
print('')
print('Mean recall score ', np.mean(recall_accs))
print('')
# Pick the C whose mean recall is highest.
best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
print('*********************************************************************************')
print('Best model to choose from cross validation is with C parameter = ', best_c)
print('*********************************************************************************')
return best_c
# Run the C search on the under-sampled training split; per the traceback
# below, this is the call during which the ValueError is raised.
best_c = printing_Kfold_scores(x_train_undersample, y_train_undersample)
Time V1 V2 V3 ... V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 ... 0.133558 -0.021053 149.62 0
1 0.0 1.191857 0.266151 0.166480 ... -0.008983 0.014724 2.69 0
2 1.0 -1.358354 -1.340163 1.773209 ... -0.055353 -0.059752 378.66 0
3 1.0 -0.966272 -0.185226 1.792993 ... 0.062723 0.061458 123.50 0
4 2.0 -1.158233 0.877737 1.548718 ... 0.219422 0.215153 69.99 0
[5 rows x 31 columns]
------------------------------------------------------------------
0 284315
1 492
Name: Class, dtype: int64
------------------------------------------------------------------
V1 V2 V3 ... V28 Class normAmount
0 -1.359807 -0.072781 2.536347 ... -0.021053 0 0.244964
1 1.191857 0.266151 0.166480 ... 0.014724 0 -0.342475
2 -1.358354 -1.340163 1.773209 ... -0.059752 0 1.160686
3 -0.966272 -0.185226 1.792993 ... 0.061458 0 0.140534
4 -1.158233 0.877737 1.548718 ... 0.215153 0 -0.073403
[5 rows x 30 columns]
------------------------------------------------------------------
Percentage of normal transactions: 0.5
Percentage of fraud transactions: 0.5
Total number of transactions in resampled data: 984
------------------------------------------------------------------
Number transactions train dataset: 199364
Number of transactions test dataset: 85443
Total number of transactions: 284807
------------------------------------------------------------------
Number transactions train dataset: 688
Number of transactions test dataset: 296
Total number of transactions: 984
------------------------------------------------------------------
------------------------------------------------------------------
C_parameter: 0.01
------------------------------------------------------------------
test--------------------------------------------------------------
[-1.86375555 3.44264398 -4.46825973 2.80533626 -2.11841248 -2.33228489
-4.2612372 1.70168184 -1.43939588 -6.99990663 6.31620968 -8.670818
0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
0.36481048 0.360924 0.66792657 -0.51624236 -0.01221781 0.0706137
0.05850447 0.30488284 0.41801247 0.20885828 -0.34923131]
[1]
------------------------------------------------------------------
Traceback (most recent call last):
File "F:/code/deeplearning/DeepLearning/logistic_regression/learning12_transaction_data_anomaly_detection.py", line 140, in <module>
best_c = printing_Kfold_scores(x_train_undersample, y_train_undersample)
File "F:/code/deeplearning/DeepLearning/logistic_regression/learning12_transaction_data_anomaly_detection.py", line 107, in printing_Kfold_scores
lr.fit(x_train_data.iloc[indices[0], :].values, y_train_data.iloc[indices[0], :].values.ravel())
File "C:\Users\lhw\python\lib\site-packages\sklearn\linear_model\_logistic.py", line 1346, in fit
accept_large_sparse=solver != 'liblinear')
File "C:\Users\lhw\python\lib\site-packages\sklearn\base.py", line 433, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 878, in check_X_y
estimator=estimator)
File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\lhw\python\lib\site-packages\sklearn\utils\validation.py", line 698, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[-1.86375555 3.44264398 -4.46825973 2.80533626 -2.11841248 -2.33228489
-4.2612372 1.70168184 -1.43939588 -6.99990663 6.31620968 -8.670818
0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
0.36481048 0.360924 0.66792657 -0.51624236 -0.01221781 0.0706137
0.05850447 0.30488284 0.41801247 0.20885828 -0.34923131].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
Process finished with exit code 1
打印x_train_data.iloc[indices[0], :].values显示如下
print(x_train_data.iloc[indices[0], :].values)
print(y_train_data.iloc[indices[0], :].values.ravel())
print(type(x_train_data.iloc[indices[0], :].values))
print(x_train_data.iloc[indices[0], :].values.shape)
[-1.86375555 3.44264398 -4.46825973 2.80533626 -2.11841248 -2.33228489
-4.2612372 1.70168184 -1.43939588 -6.99990663 6.31620968 -8.670818
0.31602399 -7.41771206 -0.43653747 -3.65280196 -6.29314532 -1.24324829
0.36481048 0.360924 0.66792657 -0.51624236 -0.01221781 0.0706137
0.05850447 0.30488284 0.41801247 0.20885828 -0.34923131]
[1]
<class 'numpy.ndarray'>
(29,)
照理是符合参数要求的,但是一直报错,想不明白为什么,萌新求助!
感谢社区大佬指点!使我明白了我错在哪里
终于修改完毕
事实上在这一步就有错:(一直没发现copy的时候copy错了,真是糊涂妈妈给糊涂开门,糊涂到家了(* ̄︿ ̄))
for iteration, indices in fold.split(y_train_data)
应该是
for iteration, indices in fold.split(x_train_data)
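然后对于fit函数参数也需要进行修改。补充说明:fold.split(...) 每次迭代产生一对 (训练集下标数组, 验证集下标数组),而且 KFold 只依据样本数进行划分,因此传入 x_train_data 还是 y_train_data 得到的下标是相同的;报错的直接原因是 indices[0] 只是一个整数,x_train_data.iloc[indices[0], :] 只取出了一行,得到形状为 (29,) 的一维数组。正确做法是直接用这两个下标数组做索引。此前一直没理解fit,在看过这篇文章后我明白了新旧方法间的差异: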
https://stackoverflow.com/questions/48641290/typeerror-kfold-object-is-not-iterable
以下是修改后的代码
# 信用卡交易数据异常检测问题
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
plt.ion()

# Load the raw transactions and inspect the class balance.
data = pd.read_csv("../data/creditcard.csv")
print(data.head())
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
count_classes = data["Class"].value_counts(sort=True).sort_index()
print("------------------------------------------------------------------")
print(count_classes)  # Class 0 (normal): 284315 | Class 1 (fraud): 492
count_classes.plot(kind="bar")
plt.title("Fraud class histogram")
plt.xlabel("Class")
plt.ylabel("Frequency")
plt.show()

# The classes are heavily imbalanced. Under-sampling shrinks the majority
# class down to the minority size; over-sampling grows the minority class
# instead. (The string below is the author's original note, kept verbatim.)
'''
样本数据不均衡,即一个数据极大,而另一个数据极小
下采样策略,使两种样本同样少
过采样策略,使两种样本同样多
'''

# Standardise Amount into a new column, then drop the raw Time/Amount columns.
data["normAmount"] = StandardScaler().fit_transform(data["Amount"].values.reshape(-1, 1))
data = data.drop(["Time", "Amount"], axis=1)
print("------------------------------------------------------------------")
print(data.head())
print("------------------------------------------------------------------")

# Features are every column except Class. The label is kept as a one-column
# DataFrame because printing_Kfold_scores indexes it with .iloc[rows, :].
x = data.loc[:, data.columns != "Class"]
y = data.loc[:, data.columns == "Class"]

# Fraud rows (Class == 1).
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
# Normal rows (Class == 0).
normal_indices = data[data.Class == 0].index

# Under-sample: draw as many normal rows as there are fraud rows, without
# replacement. No seed is set in this script, so the draw differs per run.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace=False)

# Combine both groups. These are index *labels* (taken from .index), so the
# rows are selected with .loc rather than positional .iloc.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]
x_undersample = under_sample_data.loc[:, under_sample_data.columns != "Class"]
y_undersample = under_sample_data.loc[:, under_sample_data.columns == "Class"]
print("Percentage of normal transactions: ",
      len(under_sample_data[under_sample_data.Class == 0]) / len(under_sample_data))
print("Percentage of fraud transactions: ",
      len(under_sample_data[under_sample_data.Class == 1]) / len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))

# test_size=0.3 keeps 30% of the rows for testing and 70% for training;
# random_state=0 makes the split itself reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
print("------------------------------------------------------------------")
print("Number transactions train dataset: ", len(x_train))
print("Number of transactions test dataset: ", len(x_test))
print("Total number of transactions: ", len(x_train) + len(x_test))
print("------------------------------------------------------------------")

# Same 70/30 split on the balanced, under-sampled data.
(x_train_undersample, x_test_undersample,
 y_train_undersample, y_test_undersample) = train_test_split(x_undersample,
                                                             y_undersample,
                                                             test_size=0.3,
                                                             random_state=0)
# Backward-compatible alias for the original (misspelled) variable name.
y_test_undertrainsample = y_test_undersample
print("Number transactions train dataset: ", len(x_train_undersample))
print("Number of transactions test dataset: ", len(x_test_undersample))
print("Total number of transactions: ", len(x_train_undersample) + len(x_test_undersample))
print("------------------------------------------------------------------")
# 召回率Recall = TP/(FN+TP)
from sklearn.linear_model import LogisticRegression
# KFold——做几倍的交叉验证——即将原始数据集切分数据集,cross_val_score交叉验证评估结果
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
# 逻辑回归模型
def printing_Kfold_scores(x_train_data, y_train_data):
    """Choose the LogisticRegression ``C`` with the best mean recall in 5-fold CV.

    For every candidate ``C`` an L1-penalised logistic regression is fitted on
    the training rows of each fold and scored with recall on that fold's
    validation rows. Per-fold scores, per-``C`` means and the final summary
    table are printed along the way.

    Args:
        x_train_data: pandas DataFrame of features, one row per sample.
        y_train_data: single-column pandas DataFrame of labels whose rows
            line up positionally with ``x_train_data``.

    Returns:
        The candidate ``C`` (as stored in the float ``C_parameter`` column)
        with the highest mean recall; on a tie the first such row wins.
    """
    fold = KFold(5, shuffle=False)  # five consecutive, unshuffled folds
    c_param_range = [0.01, 0.1, 1, 10, 100]  # candidate values for C

    # One row per candidate C. Building the table from the list directly
    # avoids the original empty index (range(5, 2)) and the object dtype
    # that forced an astype(float) before idxmax.
    results_table = pd.DataFrame({
        "C_parameter": c_param_range,
        "Mean recall score": np.nan,
    })

    for row, c_param in enumerate(c_param_range):
        print("------------------------------------------------------------------")
        print("C_parameter: ", c_param)
        print("------------------------------------------------------------------")
        print("")

        recall_accs = []
        # KFold.split yields (training positions, validation positions).
        for fold_no, (train_idx, valid_idx) in enumerate(fold.split(x_train_data), start=1):
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Fit and predict both on plain ndarrays so the estimator sees
            # the same input type in both calls.
            lr.fit(x_train_data.iloc[train_idx, :].values,
                   y_train_data.iloc[train_idx, :].values.ravel())
            y_pred_undersample = lr.predict(x_train_data.iloc[valid_idx, :].values)

            # Recall on this fold's validation rows.
            recall_acc = recall_score(y_train_data.iloc[valid_idx, :].values.ravel(),
                                      y_pred_undersample)
            recall_accs.append(recall_acc)
            print("Iteration ", fold_no, " : recall score = ", recall_acc)

        mean_recall = np.mean(recall_accs)
        results_table.loc[row, 'Mean recall score'] = mean_recall
        print()
        print('')
        print('Mean recall score ', mean_recall)
        print('')

    print("+++++++++++++++++++++++++++++++ recall score list +++++++++++++++++++++++++++++++")
    print(results_table)
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

    best_row = results_table['Mean recall score'].idxmax()
    best_c = results_table.loc[best_row, 'C_parameter']
    print('*********************************************************************************')
    print('Best model to choose from cross validation is with C parameter = ', best_c)
    print('*********************************************************************************')
    return best_c
# Run the C search on the under-sampled training split and keep the winning C.
best_c = printing_Kfold_scores(x_train_undersample, y_train_undersample)