随机森林等模型都是直接从 sklearn 库调用的;在参数和数据集完全相同的情况下,从某一天开始结果突然变得很差,排查了很久,仍然完全找不到原因。
def Preprocess(CSV1, CSV2, shuffle=True, SDAE=False, MMN=False, Smote=False):
    """Load a training CSV and a test CSV and optionally resample/scale them.

    In both files every column except the last one is taken as a feature
    and the last column is taken as the label.

    Args:
        CSV1: Training-set CSV, passed straight to ``pd.read_csv``.
        CSV2: Test-set CSV, passed straight to ``pd.read_csv``.
        shuffle: Accepted for backward compatibility; currently unused.
        SDAE: If true, both feature sets are passed through ``score.SDAE``
            and replaced by whatever it returns.
        MMN: If true, features are min-max scaled to ``[0, 1]``. The scaler
            is fitted on the training features only and then applied to the
            test features, so both sets share one scale. Test values outside
            the training range therefore fall outside ``[0, 1]``. Train and
            test files are expected to have the same feature columns.
        Smote: If true, the training set is oversampled with SMOTE
            (``random_state=10``); the test set is left untouched.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # First data set: features are all columns but the last, label is the last.
    train_df = pd.read_csv(CSV1)
    X_train = train_df.iloc[:, :-1]
    y_train = train_df.iloc[:, -1]

    # Second data set, same layout.
    test_df = pd.read_csv(CSV2)
    X_test = test_df.iloc[:, :-1]
    y_test = test_df.iloc[:, -1]

    if Smote:
        smo = SMOTE(sampling_strategy='auto', random_state=10)
        X_train, y_train = smo.fit_resample(X_train, y_train)

    if MMN:
        # Fit on train only. Scaling the test set with its own min/max would
        # map the same raw value to different scaled values in train and test.
        scaler = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    if SDAE:
        X_train, X_test = score.SDAE(X_train, X_test)

    return X_train, X_test, y_train, y_test
def metric_standards(y_test, y_predict, y_0=None, cal_weight=None):
    """Compute unweighted classification metrics for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_predict: Predicted labels.
        y_0: Optional scores for the positive class, used to build the ROC
            curve. When omitted, no AUC is computed.
        cal_weight: Accepted for backward compatibility; currently unused.

    Returns:
        The tuple ``(accuracy, precision, recall, f1, roc_auc)``.
        ``roc_auc`` is ``0`` when ``y_0`` is ``None``.
    """
    # Precision, recall and F1 use scikit-learn's default averaging, not
    # macro averaging, because no ``average=`` argument is passed.
    accuracy = metrics.accuracy_score(y_test, y_predict)
    precision = metrics.precision_score(y_test, y_predict, zero_division="warn")
    recall = metrics.recall_score(y_test, y_predict)
    f1 = metrics.f1_score(y_test, y_predict)

    if y_0 is not None:
        # AUC is the area under the ROC curve built from the scores.
        false_positive_rate, true_positive_rate, _ = metrics.roc_curve(y_test, y_0)
        roc_auc = metrics.auc(false_positive_rate, true_positive_rate)
    else:
        roc_auc = 0

    return accuracy, precision, recall, f1, roc_auc
def ranforest(X_train, X_test, y_train, y_test, n_estimators=100, random_state=66, n_jobs=-1):
    """Train a random forest and score it on the test set.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        n_estimators: Number of trees in the forest.
        random_state: Seed forwarded to the classifier so that repeated runs
            on the same data give the same model.
        n_jobs: Parallelism setting forwarded to the classifier.

    Returns:
        The tuple ``(accuracy, precision, recall, f1, roc_auc)`` produced by
        ``metric_standards``.
    """
    # n_estimators and random_state are forwarded here. Previously they were
    # only printed, so the forest was unseeded (different results on every
    # run) and the log line did not describe the model actually trained.
    cls = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=17,
        min_samples_split=43,
        min_samples_leaf=5,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    # Feature-selection hook, currently disabled: SeleFea(cls, X_train, y_train)
    cls.fit(X_train, y_train)

    y_pre_proba = cls.predict_proba(X_test)
    y_predict = cls.predict(X_test)
    # Column 1 of predict_proba is used as the score for the ROC curve.
    y_0 = list(y_pre_proba[:, 1])

    print('n_estimators = {}, random_state = {}'.format(n_estimators, random_state))
    accuracy, precision, recall, f1_score, roc_auc = metric_standards(y_test, y_predict, y_0)
    return accuracy, precision, recall, f1_score, roc_auc