import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')  # silence sklearn/seaborn warning noise for the demo

# Import required libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
classification_report, confusion_matrix, accuracy_score,
precision_score, recall_score, f1_score, roc_auc_score,
roc_curve, auc
)
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import time
from collections import Counter
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Seed the global RNG for reproducibility
np.random.seed(42)
class NetworkTrafficPreprocessor:
"""网络流量数据预处理器"""
def __init__(self):
self.scaler = StandardScaler()
self.label_encoders = {}
self.imputer = KNNImputer(n_neighbors=5)
self.feature_selector = None
self.pca = None
self.selected_features = None
def load_sample_data(self):
"""生成模拟的网络流量数据用于演示"""
logger.info("生成模拟网络流量数据...")
n_samples = 10000
np.random.seed(42)
        # Draw features from simple parametric distributions
data = {
'duration': np.random.exponential(1.0, n_samples),
'protocol_type': np.random.choice(['tcp', 'udp', 'icmp'], n_samples, p=[0.6, 0.3, 0.1]),
'service': np.random.choice(['http', 'ftp', 'ssh', 'smtp', 'dns'], n_samples),
'flag': np.random.choice(['SF', 'S0', 'REJ', 'RSTO', 'RSTR'], n_samples),
'src_bytes': np.random.lognormal(6, 2, n_samples),
'dst_bytes': np.random.lognormal(5, 2, n_samples),
'land': np.random.choice([0, 1], n_samples, p=[0.99, 0.01]),
'wrong_fragment': np.random.poisson(0.1, n_samples),
'urgent': np.random.poisson(0.01, n_samples),
'hot': np.random.poisson(1, n_samples),
'num_failed_logins': np.random.poisson(0.1, n_samples),
'logged_in': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
'num_compromised': np.random.poisson(0.05, n_samples),
'root_shell': np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
'su_attempted': np.random.choice([0, 1], n_samples, p=[0.98, 0.02]),
'num_root': np.random.poisson(0.1, n_samples),
'num_file_creations': np.random.poisson(0.05, n_samples),
'num_shells': np.random.poisson(0.01, n_samples),
'num_access_files': np.random.poisson(0.02, n_samples),
'num_outbound_cmds': np.random.poisson(0.001, n_samples),
'is_host_login': np.random.choice([0, 1], n_samples, p=[0.99, 0.01]),
'is_guest_login': np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
'count': np.random.poisson(10, n_samples),
'srv_count': np.random.poisson(5, n_samples),
'serror_rate': np.random.beta(1, 9, n_samples),
'srv_serror_rate': np.random.beta(1, 9, n_samples),
'rerror_rate': np.random.beta(1, 19, n_samples),
'srv_rerror_rate': np.random.beta(1, 19, n_samples),
'same_srv_rate': np.random.beta(9, 1, n_samples),
'diff_srv_rate': np.random.beta(1, 9, n_samples),
'srv_diff_host_rate': np.random.beta(1, 9, n_samples),
'dst_host_count': np.random.poisson(100, n_samples),
'dst_host_srv_count': np.random.poisson(50, n_samples),
'dst_host_same_srv_rate': np.random.beta(9, 1, n_samples),
'dst_host_diff_srv_rate': np.random.beta(1, 9, n_samples),
'dst_host_same_src_port_rate': np.random.beta(9, 1, n_samples),
'dst_host_srv_diff_host_rate': np.random.beta(1, 9, n_samples),
'dst_host_serror_rate': np.random.beta(1, 9, n_samples),
'dst_host_srv_serror_rate': np.random.beta(1, 9, n_samples),
'dst_host_rerror_rate': np.random.beta(1, 19, n_samples),
'dst_host_srv_rerror_rate': np.random.beta(1, 19, n_samples)
}
df = pd.DataFrame(data)
        # Generate labels (0 = normal, 1 = anomalous)
        # based on feature combinations that mark a sample as anomalous
anomaly_condition = (
(df['src_bytes'] > df['src_bytes'].quantile(0.99)) |
(df['num_failed_logins'] > 3) |
(df['serror_rate'] > 0.8) |
(df['same_srv_rate'] < 0.1)
)
df['label'] = np.where(anomaly_condition, 1, 0)
        # Ensure a minimum ratio of anomalous samples
normal_samples = df[df['label'] == 0]
anomaly_samples = df[df['label'] == 1]
if len(anomaly_samples) < len(normal_samples) * 0.2:
            # Upsample the anomalous class
anomaly_upsampled = resample(
anomaly_samples,
replace=True,
n_samples=int(len(normal_samples) * 0.2),
random_state=42
)
df = pd.concat([normal_samples, anomaly_upsampled])
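            # Caveat: upsampling before the train/test split puts copies of the same
            # rows in both sets, which inflates the test metrics; prefer resampling
            # the training fold only.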
logger.info(f"数据生成完成: {df.shape[0]} 个样本, {df.shape[1]} 个特征")
logger.info(f"类别分布: {Counter(df['label'])}")
return df
def handle_missing_values(self, df, threshold=0.3):
"""处理缺失值"""
logger.info("处理缺失值...")
# 记录缺失值情况
missing_stats = df.isnull().sum()
missing_percent = missing_stats / len(df) * 100
logger.info(f"缺失值统计:\n{pd.DataFrame({'缺失数量': missing_stats, '缺失比例%': missing_percent})[missing_stats > 0]}")
        # Drop columns whose missing ratio exceeds the threshold
cols_to_drop = missing_percent[missing_percent > threshold * 100].index.tolist()
if cols_to_drop:
logger.info(f"删除缺失值过多的列: {cols_to_drop}")
df = df.drop(columns=cols_to_drop)
        # Impute remaining missing values with KNN
if df.isnull().sum().sum() > 0:
logger.info("使用KNN插补处理剩余缺失值...")
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if numeric_cols:
imputer = KNNImputer(n_neighbors=5)
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
self.imputer = imputer
return df
def handle_outliers(self, df, method='iqr', threshold=3):
"""处理异常值"""
logger.info("处理异常值...")
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if 'label' in numeric_cols:
numeric_cols.remove('label')
outliers_count = 0
for col in numeric_cols:
            if method == 'iqr':
                # IQR fences
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
                outliers_count += outliers
                # Winsorize: clip to the fences instead of dropping rows
                df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
            elif method == 'zscore':
                # Z-score method
                mean = df[col].mean()
                std = df[col].std()
                z_scores = np.abs((df[col] - mean) / std)
                outliers = (z_scores > threshold).sum()
                outliers_count += outliers
                # Clip to mean ± threshold * std
                lower_bound = mean - threshold * std
                upper_bound = mean + threshold * std
                df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
logger.info(f"检测到并处理了 {outliers_count} 个异常值")
return df
def encode_categorical(self, df):
"""编码类别特征"""
logger.info("编码类别特征...")
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
if col not in self.label_encoders:
le = LabelEncoder()
df[col] = le.fit_transform(df[col].astype(str))
self.label_encoders[col] = le
logger.info(f"编码类别特征: {col} -> {len(le.classes_)} 个类别")
else:
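                # NOTE: LabelEncoder.transform raises ValueError for categories
                # unseen during fit; map or filter unknown values before this call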
df[col] = self.label_encoders[col].transform(df[col].astype(str))
return df
def engineer_features(self, df):
"""特征工程"""
logger.info("执行特征工程...")
# 创建流量比率特征
if 'src_bytes' in df.columns and 'dst_bytes' in df.columns:
            df['bytes_ratio'] = df['src_bytes'] / (df['dst_bytes'] + 1)  # +1 avoids division by zero
if 'count' in df.columns and 'srv_count' in df.columns:
df['count_ratio'] = df['count'] / (df['srv_count'] + 1)
        # Interaction features
if 'serror_rate' in df.columns and 'srv_serror_rate' in df.columns:
df['total_error_rate'] = df['serror_rate'] + df['srv_serror_rate']
if 'rerror_rate' in df.columns and 'srv_rerror_rate' in df.columns:
df['total_rerror_rate'] = df['rerror_rate'] + df['srv_rerror_rate']
        # Log transforms to tame heavy-tailed features
if 'duration' in df.columns:
df['duration_log'] = np.log1p(df['duration'])
if 'src_bytes' in df.columns:
df['src_bytes_log'] = np.log1p(df['src_bytes'])
        # Combined flag feature ('flag' is label-encoded by this point, so the sum
        # mixes arbitrary integer codes with the 'urgent' count)
        flag_cols = [col for col in df.columns if 'flag' in col.lower() or 'urgent' in col.lower()]
        if flag_cols:
            df['flag_sum'] = df[flag_cols].sum(axis=1)
logger.info(f"特征工程后特征数: {df.shape[1]}")
return df
def select_features(self, X, y, k=20, method='f_classif'):
"""特征选择"""
logger.info(f"使用 {method} 方法选择 {k} 个最佳特征...")
if method == 'f_classif':
selector = SelectKBest(score_func=f_classif, k=min(k, X.shape[1]))
elif method == 'mutual_info':
selector = SelectKBest(score_func=mutual_info_classif, k=min(k, X.shape[1]))
else:
selector = SelectKBest(score_func=f_classif, k=min(k, X.shape[1]))
X_selected = selector.fit_transform(X, y)
self.feature_selector = selector
self.selected_features = X.columns[selector.get_support()].tolist()
logger.info(f"选择的特征: {self.selected_features}")
return X_selected
def apply_pca(self, X, n_components=0.95):
"""应用PCA降维"""
logger.info(f"应用PCA降维,保留 {n_components * 100}% 方差...")
self.pca = PCA(n_components=n_components, random_state=42)
X_pca = self.pca.fit_transform(X)
logger.info(f"PCA降维后维度: {X_pca.shape[1]} (原始: {X.shape[1]})")
logger.info(f"解释方差比: {self.pca.explained_variance_ratio_.sum():.4f}")
return X_pca
def preprocess_pipeline(self, df, use_pca=False, n_pca_components=0.95):
"""完整的预处理流程"""
logger.info("开始预处理流程...")
# 1. 处理缺失值
df_clean = self.handle_missing_values(df.copy())
        # 2. Handle outliers
df_clean = self.handle_outliers(df_clean)
        # 3. Encode categorical features
df_clean = self.encode_categorical(df_clean)
        # 4. Feature engineering
df_clean = self.engineer_features(df_clean)
        # Split features and label
if 'label' in df_clean.columns:
X = df_clean.drop('label', axis=1)
y = df_clean['label']
else:
X = df_clean
y = None
        # 5. Feature selection
if y is not None:
X_selected = self.select_features(X, y, k=min(20, X.shape[1]))
else:
X_selected = X.values
        # 6. Standardization
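        # CAVEAT: the selector (step 5) and this scaler are fit on the full dataset
        # before the train/test split, so test-set statistics leak into preprocessing;
        # see the leakage-free sketch after this class.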
X_scaled = self.scaler.fit_transform(X_selected)
        # 7. Optional PCA
if use_pca and X_scaled.shape[1] > 10:
X_final = self.apply_pca(X_scaled, n_pca_components)
else:
X_final = X_scaled
logger.info(f"预处理完成. 最终特征维度: {X_final.shape}")
if y is not None:
return X_final, y
else:
return X_final
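
# The pipeline above fits SelectKBest and StandardScaler on the full dataset before
# splitting, which leaks test statistics into preprocessing. A minimal leakage-free
# sketch, assuming the same components (the helper name is illustrative and is not
# called elsewhere in this script):
def split_then_fit(X, y, k=20, test_size=0.2, seed=42):
    """Split first, then fit the selector and scaler on the training fold only."""
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed, stratify=y
    )
    selector = SelectKBest(score_func=f_classif, k=min(k, X_train.shape[1]))
    X_train_sel = selector.fit_transform(X_train, y_train)  # fit on train only
    X_test_sel = selector.transform(X_test)                 # reuse train statistics
    scaler = StandardScaler().fit(X_train_sel)              # same rule for scaling
    return scaler.transform(X_train_sel), scaler.transform(X_test_sel), y_train, y_test
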
class NetworkAnomalyDetector:
"""网络异常检测器"""
def __init__(self):
self.models = {}
self.results = {}
self.preprocessor = NetworkTrafficPreprocessor()
def load_and_preprocess_data(self):
"""加载并预处理数据"""
logger.info("加载和预处理数据...")
# 生成模拟数据
df = self.preprocessor.load_sample_data()
        # Preprocess
X, y = self.preprocessor.preprocess_pipeline(df, use_pca=False)
        # Train/test split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
        # Store the splits on the instance so main() can reuse them later;
        # without this, hasattr(detector, 'X_test') is False and main() crashes
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test
        # Sanity check: features and labels must stay aligned
        assert len(X_train) == len(y_train), "Train features/labels have different lengths"
        assert len(X_test) == len(y_test), "Test features/labels have different lengths"
logger.info(f"数据划分: 训练集={X_train.shape}, 测试集={X_test.shape}")
logger.info(f"训练集类别分布: {Counter(y_train)}")
logger.info(f"测试集类别分布: {Counter(y_test)}")
return X_train, X_test, y_train, y_test
def train_models(self, X_train, y_train):
"""训练多个模型"""
logger.info("训练模型...")
# 定义模型
self.models = {
'Random Forest': RandomForestClassifier(
n_estimators=100,
max_depth=5,
random_state=42,
n_jobs=-1,
                min_samples_split=5,
bootstrap=True,
oob_score=True
),
'Gradient Boosting': GradientBoostingClassifier(
n_estimators=100,
learning_rate=0.1,
max_depth=5,
random_state=42
),
'SVM': SVC(
kernel='rbf',
C=1.0,
gamma='scale',
probability=True,
random_state=42
),
'Logistic Regression': LogisticRegression(
max_iter=1000,
random_state=42,
n_jobs=-1
),
'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
}
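        # NOTE: with the roughly 5:1 class imbalance generated above, passing
        # class_weight='balanced' to RandomForestClassifier, SVC, and
        # LogisticRegression is a cheap alternative worth comparing.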
        # Train each model, recording wall-clock time for the comparison plot
        self.training_times = {}
        for name, model in self.models.items():
            logger.info(f"Training {name}...")
            start_time = time.time()
            model.fit(X_train, y_train)
            training_time = time.time() - start_time
            self.training_times[name] = training_time
            logger.info(f"{name} trained in {training_time:.2f}s")
def evaluate_models(self, X_test, y_test):
"""评估模型性能"""
logger.info("评估模型性能...")
self.results = {}
for name, model in self.models.items():
logger.info(f"评估 {name}...")
# 预测
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
            # Core metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
            # AUC (only for models exposing probability estimates)
            if y_pred_proba is not None:
                try:
                    auc_score = roc_auc_score(y_test, y_pred_proba)
                except ValueError:
                    # e.g. only one class present in y_test
                    auc_score = None
            else:
                auc_score = None
            # Store results
self.results[name] = {
'model': model,
'y_pred': y_pred,
'y_pred_proba': y_pred_proba,
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1': f1,
'auc': auc_score,
'confusion_matrix': confusion_matrix(y_test, y_pred)
}
logger.info(f"{name} 性能: Accuracy={accuracy:.4f}, Precision={precision:.4f}, "
f"Recall={recall:.4f}, F1={f1:.4f}, AUC={auc_score if auc_score else 'N/A'}")
return self.results
def visualize_results(self, y_test, results):
"""可视化结果"""
logger.info("生成可视化结果...")
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Network Traffic Anomaly Detection: Model Comparison', fontsize=16, fontweight='bold')
        # 1. Bar chart comparing model performance
models = list(results.keys())
accuracies = [results[m]['accuracy'] for m in models]
f1_scores = [results[m]['f1'] for m in models]
x = np.arange(len(models))
width = 0.35
ax1 = axes[0, 0]
        bars1 = ax1.bar(x - width / 2, accuracies, width, label='Accuracy', color='skyblue')
        bars2 = ax1.bar(x + width / 2, f1_scores, width, label='F1 score', color='lightcoral')
        ax1.set_xlabel('Model')
        ax1.set_ylabel('Score')
        ax1.set_title('Model performance')
ax1.set_xticks(x)
ax1.set_xticklabels(models, rotation=45)
ax1.legend()
ax1.grid(True, alpha=0.3)
        # Annotate bars with their values
for bars in [bars1, bars2]:
for bar in bars:
height = bar.get_height()
ax1.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
f'{height:.3f}', ha='center', va='bottom', fontsize=9)
        # 2. Confusion-matrix heatmap (best model only)
best_model = max(results.keys(), key=lambda m: results[m]['f1'])
cm = results[best_model]['confusion_matrix']
ax2 = axes[0, 1]
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
                    xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
        ax2.set_title(f'{best_model} confusion matrix')
        ax2.set_xlabel('Predicted label')
        ax2.set_ylabel('True label')
        # 3. ROC curves
ax3 = axes[0, 2]
for name, result in results.items():
if result['y_pred_proba'] is not None:
fpr, tpr, _ = roc_curve(y_test, result['y_pred_proba'])
roc_auc = auc(fpr, tpr)
ax3.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')
ax3.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax3.set_xlim([0.0, 1.0])
ax3.set_ylim([0.0, 1.05])
        ax3.set_xlabel('False positive rate')
        ax3.set_ylabel('True positive rate')
        ax3.set_title('ROC curves')
ax3.legend(loc="lower right")
ax3.grid(True, alpha=0.3)
        # 4. Feature importances (Random Forest)
if 'Random Forest' in self.models:
rf_model = self.models['Random Forest']
if hasattr(rf_model, 'feature_importances_'):
                if self.preprocessor.selected_features:
                    feature_names = self.preprocessor.selected_features
                else:
                    # X_test is not in scope here; fall back to the fitted feature count
                    feature_names = [f'Feature_{i}' for i in range(rf_model.n_features_in_)]
                importances = rf_model.feature_importances_
                indices = np.argsort(importances)[-10:]  # top 10 only
ax4 = axes[1, 0]
ax4.barh(range(len(indices)), importances[indices], color='steelblue')
ax4.set_yticks(range(len(indices)))
if self.preprocessor.selected_features and len(feature_names) > max(indices):
ax4.set_yticklabels([feature_names[i] for i in indices])
else:
ax4.set_yticklabels([f'Feature {i}' for i in indices])
                ax4.set_xlabel('Feature importance')
                ax4.set_title('Random Forest feature importances (top 10)')
        # 5. Training-time comparison (uses the wall-clock times recorded in train_models)
        ax5 = axes[1, 1]
        training_times = [getattr(self, 'training_times', {}).get(m, 0.0) for m in models]
        ax5.bar(models, training_times, color=['skyblue', 'lightgreen', 'lightcoral', 'gold', 'violet'])
        ax5.set_xlabel('Model')
        ax5.set_ylabel('Training time (s)')
        ax5.set_title('Training time comparison')
        ax5.set_xticks(x)
        ax5.set_xticklabels(models, rotation=45)
        ax5.grid(True, alpha=0.3)
        # 6. Detailed metrics table
ax6 = axes[1, 2]
ax6.axis('tight')
ax6.axis('off')
        # Build table rows
table_data = []
for name, result in results.items():
table_data.append([
name,
f"{result['accuracy']:.4f}",
f"{result['precision']:.4f}",
f"{result['recall']:.4f}",
f"{result['f1']:.4f}",
f"{result['auc']:.4f}" if result['auc'] else 'N/A'
])
table = ax6.table(
cellText=table_data,
            colLabels=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC'],
cellLoc='center',
loc='center',
colWidths=[0.15, 0.12, 0.12, 0.12, 0.12, 0.12]
)
table.auto_set_font_size(False)
table.set_fontsize(9)
table.scale(1, 1.5)
        ax6.set_title('Detailed metrics')
plt.tight_layout()
plt.savefig('network_anomaly_detection_results.png', dpi=300, bbox_inches='tight')
logger.info("结果已保存到 network_anomaly_detection_results.png")
plt.show()
def run_detection_pipeline(self):
"""运行完整的检测流程"""
logger.info("=" * 60)
logger.info("开始网络流量异常检测流程")
logger.info("=" * 60)
# 1. 加载和预处理数据
X_train, X_test, y_train, y_test = self.load_and_preprocess_data()
        # 2. Train models
self.train_models(X_train, y_train)
        # 3. Evaluate models
results = self.evaluate_models(X_test, y_test)
        # 4. Visualize results
self.visualize_results(y_test, results)
        # 5. Report the best model
best_model = max(results.keys(), key=lambda m: results[m]['f1'])
logger.info("=" * 60)
logger.info(f"最佳模型: {best_model}")
logger.info(f"最佳F1分数: {results[best_model]['f1']:.4f}")
logger.info(f"最佳准确率: {results[best_model]['accuracy']:.4f}")
logger.info("=" * 60)
return results
def main():
"""主函数"""
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
    # Build the detector and run the pipeline
detector = NetworkAnomalyDetector()
results = detector.run_detection_pipeline()
    # Save results to a text file
    with open('network_detection_results.txt', 'w', encoding='utf-8') as f:
        # Classification report for the best model
        f.write("=" * 60 + "\n")
        f.write("Detailed classification report (best model):\n")
        f.write("=" * 60 + "\n\n")
        best_model = max(results.keys(), key=lambda m: results[m]['f1'])
        # Reuse the labels stored on the detector and the cached predictions;
        # calling predict() again (with None!) is what triggered the traceback below
        y_pred = results[best_model]['y_pred']
        report = classification_report(detector.y_test, y_pred, target_names=['Normal', 'Anomaly'])
        f.write(f"\n{best_model} classification report:\n")
        f.write(report)
        # Per-model performance metrics
        f.write("\n\nModel performance metrics:\n")
        f.write("=" * 60 + "\n")
        for name, metrics in results.items():
            auc_str = f"{metrics['auc']:.4f}" if metrics['auc'] is not None else 'N/A'
            f.write(f"{name}:\n")
            f.write(f"  Accuracy:  {metrics['accuracy']:.4f}\n")
            f.write(f"  Precision: {metrics['precision']:.4f}\n")
            f.write(f"  Recall:    {metrics['recall']:.4f}\n")
            f.write(f"  F1 score:  {metrics['f1']:.4f}\n")
            f.write(f"  AUC:       {auc_str}\n")
            f.write("-" * 60 + "\n")
    # The figure was already saved inside visualize_results(); saving again after
    # plt.show() would write out a blank canvas, so only log the artifact paths here.
    logger.info("Results saved to network_detection_results.txt and network_anomaly_detection_results.png")
if __name__ == "__main__":
main()
# Run output
F:\pycharm\web1\.venv\Scripts\python.exe F:\pycharm\web1\web.py
2026-02-08 10:56:35,176 - INFO - ============================================================
2026-02-08 10:56:35,176 - INFO - Starting the network traffic anomaly detection pipeline
2026-02-08 10:56:35,176 - INFO - ============================================================
2026-02-08 10:56:35,176 - INFO - Loading and preprocessing data...
2026-02-08 10:56:35,176 - INFO - Generating synthetic network traffic data...
2026-02-08 10:56:35,227 - INFO - Data generated: 11880 samples, 42 features
2026-02-08 10:56:35,230 - INFO - Class distribution: Counter({0: 9900, 1: 1980})
2026-02-08 10:56:35,231 - INFO - Starting preprocessing pipeline...
2026-02-08 10:56:35,236 - INFO - Handling missing values...
2026-02-08 10:56:35,253 - INFO - Missing-value summary:
Empty DataFrame
Columns: [missing_count, missing_pct]
Index: []
2026-02-08 10:56:35,256 - INFO - Handling outliers...
2026-02-08 10:56:35,351 - INFO - Detected and winsorized 17882 outlier values
2026-02-08 10:56:35,351 - INFO - Encoding categorical features...
2026-02-08 10:56:35,354 - INFO - Encoded categorical feature: protocol_type -> 3 classes
2026-02-08 10:56:35,357 - INFO - Encoded categorical feature: service -> 5 classes
2026-02-08 10:56:35,360 - INFO - Encoded categorical feature: flag -> 5 classes
2026-02-08 10:56:35,360 - INFO - Running feature engineering...
2026-02-08 10:56:35,366 - INFO - Feature count after engineering: 49
2026-02-08 10:56:35,371 - INFO - Selecting the 20 best features with f_classif...
2026-02-08 10:56:35,392 - INFO - Selected features: ['duration', 'service', 'flag', 'src_bytes', 'dst_bytes', 'serror_rate', 'srv_serror_rate', 'srv_rerror_rate', 'same_srv_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_srv_rerror_rate', 'bytes_ratio', 'total_error_rate', 'total_rerror_rate', 'duration_log', 'src_bytes_log', 'flag_sum']
2026-02-08 10:56:35,397 - INFO - Preprocessing complete. Final feature shape: (11880, 20)
2026-02-08 10:56:35,405 - INFO - Data split: train=(9504, 20), test=(2376, 20)
2026-02-08 10:56:35,407 - INFO - Train class distribution: Counter({0: 7920, 1: 1584})
2026-02-08 10:56:35,408 - INFO - Test class distribution: Counter({0: 1980, 1: 396})
2026-02-08 10:56:35,409 - INFO - Training models...
2026-02-08 10:56:35,410 - INFO - Training Random Forest...
2026-02-08 10:56:35,901 - INFO - Random Forest trained in 0.49s
2026-02-08 10:56:35,901 - INFO - Training Gradient Boosting...
2026-02-08 10:56:44,362 - INFO - Gradient Boosting trained in 8.46s
2026-02-08 10:56:44,362 - INFO - Training SVM...
2026-02-08 10:56:46,921 - INFO - SVM trained in 2.56s
2026-02-08 10:56:46,921 - INFO - Training Logistic Regression...
2026-02-08 10:56:49,474 - INFO - Logistic Regression trained in 2.55s
2026-02-08 10:56:49,475 - INFO - Training KNN...
2026-02-08 10:56:49,476 - INFO - KNN trained in 0.00s
2026-02-08 10:56:49,476 - INFO - Evaluating model performance...
2026-02-08 10:56:49,476 - INFO - Evaluating Random Forest...
2026-02-08 10:56:49,551 - INFO - Random Forest: Accuracy=0.9756, Precision=0.9787, Recall=0.9756, F1=0.9762, AUC=0.9987
2026-02-08 10:56:49,552 - INFO - Evaluating Gradient Boosting...
2026-02-08 10:56:49,574 - INFO - Gradient Boosting: Accuracy=0.9870, Precision=0.9879, Recall=0.9870, F1=0.9871, AUC=0.9997
2026-02-08 10:56:49,574 - INFO - Evaluating SVM...
2026-02-08 10:56:50,005 - INFO - SVM: Accuracy=0.9857, Precision=0.9868, Recall=0.9857, F1=0.9859, AUC=0.9975
2026-02-08 10:56:50,005 - INFO - Evaluating Logistic Regression...
2026-02-08 10:56:50,020 - INFO - Logistic Regression: Accuracy=0.9705, Precision=0.9750, Recall=0.9705, F1=0.9715, AUC=0.9862
2026-02-08 10:56:50,020 - INFO - Evaluating KNN...
2026-02-08 10:56:52,684 - INFO - KNN: Accuracy=0.9798, Precision=0.9820, Recall=0.9798, F1=0.9803, AUC=0.9944
2026-02-08 10:56:52,684 - INFO - Generating visualizations...
2026-02-08 10:56:57,282 - INFO - Figure saved to network_anomaly_detection_results.png
2026-02-08 10:57:03,609 - INFO - ============================================================
2026-02-08 10:57:03,609 - INFO - Best model: Gradient Boosting
2026-02-08 10:57:03,609 - INFO - Best F1 score: 0.9871
2026-02-08 10:57:03,609 - INFO - Best accuracy: 0.9870
2026-02-08 10:57:03,609 - INFO - ============================================================
Traceback (most recent call last):
File "F:\pycharm\web1\web.py", line 682, in <module>
main()
File "F:\pycharm\web1\web.py", line 642, in main
y_test = detector.models[best_model].predict(detector.X_test if hasattr(detector, 'X_test') else None)
File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\ensemble\_gb.py", line 1627, in predict
raw_predictions = self.decision_function(X)
File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\ensemble\_gb.py", line 1580, in decision_function
X = validate_data(
File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\utils\validation.py", line 2954, in validate_data
out = check_array(X, input_name="X", **check_params)
File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\utils\validation.py", line 1068, in check_array
raise ValueError(
ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
How do I fix this?
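
Answer: the ValueError comes from main() effectively calling predict(None). load_and_preprocess_data() returned the train/test splits but never stored them on the detector, so hasattr(detector, 'X_test') was False and detector.models[best_model].predict(...) received None, which sklearn's input validation rejects ("Expected 2D array, got scalar array instead"). A minimal sketch of the fix, matching the class layout above:

# In load_and_preprocess_data(), keep the splits on the instance:
self.X_train, self.X_test = X_train, X_test
self.y_train, self.y_test = y_train, y_test

# In main(), drop the extra predict() call; evaluate_models() already cached
# the test predictions, so reuse them together with the stored labels:
y_pred = results[best_model]['y_pred']
report = classification_report(detector.y_test, y_pred, target_names=['Normal', 'Anomaly'])

Note also that the failing line assigned the prediction to a variable named y_test; even with the attribute fix, that would have shadowed the true labels. Reusing results[best_model]['y_pred'] avoids recomputing anything.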