licungeng 2026-02-08 11:04 Acceptance rate: 33.3%
Views: 2

In this machine learning code, the input data format is wrong at the model prediction stage — how do I fix it?


import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

# Import the required libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, IsolationForest
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, auc
)
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns
import time
from collections import Counter
from scipy.stats import pearsonr, chi2_contingency
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Set the random seed for reproducibility
np.random.seed(42)


class NetworkTrafficPreprocessor:
    """网络流量数据预处理器"""

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.imputer = KNNImputer(n_neighbors=5)
        self.feature_selector = None
        self.pca = None
        self.selected_features = None

    def load_sample_data(self):
        """生成模拟的网络流量数据用于演示"""
        logger.info("生成模拟网络流量数据...")

        n_samples = 10000
        np.random.seed(42)

        # Generate simulated features
        data = {
            'duration': np.random.exponential(1.0, n_samples),
            'protocol_type': np.random.choice(['tcp', 'udp', 'icmp'], n_samples, p=[0.6, 0.3, 0.1]),
            'service': np.random.choice(['http', 'ftp', 'ssh', 'smtp', 'dns'], n_samples),
            'flag': np.random.choice(['SF', 'S0', 'REJ', 'RSTO', 'RSTR'], n_samples),
            'src_bytes': np.random.lognormal(6, 2, n_samples),
            'dst_bytes': np.random.lognormal(5, 2, n_samples),
            'land': np.random.choice([0, 1], n_samples, p=[0.99, 0.01]),
            'wrong_fragment': np.random.poisson(0.1, n_samples),
            'urgent': np.random.poisson(0.01, n_samples),
            'hot': np.random.poisson(1, n_samples),
            'num_failed_logins': np.random.poisson(0.1, n_samples),
            'logged_in': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),
            'num_compromised': np.random.poisson(0.05, n_samples),
            'root_shell': np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
            'su_attempted': np.random.choice([0, 1], n_samples, p=[0.98, 0.02]),
            'num_root': np.random.poisson(0.1, n_samples),
            'num_file_creations': np.random.poisson(0.05, n_samples),
            'num_shells': np.random.poisson(0.01, n_samples),
            'num_access_files': np.random.poisson(0.02, n_samples),
            'num_outbound_cmds': np.random.poisson(0.001, n_samples),
            'is_host_login': np.random.choice([0, 1], n_samples, p=[0.99, 0.01]),
            'is_guest_login': np.random.choice([0, 1], n_samples, p=[0.95, 0.05]),
            'count': np.random.poisson(10, n_samples),
            'srv_count': np.random.poisson(5, n_samples),
            'serror_rate': np.random.beta(1, 9, n_samples),
            'srv_serror_rate': np.random.beta(1, 9, n_samples),
            'rerror_rate': np.random.beta(1, 19, n_samples),
            'srv_rerror_rate': np.random.beta(1, 19, n_samples),
            'same_srv_rate': np.random.beta(9, 1, n_samples),
            'diff_srv_rate': np.random.beta(1, 9, n_samples),
            'srv_diff_host_rate': np.random.beta(1, 9, n_samples),
            'dst_host_count': np.random.poisson(100, n_samples),
            'dst_host_srv_count': np.random.poisson(50, n_samples),
            'dst_host_same_srv_rate': np.random.beta(9, 1, n_samples),
            'dst_host_diff_srv_rate': np.random.beta(1, 9, n_samples),
            'dst_host_same_src_port_rate': np.random.beta(9, 1, n_samples),
            'dst_host_srv_diff_host_rate': np.random.beta(1, 9, n_samples),
            'dst_host_serror_rate': np.random.beta(1, 9, n_samples),
            'dst_host_srv_serror_rate': np.random.beta(1, 9, n_samples),
            'dst_host_rerror_rate': np.random.beta(1, 19, n_samples),
            'dst_host_srv_rerror_rate': np.random.beta(1, 19, n_samples)
        }

        df = pd.DataFrame(data)

        # Generate labels (0 = normal, 1 = anomalous)
        # Create anomalous samples based on combinations of feature values
        anomaly_condition = (
                (df['src_bytes'] > df['src_bytes'].quantile(0.99)) |
                (df['num_failed_logins'] > 3) |
                (df['serror_rate'] > 0.8) |
                (df['same_srv_rate'] < 0.1)
        )

        df['label'] = np.where(anomaly_condition, 1, 0)

        # Keep the classes from being too imbalanced
        normal_samples = df[df['label'] == 0]
        anomaly_samples = df[df['label'] == 1]

        if len(anomaly_samples) < len(normal_samples) * 0.2:
            # Upsample the anomalous class
            anomaly_upsampled = resample(
                anomaly_samples,
                replace=True,
                n_samples=int(len(normal_samples) * 0.2),
                random_state=42
            )
            df = pd.concat([normal_samples, anomaly_upsampled])

        logger.info(f"数据生成完成: {df.shape[0]} 个样本, {df.shape[1]} 个特征")
        logger.info(f"类别分布: {Counter(df['label'])}")

        return df

    def handle_missing_values(self, df, threshold=0.3):
        """处理缺失值"""
        logger.info("处理缺失值...")

        # Record missing value statistics
        missing_stats = df.isnull().sum()
        missing_percent = missing_stats / len(df) * 100

        logger.info(f"缺失值统计:\n{pd.DataFrame({'缺失数量': missing_stats, '缺失比例%': missing_percent})[missing_stats > 0]}")

        # Drop columns whose missing ratio is too high
        cols_to_drop = missing_percent[missing_percent > threshold * 100].index.tolist()
        if cols_to_drop:
            logger.info(f"删除缺失值过多的列: {cols_to_drop}")
            df = df.drop(columns=cols_to_drop)

        # Impute the remaining missing values with KNN
        if df.isnull().sum().sum() > 0:
            logger.info("使用KNN插补处理剩余缺失值...")
            numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

            if numeric_cols:
                imputer = KNNImputer(n_neighbors=5)
                df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
                self.imputer = imputer

        return df

    def handle_outliers(self, df, method='iqr', threshold=3):
        """处理异常值"""
        logger.info("处理异常值...")

        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

        if 'label' in numeric_cols:
            numeric_cols.remove('label')

        outliers_count = 0

        for col in numeric_cols:
            if method == 'iqr':
                # IQR method
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR

                outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
                outliers_count += outliers

                # Winsorize (clip to the bounds)
                df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

            elif method == 'zscore':
                # Z-score method
                mean = df[col].mean()
                std = df[col].std()
                z_scores = np.abs((df[col] - mean) / std)

                outliers = (z_scores > threshold).sum()
                outliers_count += outliers

                # Clip to the boundary values
                lower_bound = mean - threshold * std
                upper_bound = mean + threshold * std
                df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

        logger.info(f"检测到并处理了 {outliers_count} 个异常值")
        return df

    def encode_categorical(self, df):
        """编码类别特征"""
        logger.info("编码类别特征...")

        categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

        for col in categorical_cols:
            if col not in self.label_encoders:
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))
                self.label_encoders[col] = le
                logger.info(f"编码类别特征: {col} -> {len(le.classes_)} 个类别")
            else:
                df[col] = self.label_encoders[col].transform(df[col].astype(str))

        return df

    def engineer_features(self, df):
        """特征工程"""
        logger.info("执行特征工程...")

        # Create traffic ratio features
        if 'src_bytes' in df.columns and 'dst_bytes' in df.columns:
            df['bytes_ratio'] = df['src_bytes'] / (df['dst_bytes'] + 1)  # avoid division by zero

        if 'count' in df.columns and 'srv_count' in df.columns:
            df['count_ratio'] = df['count'] / (df['srv_count'] + 1)

        # Create interaction features
        if 'serror_rate' in df.columns and 'srv_serror_rate' in df.columns:
            df['total_error_rate'] = df['serror_rate'] + df['srv_serror_rate']

        if 'rerror_rate' in df.columns and 'srv_rerror_rate' in df.columns:
            df['total_rerror_rate'] = df['rerror_rate'] + df['srv_rerror_rate']

        # Create statistical features
        if 'duration' in df.columns:
            df['duration_log'] = np.log1p(df['duration'])

        if 'src_bytes' in df.columns:
            df['src_bytes_log'] = np.log1p(df['src_bytes'])

        # Create combined flag features
        flag_cols = [col for col in df.columns if 'flag' in col.lower() or 'urgent' in col.lower()]
        if flag_cols:
            df['flag_sum'] = df[flag_cols].sum(axis=1)

        logger.info(f"特征工程后特征数: {df.shape[1]}")
        return df

    def select_features(self, X, y, k=20, method='f_classif'):
        """特征选择"""
        logger.info(f"使用 {method} 方法选择 {k} 个最佳特征...")

        if method == 'f_classif':
            selector = SelectKBest(score_func=f_classif, k=min(k, X.shape[1]))
        elif method == 'mutual_info':
            selector = SelectKBest(score_func=mutual_info_classif, k=min(k, X.shape[1]))
        else:
            selector = SelectKBest(score_func=f_classif, k=min(k, X.shape[1]))

        X_selected = selector.fit_transform(X, y)
        self.feature_selector = selector
        self.selected_features = X.columns[selector.get_support()].tolist()

        logger.info(f"选择的特征: {self.selected_features}")
        return X_selected

    def apply_pca(self, X, n_components=0.95):
        """应用PCA降维"""
        logger.info(f"应用PCA降维,保留 {n_components * 100}% 方差...")

        self.pca = PCA(n_components=n_components, random_state=42)
        X_pca = self.pca.fit_transform(X)

        logger.info(f"PCA降维后维度: {X_pca.shape[1]} (原始: {X.shape[1]})")
        logger.info(f"解释方差比: {self.pca.explained_variance_ratio_.sum():.4f}")

        return X_pca

    def preprocess_pipeline(self, df, use_pca=False, n_pca_components=0.95):
        """完整的预处理流程"""
        logger.info("开始预处理流程...")

        # 1. Handle missing values
        df_clean = self.handle_missing_values(df.copy())

        # 2. Handle outliers
        df_clean = self.handle_outliers(df_clean)

        # 3. Encode categorical features
        df_clean = self.encode_categorical(df_clean)

        # 4. Feature engineering
        df_clean = self.engineer_features(df_clean)

        # Separate features and label
        if 'label' in df_clean.columns:
            X = df_clean.drop('label', axis=1)
            y = df_clean['label']
        else:
            X = df_clean
            y = None

        # 5. Feature selection
        if y is not None:
            X_selected = self.select_features(X, y, k=min(20, X.shape[1]))
        else:
            X_selected = X.values

        # 6. Standardize
        X_scaled = self.scaler.fit_transform(X_selected)

        # 7. PCA (optional)
        if use_pca and X_scaled.shape[1] > 10:
            X_final = self.apply_pca(X_scaled, n_pca_components)
        else:
            X_final = X_scaled

        logger.info(f"预处理完成. 最终特征维度: {X_final.shape}")

        if y is not None:
            return X_final, y
        else:
            return X_final


class NetworkAnomalyDetector:
    """网络异常检测器"""

    def __init__(self):
        self.models = {}
        self.results = {}
        self.preprocessor = NetworkTrafficPreprocessor()

    def load_and_preprocess_data(self):
        """加载并预处理数据"""
        logger.info("加载和预处理数据...")

        # Generate simulated data
        df = self.preprocessor.load_sample_data()

        # Preprocess
        X, y = self.preprocessor.preprocess_pipeline(df, use_pca=False)

        # Split into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        # # The split data must be explicitly assigned to the detector object
        # detector.X_train = X_train
        # detector.X_test = X_test
        # detector.y_train = y_train
        # detector.y_test = y_test
        # detector.X_train = scaler.fit_transform(detector.X_train)  # fit only on the training set
        # detector.X_test = scaler.transform(detector.X_test)  # transform the test set
        # Make sure the sample counts are consistent
        assert len(X_train) == len(y_train), "Training features and labels have different sample counts"
        assert len(X_test) == len(y_test), "Test features and labels have different sample counts"

        logger.info(f"数据划分: 训练集={X_train.shape}, 测试集={X_test.shape}")
        logger.info(f"训练集类别分布: {Counter(y_train)}")
        logger.info(f"测试集类别分布: {Counter(y_test)}")

        return X_train, X_test, y_train, y_test

    def train_models(self, X_train, y_train):
        """训练多个模型"""
        logger.info("训练模型...")

        # Define the models
        self.models = {
            'Random Forest': RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                random_state=42,
                n_jobs=-1,
                min_samples_split=5,
                bootstrap=True,
                oob_score=True
            ),
            'Gradient Boosting': GradientBoostingClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                random_state=42
            ),
            'SVM': SVC(
                kernel='rbf',
                C=1.0,
                gamma='scale',
                probability=True,
                random_state=42
            ),
            'Logistic Regression': LogisticRegression(
                max_iter=1000,
                random_state=42,
                n_jobs=-1
            ),
            'KNN': KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
        }

        # Train each model
        for name, model in self.models.items():
            logger.info(f"训练 {name}...")
            start_time = time.time()
            model.fit(X_train, y_train)
            training_time = time.time() - start_time
            logger.info(f"{name} 训练完成,耗时: {training_time:.2f}秒")

    def evaluate_models(self, X_test, y_test):
        """评估模型性能"""
        logger.info("评估模型性能...")

        self.results = {}

        for name, model in self.models.items():
            logger.info(f"评估 {name}...")

            # Predict
            y_pred = model.predict(X_test)
            y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

            # Compute metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted')
            recall = recall_score(y_test, y_pred, average='weighted')
            f1 = f1_score(y_test, y_pred, average='weighted')

            # Compute AUC (if the model supports probability predictions)
            if y_pred_proba is not None:
                try:
                    auc_score = roc_auc_score(y_test, y_pred_proba)
                except ValueError:
                    auc_score = None
            else:
                auc_score = None

            # Store the results
            self.results[name] = {
                'model': model,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'auc': auc_score,
                'confusion_matrix': confusion_matrix(y_test, y_pred)
            }

            logger.info(f"{name} 性能: Accuracy={accuracy:.4f}, Precision={precision:.4f}, "
                        f"Recall={recall:.4f}, F1={f1:.4f}, AUC={auc_score if auc_score else 'N/A'}")

        return self.results

    def visualize_results(self, y_test, results):
        """可视化结果"""
        logger.info("生成可视化结果...")

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        fig.suptitle('Model comparison for network traffic anomaly detection', fontsize=16, fontweight='bold')

        # 1. Bar chart comparing model performance
        models = list(results.keys())
        accuracies = [results[m]['accuracy'] for m in models]
        f1_scores = [results[m]['f1'] for m in models]

        x = np.arange(len(models))
        width = 0.35

        ax1 = axes[0, 0]
        bars1 = ax1.bar(x - width / 2, accuracies, width, label='Accuracy', color='skyblue')
        bars2 = ax1.bar(x + width / 2, f1_scores, width, label='F1 score', color='lightcoral')

        ax1.set_xlabel('Model')
        ax1.set_ylabel('Score')
        ax1.set_title('Model performance comparison')
        ax1.set_xticks(x)
        ax1.set_xticklabels(models, rotation=45)
        ax1.legend()
        ax1.grid(True, alpha=0.3)

        # Add value labels
        for bars in [bars1, bars2]:
            for bar in bars:
                height = bar.get_height()
                ax1.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                         f'{height:.3f}', ha='center', va='bottom', fontsize=9)

        # 2. Confusion matrix heatmap (best model only)
        best_model = max(results.keys(), key=lambda m: results[m]['f1'])
        cm = results[best_model]['confusion_matrix']

        ax2 = axes[0, 1]
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax2,
                    xticklabels=['Normal', 'Anomaly'], yticklabels=['Normal', 'Anomaly'])
        ax2.set_title(f'{best_model} confusion matrix')
        ax2.set_xlabel('Predicted label')
        ax2.set_ylabel('True label')

        # 3. ROC curves
        ax3 = axes[0, 2]

        for name, result in results.items():
            if result['y_pred_proba'] is not None:
                fpr, tpr, _ = roc_curve(y_test, result['y_pred_proba'])
                roc_auc = auc(fpr, tpr)
                ax3.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

        ax3.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax3.set_xlim([0.0, 1.0])
        ax3.set_ylim([0.0, 1.05])
        ax3.set_xlabel('False positive rate')
        ax3.set_ylabel('True positive rate')
        ax3.set_title('ROC curves')
        ax3.legend(loc="lower right")
        ax3.grid(True, alpha=0.3)

        # 4. Feature importances (random forest)
        if 'Random Forest' in self.models:
            rf_model = self.models['Random Forest']

            if hasattr(rf_model, 'feature_importances_'):
                importances = rf_model.feature_importances_
                if self.preprocessor.selected_features:
                    feature_names = self.preprocessor.selected_features
                else:
                    # X_test is not in scope in this method; size the names from the importances
                    feature_names = [f'Feature_{i}' for i in range(len(importances))]

                indices = np.argsort(importances)[-10:]  # show only the top 10

                ax4 = axes[1, 0]
                ax4.barh(range(len(indices)), importances[indices], color='steelblue')
                ax4.set_yticks(range(len(indices)))
                if self.preprocessor.selected_features and len(feature_names) > max(indices):
                    ax4.set_yticklabels([feature_names[i] for i in indices])
                else:
                    ax4.set_yticklabels([f'Feature {i}' for i in indices])
                ax4.set_xlabel('Feature importance')
                ax4.set_title('Random forest feature importances (Top 10)')

        # 5. Training time comparison
        ax5 = axes[1, 1]

        # Training times are mocked here; in a real run, capture them in train_models
        training_times = [0.5, 0.8, 2.5, 0.3, 0.4]  # example values

        bars = ax5.bar(models, training_times, color=['skyblue', 'lightgreen', 'lightcoral', 'gold', 'violet'])
        ax5.set_xlabel('Model')
        ax5.set_ylabel('Training time (s)')
        ax5.set_title('Model training time comparison')
        ax5.set_xticklabels(models, rotation=45)
        ax5.grid(True, alpha=0.3)

        # 6. Detailed metrics table
        ax6 = axes[1, 2]
        ax6.axis('tight')
        ax6.axis('off')

        # Build the table data
        table_data = []
        for name, result in results.items():
            table_data.append([
                name,
                f"{result['accuracy']:.4f}",
                f"{result['precision']:.4f}",
                f"{result['recall']:.4f}",
                f"{result['f1']:.4f}",
                f"{result['auc']:.4f}" if result['auc'] else 'N/A'
            ])

        table = ax6.table(
            cellText=table_data,
            colLabels=['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'AUC'],
            cellLoc='center',
            loc='center',
            colWidths=[0.15, 0.12, 0.12, 0.12, 0.12, 0.12]
        )
        table.auto_set_font_size(False)
        table.set_fontsize(9)
        table.scale(1, 1.5)
        ax6.set_title('Detailed performance metrics')

        plt.tight_layout()
        plt.savefig('network_anomaly_detection_results.png', dpi=300, bbox_inches='tight')
        logger.info("结果已保存到 network_anomaly_detection_results.png")
        plt.show()

    def run_detection_pipeline(self):
        """运行完整的检测流程"""
        logger.info("=" * 60)
        logger.info("开始网络流量异常检测流程")
        logger.info("=" * 60)

        # 1. Load and preprocess the data
        X_train, X_test, y_train, y_test = self.load_and_preprocess_data()

        # 2. Train the models
        self.train_models(X_train, y_train)

        # 3. Evaluate the models
        results = self.evaluate_models(X_test, y_test)

        # 4. Visualize the results
        self.visualize_results(y_test, results)

        # 5. Report the best model
        best_model = max(results.keys(), key=lambda m: results[m]['f1'])
        logger.info("=" * 60)
        logger.info(f"最佳模型: {best_model}")
        logger.info(f"最佳F1分数: {results[best_model]['f1']:.4f}")
        logger.info(f"最佳准确率: {results[best_model]['accuracy']:.4f}")
        logger.info("=" * 60)

        return results


def main():
    """主函数"""
    # 设置中文字体
    plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False

    # Create the detector and run the pipeline
    detector = NetworkAnomalyDetector()
    results = detector.run_detection_pipeline()

    # Save the results to a text file
    with open('network_detection_results.txt', 'w', encoding='utf-8') as f:
        # Write the classification report
        f.write("=" * 60 + "\n")
        f.write("Detailed classification report (best model):\n")
        f.write("=" * 60 + "\n\n")

        best_model = max(results.keys(), key=lambda m: results[m]['f1'])
        y_test = detector.models[best_model].predict(detector.X_test if hasattr(detector, 'X_test') else None)

        if hasattr(detector, 'X_test') and hasattr(detector, 'y_test'):
            y_pred = results[best_model]['y_pred']
            report = classification_report(detector.y_test, y_pred, target_names=['Normal', 'Anomaly'])
            f.write(f"\n{best_model} classification report:\n")
            f.write(report)

            # Write the model performance metrics
            f.write("\n\nModel performance metrics:\n")
            f.write("=" * 60 + "\n")
            for name, metrics in results.items():
                f.write(f"{name}:\n")
                f.write(f"  准确率: {metrics['accuracy']:.4f}\n")
                f.write(f"  精确率: {metrics['precision']:.4f}\n")
                f.write(f"  召回率: {metrics['recall']:.4f}\n")
                f.write(f"  F1分数: {metrics['f1']:.4f}\n")
                f.write(f"  AUC: {metrics['auc'] if metrics['auc'] else 'N/A'}\n")
                f.write("-" * 60 + "\n")

        # Save the figure
        plt.savefig('network_anomaly_detection_results.png', dpi=300, bbox_inches='tight')
        logger.info("Results saved to network_detection_results.txt and network_anomaly_detection_results.png")
    


if __name__ == "__main__":
    main()
# Run output
F:\pycharm\web1\.venv\Scripts\python.exe F:\pycharm\web1\web.py 
2026-02-08 10:56:35,176 - INFO - ============================================================
2026-02-08 10:56:35,176 - INFO - Starting the network traffic anomaly detection pipeline
2026-02-08 10:56:35,176 - INFO - ============================================================
2026-02-08 10:56:35,176 - INFO - Loading and preprocessing data...
2026-02-08 10:56:35,176 - INFO - Generating simulated network traffic data...
2026-02-08 10:56:35,227 - INFO - Data generation complete: 11880 samples, 42 features
2026-02-08 10:56:35,230 - INFO - Class distribution: Counter({0: 9900, 1: 1980})
2026-02-08 10:56:35,231 - INFO - Starting preprocessing pipeline...
2026-02-08 10:56:35,236 - INFO - Handling missing values...
2026-02-08 10:56:35,253 - INFO - Missing value stats:
Empty DataFrame
Columns: [missing_count, missing_pct]
Index: []
2026-02-08 10:56:35,256 - INFO - Handling outliers...
2026-02-08 10:56:35,351 - INFO - Detected and treated 17882 outlier values
2026-02-08 10:56:35,351 - INFO - Encoding categorical features...
2026-02-08 10:56:35,354 - INFO - Encoded categorical feature: protocol_type -> 3 classes
2026-02-08 10:56:35,357 - INFO - Encoded categorical feature: service -> 5 classes
2026-02-08 10:56:35,360 - INFO - Encoded categorical feature: flag -> 5 classes
2026-02-08 10:56:35,360 - INFO - Performing feature engineering...
2026-02-08 10:56:35,366 - INFO - Feature count after engineering: 49
2026-02-08 10:56:35,371 - INFO - Selecting the 20 best features with f_classif...
2026-02-08 10:56:35,392 - INFO - Selected features: ['duration', 'service', 'flag', 'src_bytes', 'dst_bytes', 'serror_rate', 'srv_serror_rate', 'srv_rerror_rate', 'same_srv_rate', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_srv_rerror_rate', 'bytes_ratio', 'total_error_rate', 'total_rerror_rate', 'duration_log', 'src_bytes_log', 'flag_sum']
2026-02-08 10:56:35,397 - INFO - Preprocessing complete. Final feature shape: (11880, 20)
2026-02-08 10:56:35,405 - INFO - Data split: train=(9504, 20), test=(2376, 20)
2026-02-08 10:56:35,407 - INFO - Training class distribution: Counter({0: 7920, 1: 1584})
2026-02-08 10:56:35,408 - INFO - Test class distribution: Counter({0: 1980, 1: 396})
2026-02-08 10:56:35,409 - INFO - Training models...
2026-02-08 10:56:35,410 - INFO - Training Random Forest...
2026-02-08 10:56:35,901 - INFO - Random Forest trained in 0.49s
2026-02-08 10:56:35,901 - INFO - Training Gradient Boosting...
2026-02-08 10:56:44,362 - INFO - Gradient Boosting trained in 8.46s
2026-02-08 10:56:44,362 - INFO - Training SVM...
2026-02-08 10:56:46,921 - INFO - SVM trained in 2.56s
2026-02-08 10:56:46,921 - INFO - Training Logistic Regression...
2026-02-08 10:56:49,474 - INFO - Logistic Regression trained in 2.55s
2026-02-08 10:56:49,475 - INFO - Training KNN...
2026-02-08 10:56:49,476 - INFO - KNN trained in 0.00s
2026-02-08 10:56:49,476 - INFO - Evaluating model performance...
2026-02-08 10:56:49,476 - INFO - Evaluating Random Forest...
2026-02-08 10:56:49,551 - INFO - Random Forest performance: Accuracy=0.9756, Precision=0.9787, Recall=0.9756, F1=0.9762, AUC=0.9986761554943373
2026-02-08 10:56:49,552 - INFO - Evaluating Gradient Boosting...
2026-02-08 10:56:49,574 - INFO - Gradient Boosting performance: Accuracy=0.9870, Precision=0.9879, Recall=0.9870, F1=0.9871, AUC=0.9997028364452608
2026-02-08 10:56:49,574 - INFO - Evaluating SVM...
2026-02-08 10:56:50,005 - INFO - SVM performance: Accuracy=0.9857, Precision=0.9868, Recall=0.9857, F1=0.9859, AUC=0.9974977043158861
2026-02-08 10:56:50,005 - INFO - Evaluating Logistic Regression...
2026-02-08 10:56:50,020 - INFO - Logistic Regression performance: Accuracy=0.9705, Precision=0.9750, Recall=0.9705, F1=0.9715, AUC=0.9862424752576268
2026-02-08 10:56:50,020 - INFO - Evaluating KNN...
2026-02-08 10:56:52,684 - INFO - KNN performance: Accuracy=0.9798, Precision=0.9820, Recall=0.9798, F1=0.9803, AUC=0.9944444444444444
2026-02-08 10:56:52,684 - INFO - Generating visualizations...
2026-02-08 10:56:57,282 - INFO - Results saved to network_anomaly_detection_results.png
2026-02-08 10:57:03,609 - INFO - ============================================================
2026-02-08 10:57:03,609 - INFO - Best model: Gradient Boosting
2026-02-08 10:57:03,609 - INFO - Best F1 score: 0.9871
2026-02-08 10:57:03,609 - INFO - Best accuracy: 0.9870
2026-02-08 10:57:03,609 - INFO - ============================================================
Traceback (most recent call last):
  File "F:\pycharm\web1\web.py", line 682, in <module>
    main()
  File "F:\pycharm\web1\web.py", line 642, in main
    y_test = detector.models[best_model].predict(detector.X_test if hasattr(detector, 'X_test') else None)
  File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\ensemble\_gb.py", line 1627, in predict
    raw_predictions = self.decision_function(X)
  File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\ensemble\_gb.py", line 1580, in decision_function
    X = validate_data(
  File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\utils\validation.py", line 2954, in validate_data
    out = check_array(X, input_name="X", **check_params)
  File "F:\pycharm\web1\.venv\lib\site-packages\sklearn\utils\validation.py", line 1068, in check_array
    raise ValueError(
ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

How do I fix this?

1 answer

  • 檀越@新空间 2026-02-08 11:05

    Good morning ☀️☀️☀️
    This answer references Tongyi Qianwen

    In the model prediction stage, an incorrect input data format usually means the features supplied at prediction time do not match the features used at training time. Typical cases include:

    • Missing features: the prediction data lacks features that were present in the training data.
    • Extra features: the prediction data contains features the training data did not have.
    • Different feature order: the columns are ordered differently than at training time.
    • Inconsistent data types: e.g. a column was numeric at training time but is a string at prediction time.
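
    A quick way to see which case you are in is to diff the prediction columns against the training columns (a minimal sketch; train_columns and pred_df are assumed names for the saved training column list and your prediction DataFrame):

    # Compare the prediction columns against the columns used at training time
    missing_cols = set(train_columns) - set(pred_df.columns)  # expected by the model but absent
    extra_cols = set(pred_df.columns) - set(train_columns)    # never seen during training
    print("Missing columns:", missing_cols)
    print("Extra columns:", extra_cols)
    print("Dtypes:\n", pred_df.dtypes)  # look for object columns that should be numeric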

    To address this, we can take the following steps to ensure the prediction-stage data format is correct.


    ✅ Problem analysis

    "The input data format is incorrect at the model prediction stage" generally means that the data passed to the model at prediction time is structured differently from the data it was trained on. For example:

    • At training time the data contained categorical variables such as protocol_type and service, but those fields are missing at prediction time.
    • The data contains unhandled missing values (such as NaN) or non-numeric fields.
    • The data format is wrong, e.g. a DataFrame was converted to a numpy array and the column names were lost.
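
    In the code you posted, this is exactly what happens: load_and_preprocess_data never assigns X_test to the detector, so in main() the expression detector.X_test if hasattr(detector, 'X_test') else None evaluates to None, and predict(None) is what produces "Expected 2D array, got scalar array instead: array=nan". A minimal fix sketch that keeps your class structure (the self.X_test attributes are new):

    # In NetworkAnomalyDetector.load_and_preprocess_data, right after train_test_split:
    self.X_train, self.X_test = X_train, X_test  # keep the split on the instance
    self.y_train, self.y_test = y_train, y_test  # so main() can reach it later

    # In main(), reuse the predictions stored by evaluate_models instead of
    # calling predict() again with a possibly missing attribute:
    best_model = max(results.keys(), key=lambda m: results[m]['f1'])
    y_pred = results[best_model]['y_pred']
    report = classification_report(detector.y_test, y_pred, target_names=['Normal', 'Anomaly'])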

    🔧 Solutions

    1. Make sure the prediction data has the same column structure as the training data

    ✅ Steps:

    • Before training the model, save the training data's column names (feature_names).
    • At prediction time, make sure the input contains columns with the same names and count.
    • If the prediction data is missing columns, add them manually and fill in a default value (such as 0 or an empty string).
    • If the prediction data has extra columns, drop them.

    📌 Example code:

    # Assume `model` is the trained model, `train_data` is the DataFrame used for
    # training, and `imputer` is the imputer instance fitted during training

    # Save the feature columns used at training time
    feature_columns = train_data.columns.tolist()

    # Preprocess data for prediction
    def preprocess_for_prediction(prediction_df):
        # Make the column set and order match training; missing columns are filled with 0
        prediction_df = prediction_df.reindex(columns=feature_columns, fill_value=0)

        # Fill missing values with the imputer fitted on the training data
        # (do not refit an imputer on prediction data)
        prediction_df = pd.DataFrame(imputer.transform(prediction_df),
                                     columns=prediction_df.columns)

        return prediction_df
    

    2. Apply the same preprocessing to the prediction data as to the training data

    ✅ Steps:

    • Before training, apply standardization, encoding, missing-value handling, etc. to the training data.
    • At prediction time, process the prediction data with the same fitted preprocessors (such as StandardScaler and LabelEncoder).
    • Note: do not retrain the preprocessors; reuse the already-fitted instances.

    📌 Example code:

    # Assume scaler and label_encoders are the preprocessors fitted during training
    def preprocess_prediction_data(df, scaler, label_encoders):
        # Encode the categorical variables
        for col, encoder in label_encoders.items():
            if col in df.columns:
                df[col] = encoder.transform(df[col].astype(str))

        # Standardize the numeric features
        df = pd.DataFrame(scaler.transform(df), columns=df.columns)

        return df
    

    3. Verify that the prediction data meets the model's expectations

    ✅ Steps:

    • Before calling model.predict(), print the prediction data's shape and column names and confirm they match the training data.
    • You can check with print(prediction_df.shape) and print(prediction_df.columns).

    📌 Example code:

    # Assume prediction_df is the prediction data
    print("Prediction data shape:", prediction_df.shape)
    print("Prediction data columns:", prediction_df.columns)
    

    4. Handle missing values and outliers

    ✅ Steps:

    • If the prediction data has missing values, fill them with the SimpleImputer or KNNImputer fitted at training time.
    • If the prediction data has outliers (extremely large or small values), detect and handle them with IsolationForest or the Z-score method.

    📌 Example code:

    from sklearn.impute import KNNImputer

    # Use the KNNImputer instance that was fitted on the training data;
    # calling transform() on a freshly created, unfitted imputer raises an error
    prediction_df = pd.DataFrame(imputer.transform(prediction_df),
                                 columns=prediction_df.columns)
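
    For the outlier side, a minimal IsolationForest sketch (contamination=0.05 is an assumed value to tune; X_train is the training feature matrix):

    from sklearn.ensemble import IsolationForest

    # Fit on the training data only; predict() returns -1 for outliers, 1 for inliers
    iso = IsolationForest(contamination=0.05, random_state=42)
    iso.fit(X_train)
    outlier_flags = iso.predict(prediction_df)
    prediction_df = prediction_df[outlier_flags == 1]  # optionally drop the flagged rows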
    

    5. Unify the workflow with a Pipeline

    ✅ Steps:

    • Wrap preprocessing, feature selection, and model training into a single Pipeline.
    • At prediction time, feed the prediction data straight through that Pipeline.

    📌 Example code:

    from sklearn.pipeline import Pipeline

    # Build the pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier())
    ])

    # Train the model
    pipeline.fit(train_data, train_labels)

    # Predict
    prediction = pipeline.predict(prediction_data)
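
    One caveat: LabelEncoder is designed for target labels and does not slot into a Pipeline for feature columns. For mixed numeric/categorical features such as protocol_type and service, the idiomatic route is a ColumnTransformer with OneHotEncoder (a sketch reusing the assumed names above; the prediction frame must still contain the same raw columns, so combine this with the reindex step from step 1):

    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier

    categorical_cols = ['protocol_type', 'service']
    numeric_cols = [c for c in train_data.columns if c not in categorical_cols]

    preprocess = ColumnTransformer([
        # handle_unknown='ignore' keeps prediction from crashing on unseen categories
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols),
    ])

    pipeline = Pipeline([
        ('preprocess', preprocess),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])
    pipeline.fit(train_data, train_labels)
    predictions = pipeline.predict(prediction_data)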
    

    📦 Complete revised code example

    Assuming you have a full machine-learning workflow, the revised snippet below keeps the prediction data format correct:

    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.impute import KNNImputer
    from sklearn.ensemble import RandomForestClassifier

    # Suppose this is the training data
    train_data = pd.DataFrame({
        'duration': [1, 2, 3],
        'protocol_type': ['tcp', 'udp', 'icmp'],
        'service': ['http', 'ftp', 'ssh']
    })
    train_labels = pd.Series([0, 1, 0])

    # Suppose this is the prediction data (its format may be inconsistent)
    prediction_data = pd.DataFrame({
        'duration': [4],
        'protocol_type': ['tcp'],
        # the 'service' column is missing
    })

    # Define the preprocessing
    class Preprocessor:
        def __init__(self):
            self.scaler = StandardScaler()
            self.label_encoders = {}
            # small n_neighbors because this demo has only 3 training rows
            self.imputer = KNNImputer(n_neighbors=2)
            self.feature_columns = None

        def fit(self, data):
            data = data.copy()  # do not mutate the caller's DataFrame

            # Encode the categorical variables
            for col in data.select_dtypes(include=['object']).columns:
                le = LabelEncoder()
                data[col] = le.fit_transform(data[col].astype(str))
                self.label_encoders[col] = le

            # Remember the training column layout
            self.feature_columns = data.columns.tolist()

            # Fit the imputer and the scaler on the training data only
            self.imputer.fit(data)
            self.scaler.fit(data)
            return self

        def transform(self, data):
            data = data.copy()

            # Encode categorical variables with the encoders fitted at training time
            for col, encoder in self.label_encoders.items():
                if col in data.columns:
                    data[col] = encoder.transform(data[col].astype(str))

            # Align the columns with the training layout; missing columns become NaN
            data = data.reindex(columns=self.feature_columns)

            # Fill the missing values
            data = pd.DataFrame(self.imputer.transform(data), columns=self.feature_columns)

            # Standardize
            data = pd.DataFrame(self.scaler.transform(data), columns=self.feature_columns)

            return data

    # Create the preprocessor and fit it on the training data
    preprocessor = Preprocessor()
    preprocessor.fit(train_data)

    # Preprocess the prediction data
    processed_prediction = preprocessor.transform(prediction_data)

    # Build and train the model
    model = RandomForestClassifier(random_state=42)
    model.fit(preprocessor.transform(train_data), train_labels)

    # Make predictions
    predictions = model.predict(processed_prediction)
    print("Predictions:", predictions)
    

    ✅ Summary

    | Step | Description |
    |------|-------------|
    | 1 | Make sure the prediction data has the same columns as the training data |
    | 2 | Process the prediction data with the same fitted preprocessors |
    | 3 | Verify that the prediction data format matches what the model expects |
    | 4 | Handle missing values and outliers |
    | 5 | Unify the workflow with a Pipeline |

    With these steps you can resolve the "input data format is incorrect at the model prediction stage" problem and keep the model running normally.
