问题遇到的现象和发生背景
遇到的现象和发生背景,请写出第一个错误信息
请使用代码块功能插入代码,请勿粘贴截图。不使用代码块,回答率会下降 50%。
运行结果及详细报错内容
我的解答思路和尝试过的方法(不写自己思路的,回答率会下降 60%)
我想要达到的结果,如果你需要快速回答,请尝试 “付费悬赏”
# 导入核心库
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
# 1. Data loading and initial exploration
df = pd.read_csv('Fashion_Retail_Sales.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())
# 2. Data cleaning and preprocessing
# Parse purchase dates; values that cannot be parsed become NaT (errors='coerce').
df['Date Purchased'] = pd.to_datetime(df['Date Purchased'], errors='coerce')

# Record which ratings were missing BEFORE imputing them. Computed after the
# imputation, the flag would only mark rows the imputation could not fill.
df['Is_Rating_Missing'] = df['Review Rating'].isnull().astype(int)

# Impute missing ratings with the mean rating of the same item, falling back to
# the overall mean when an item has no rating at all (or no item name).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection, which pandas deprecates because
# the call may operate on a temporary copy and leave `df` unchanged.
overall_mean_rating = df['Review Rating'].mean()
item_mean_rating = df.groupby('Item Purchased')['Review Rating'].transform('mean')
df['Review Rating'] = (
    df['Review Rating'].fillna(item_mean_rating).fillna(overall_mean_rating)
)

# Define the target variable (the median purchase amount is assumed as the
# threshold between "high value" and the rest).
# NOTE: a missing purchase amount compares False against the median, so such
# rows are labelled 0.
median_amount = df['Purchase Amount (USD)'].median()
df['High_Value_Flag'] = (df['Purchase Amount (USD)'] > median_amount).astype(int)
# 3. Feature engineering - aggregate transactions to the customer level
def _most_frequent(values):
    """Return the most frequent non-null value of *values*, or NaN if none exists."""
    modes = values.mode()
    # mode() is empty when every value in the group is missing; indexing it
    # with [0] would raise, so fall back to NaN instead.
    return modes.iloc[0] if not modes.empty else np.nan


# Reference "today" used to compute recency (assumed to be 2023-12-31).
REFERENCE_DATE = pd.Timestamp('2023-12-31')

# Named aggregation yields flat, explicitly named columns, so no positional
# renaming of a MultiIndex header is needed afterwards.
customer_features = (
    df.groupby('Customer Reference ID')
    .agg(
        # 'size' counts every transaction row; 'count' would silently skip
        # transactions whose purchase amount is missing.
        Total_Transactions=('Purchase Amount (USD)', 'size'),
        Avg_Spending=('Purchase Amount (USD)', 'mean'),      # average amount
        Favorite_Category=('Item Purchased', _most_frequent),  # favourite item
        Preferred_Payment=('Payment Method', _most_frequent),  # usual payment method
        Avg_Rating=('Review Rating', 'mean'),                # average rating
        Last_Purchase_Date=('Date Purchased', 'max'),        # most recent purchase
    )
    .reset_index()
    .rename(columns={'Customer Reference ID': 'CustomerID'})
)

# Recency: whole days between the reference date and the last purchase.
customer_features['Recency'] = (
    REFERENCE_DATE - customer_features['Last_Purchase_Date']
).dt.days
customer_features = customer_features.drop(columns='Last_Purchase_Date')
# Encode the categorical columns as integer codes, keeping one fitted encoder
# per column in `label_encoders` so the codes can be mapped back later.
# The loop body must be indented under the `for`; without indentation the
# script does not parse.
# NOTE(review): LabelEncoder assigns integers by sorted label order, which a
# linear model will read as an ordinal scale -- consider one-hot encoding.
label_encoders = {
    col: LabelEncoder() for col in ['Favorite_Category', 'Preferred_Payment']
}
for col, le in label_encoders.items():
    # Cast to str first so every value, including missing ones, is a string label.
    customer_features[col] = le.fit_transform(customer_features[col].astype(str))
# Attach the target: the value flag of each customer's most recent transaction.
# Rows whose date is NaT are sorted FIRST so they can never be picked as the
# "latest" purchase; the default na_position='last' would place them at the
# end, exactly where .last() selects from. The stable sort keeps the original
# row order among transactions sharing a date, so the result is deterministic.
# NOTE(review): the label comes from a purchase amount that is also part of the
# Avg_Spending feature -- possible target leakage; confirm whether the last
# transaction should be excluded from the aggregated features.
customer_target = (
    df.sort_values('Date Purchased', na_position='first', kind='stable')
    .groupby('Customer Reference ID')['High_Value_Flag']
    .last()
)
customer_df = customer_features.merge(
    customer_target, left_on='CustomerID', right_index=True
)
# 4. Model training and evaluation
X = customer_df.drop(['CustomerID', 'High_Value_Flag'], axis=1)
y = customer_df['High_Value_Flag']

# Stratify on the label so the train and test sets keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fill remaining missing feature values with medians learned from the training
# set only (no information from the test set is used); LogisticRegression
# rejects input that contains NaN.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Standardize (important for linear models); the scaler is fitted on the
# training set and then applied unchanged to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Logistic regression (baseline model), trained on the standardized features
lr_model = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
# Column 1 of predict_proba holds the probability of the positive class.
lr_positive_proba = lr_model.predict_proba(X_test_scaled)[:, 1]
print("Logistic Regression AUC: ", roc_auc_score(y_test, lr_positive_proba))

# Random forest, trained on the unscaled features (tree models do not need
# standardization)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
y_pred_rf = rf_model.predict(X_test)
rf_positive_proba = rf_model.predict_proba(X_test)[:, 1]
print("Random Forest AUC: ", roc_auc_score(y_test, rf_positive_proba))
# 5. Feature importance visualization
# Pair each feature name with the importance the random forest assigned to it,
# sorted ascending so the most important feature ends up at the top of the
# horizontal bar chart.
importance_table = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_model.feature_importances_}
)
feature_importances = importance_table.sort_values('importance', ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances['feature'], feature_importances['importance'])
ax.set_xlabel('Feature Importance')
ax.set_title('Random Forest Feature Importance')
fig.tight_layout()
plt.show()