在logistic回归中,想通过图像查看连续型变量x与logit(P)是否满足线性关系,并用Python实现。我请DeepSeek写了下面的代码,请帮忙看一下这段代码有没有问题?谢谢!
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from statsmodels.nonparametric.smoothers_lowess import lowess
def plot_logit_linearity(data, x_col, y_col, frac=0.3, n_bins=10):
    """
    Plot the observed log-odds of a binary outcome against a continuous
    predictor to check the linearity-in-the-logit assumption.

    Three things are drawn on the same axes:

    * binned empirical logits (points): x is cut into quantile bins and, for
      each bin, log((events + 0.5) / (non_events + 0.5)) is plotted at the
      bin's mean x;
    * a LOWESS curve (red): the raw 0/1 outcome is smoothed against x to get
      a model-free estimate of P(y=1 | x), which is then logit-transformed;
    * the fitted straight line (green, dashed) from an unpenalised
      single-predictor logistic regression.

    If the points and the red curve follow the green line, the linearity
    assumption is plausible; systematic curvature away from it suggests the
    predictor needs a transformation, spline or categorisation.

    Parameters:
    - data: DataFrame containing the predictor and the outcome
    - x_col: name of the continuous predictor column to check
    - y_col: name of the binary outcome column (coded 0/1)
    - frac: fraction of the data used for each LOWESS local fit, in (0, 1]
    - n_bins: number of quantile bins for the empirical logits (>= 2);
      fewer bins are used when x has many tied values

    Returns:
    None. The figure is displayed with ``plt.show()``.

    Raises:
    ValueError: if ``frac`` or ``n_bins`` is out of range, if the outcome
    contains values other than 0/1, or if either column has fewer than two
    distinct non-missing values.
    """
    if not 0 < frac <= 1:
        raise ValueError("frac must be in the interval (0, 1]")
    if n_bins < 2:
        raise ValueError("n_bins must be at least 2")

    # Prepare data: rows with a missing predictor or outcome cannot be used
    # by the model, the smoother or the binning, so drop them up front.
    clean = data[[x_col, y_col]].dropna()
    x = clean[x_col].to_numpy(dtype=float)
    y = clean[y_col].to_numpy(dtype=float)

    if not np.isin(y, (0.0, 1.0)).all():
        raise ValueError(f"{y_col} must be coded as 0/1")
    if np.unique(x).size < 2 or np.unique(y).size < 2:
        raise ValueError(
            f"{x_col} and {y_col} must each have at least two distinct values"
        )

    # Reference line: unpenalised maximum-likelihood logistic regression.
    # (A regularised fit would shrink the slope and distort the comparison.)
    logit_fit = sm.Logit(y, sm.add_constant(x)).fit(disp=0)
    intercept, coef = logit_fit.params

    # Binned empirical logits. These come from the observed outcomes only,
    # NOT from the model's predictions: the logit of a logistic model's own
    # predictions is linear in x by construction and would prove nothing.
    bin_ids = pd.qcut(x, q=n_bins, labels=False, duplicates="drop")
    grouped = pd.DataFrame({"x": x, "y": y}).groupby(bin_ids)
    bin_mean_x = grouped["x"].mean().to_numpy()
    events = grouped["y"].sum().to_numpy()
    totals = grouped["y"].count().to_numpy()
    # The +0.5 continuity correction keeps the logit finite when a bin has
    # no events or no non-events.
    empirical_logit = np.log((events + 0.5) / (totals - events + 0.5))

    # LOWESS on the raw 0/1 outcome estimates P(y=1 | x) without assuming a
    # functional form. Robustness iterations are disabled (it=0) because,
    # with a binary response, they can down-weight the rarer class as if it
    # were outliers.
    smoothed = lowess(y, x, frac=frac, it=0)
    # Clip before the logit transform so smoothed values at or beyond 0/1
    # do not produce infinities; this flattens the curve at about +/-6.9.
    eps = 1e-3
    smooth_prob = np.clip(smoothed[:, 1], eps, 1 - eps)
    smooth_logit = np.log(smooth_prob / (1 - smooth_prob))

    # Draw the figure
    plt.figure(figsize=(10, 6))
    plt.scatter(bin_mean_x, empirical_logit, alpha=0.8, color='blue',
                label='Binned empirical logit', s=40)
    plt.plot(smoothed[:, 0], smooth_logit,
             color='red', linewidth=2, label='LOWESS (observed outcome)')

    # Fitted straight line on the logit scale
    x_vals = np.array([x.min(), x.max()])
    y_vals = intercept + coef * x_vals
    plt.plot(x_vals, y_vals, 'g--', linewidth=2, label='Linear Fit')

    # Labels, legend and grid
    plt.xlabel(f'Independent Variable: {x_col}', fontsize=12)
    plt.ylabel('Logit(P)', fontsize=12)
    plt.title(f'Linearity Check: {x_col} vs. Logit(P)', fontsize=14)
    plt.legend(fontsize=10)
    plt.grid(True, alpha=0.3)
    plt.show()
# Example usage
if __name__ == "__main__":
    # Generate reproducible example data
    np.random.seed(42)
    n_samples = 500

    # Keep simulated ages inside a plausible adult range
    age = np.clip(np.random.normal(45, 15, n_samples), 18, 90)

    # Draw the outcome from a logistic model that is truly linear in age
    # (probability 0.5 at age 45). An outcome generated independently of
    # age would leave the plot with no relationship to inspect.
    true_logit = -4.5 + 0.1 * age
    true_prob = 1.0 / (1.0 + np.exp(-true_logit))
    data = pd.DataFrame({
        'age': age,
        'disease': np.random.binomial(1, true_prob),
    })

    # Draw the diagnostic plot
    plot_logit_linearity(data, 'age', 'disease', frac=0.3)