如图
以下是代码
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
import seaborn as sns
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from sklearn.model_selection import cross_val_predict
save_path=r"E:\研究生资料\数据清洗\CGSS\CGSS2010\df_result.csv"
path=r"E:\研究生资料\数据清洗\CGSS\CGSS2010\df_estima.csv"
varimp_path=r"E:\研究生资料\数据清洗\CGSS\CGSS2010"
df=pd.read_csv(path,index_col=0)
df=df[df["income"]>0]
df.index=np.arange(len(df))
#-------------------------描述性统计-------------------------
#生成虚拟变量:
init_var=list()
dumm_var=["fedu","fwork","ftype","medu","mtype",
'mwork'
]
for i in dumm_var:
df=pd.concat([df,pd.get_dummies(df[i],prefix=i)],axis=1)
remove_var=['id','provinces','hu','work','income']
remove_var.extend(dumm_var)
v=list(df.columns)
for i in remove_var:
v.remove(i)
income=np.log(df["income"].values)
x=df[v].values
#-------------------------Python:回归树-------------------------
clf = tree.DecisionTreeRegressor()
clf2 = clf.fit(x,income.ravel())
# py_rt_pre=clf.predict(x)
py_rt_pre=cross_val_predict(clf2,x,income.ravel(), cv=10)
#-------------------------Python:随机森林-------------------------
rfc=RandomForestRegressor(n_estimators=1000,random_state=2)
#n_estimators 迭代次数,即决策树的数量;random_state 控制随机性
rfc2= rfc.fit(x,income.ravel())
# py_rf_pre=rfc.predict(x)
py_rf_pre=cross_val_predict(rfc2,x,income.ravel(), cv=10)
varimp_rf=rfc2.feature_importances_ #特征筛选
# df_varimp_rf
df_varimp_rf=DataFrame({"var":v,"varimp":list(varimp_rf)})
df_varimp_rf=df_varimp_rf.sort_values(by=["varimp"],ascending=False)
#ascending=true 表示降序排列;ascending=false 表示升序排序
#------------------------------R:party------------------------------
party=importr("party")
base=importr("base")
stats=importr("stats")
df_r=df[v]
df_r.loc[:,"lny"]=income
with localconverter(ro.default_converter + pandas2ri.converter):
df_r_use=ro.conversion.py2rpy(df_r)
x_name=list(df_r.columns)
x_name.remove("lny")
formula_str="+".join(x_name)
formula_str="lny~"+formula_str
#----------------------party:条件推断树----------------------
base.set_seed(1)
ct=party.ctree(formula=stats.as_formula(formula_str),
data=df_r_use,
control=party.ctree_control(
mincriterion=0.90,
testtype="Bonferroni",
)
)
pa_ct_pre=stats.predict(ct)
pa_ct_pre=np.array(pa_ct_pre)
#----------------------party:条件推断森林----------------------
base.set_seed(1)
ctf=party.cforest(formula=stats.as_formula(formula_str),
data=df_r_use,
control=party.cforest_control(
mincriterion=0.90,
mtry=len(v)**(0.5),
testtype="Bonferroni",
ntree=1000,
fraction=0.5,
replace=False
)
)
pa_cf_pre=stats.predict(ctf)
pa_cf_pre=np.array(pa_cf_pre)
varimp_cf=party.varimp(ctf)
df_varimp_cf=DataFrame({"var":x_name,"varimp":list(varimp_cf)})
df_varimp_cf.loc[:,"varimp_sd"]=df_varimp_cf["varimp"]/np.sum(df_varimp_cf["varimp"])
df_varimp_cf=df_varimp_cf.sort_values(by=["varimp"],ascending=False)
#整理最终数据表
df=df[init_var]
df.loc[:,"py_rt"]=py_rt_pre
df.loc[:,"py_rf"]=py_rf_pre
df.loc[:,"pa_ct"]=pa_ct_pre
df.loc[:,"pa_cf"]=pa_cf_pre
# sns.displot(data=df,x="py_rt",kde=True)
# sns.displot(data=df,x="py_rf",kde=True)
# sns.displot(data=df,x="pa_ct",kde=True)
# sns.displot(data=df,x="pa_cf",kde=True)
# income_pre=df[["income","py_rt","py_rf","pa_ct","pa_cf"]]
# f=lambda x:x.describe()
# income_pre.apply(f)
# income_pre.apply(np.quantile,q=(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9))
df.to_csv(save_path,header=True)
df_varimp_cf.to_csv(varimp_path+"\\varimp_cf.csv",header=True)
df_varimp_rf.to_csv(varimp_path+"\\varimp_rf.csv",header=True)