问题遇到的现象和发生背景
一个信用风控问题,给出了训练集和测试集,在通过分箱时,计算iv值发生了问题
import pandas as pd
import numpy as np
X_train_df0=pd.read_csv("train_X.csv")
y_train_df0=pd.read_csv("train_y.csv")
X_train_df0
train_df0=pd.merge(X_train_df0,y_train_df0,on='ID')
train_df0
![img](https://img-mid.csdnimg.cn/release/static/image/mid/ask/553695992346191.png "#left")
运行结果及报错内容
##定义分箱函数,计算iv值
def cal_iv(data,cut_num,feature,target):
#1.数据分箱
data_cut=pd.cut(data[feature],cut_num)
#2.统计各个分箱的违约样本,未违约样本
cut_group_all=data[target].groupby(data_cut).count()
cut_group_y=data[target].groupby(data_cut).sum()
cut_group_n=cut_group_all=cut_group_y
#3.统计样本比率
df=pd.DataFrame()
df['bad']=list(cut_group_y)
df['good']=list(cut_group_n)
df['all']=list(cut_group_all)
df['违约%']=df['bad'] /df['bad'].sum()
df['不违约%']=df['good'] /df['good'].sum()
#4.计算WOE
df['WOE']=np.log(df['违约%']/df['不违约%'])
df=df.replace({'WOE':{np.inf:0,-np.inf:0}})
#5.计算各个分箱的iv值
df['IV']=df['WOE']*(df['违约%']-df['不违约%'])
iv=df['IV'].sum()
return iv
print(cal_iv(train_df0,10,'x2','y_1'))
iv_list=[]
df_iv=pd.DataFrame()
for i in train_df0.columns[:-1]:
a=cal_iv(train_df0,10,i,'y-1')
iv_list.append(a)
df_iv['feature']=list(train_df0.columns[:-1])
df_iv['IV']=iv_list
b=df_iv.sort_values(by='IV',ascending=False)
b.head(10)
报错结果:
KeyError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2894 try:
-> 2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'y_1'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-31-837000a62aae> in <module>
21 iv=df['IV'].sum()
22 return iv
---> 23 print(cal_iv(train_df0,10,'x2','y_1'))
24 iv_list=[]
25 df_iv=pd.DataFrame()
<ipython-input-31-837000a62aae> in cal_iv(data, cut_num, feature, target)
4 data_cut=pd.cut(data[feature],cut_num)
5 #2.统计各个分箱的违约样本,未违约样本
----> 6 cut_group_all=data[target].groupby(data_cut).count()
7 cut_group_y=data[target].groupby(data_cut).sum()
8 cut_group_n=cut_group_all=cut_group_y
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2900 if self.columns.nlevels > 1:
2901 return self._getitem_multilevel(key)
-> 2902 indexer = self.columns.get_loc(key)
2903 if is_integer(indexer):
2904 indexer = [indexer]
~\anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
2895 return self._engine.get_loc(casted_key)
2896 except KeyError as err:
-> 2897 raise KeyError(key) from err
2898
2899 if tolerance is not None:
KeyError: 'y_1'
我的解答思路和尝试过的方法
我是照着别人的代码扒的,原代码数据名称和我不一样,y-1我理解的是,train-y中有一列是y,好用户是0,坏用户是1.原代码此处为'Loan_status_Charged Off'