I am training a binary classification model with gradient descent. With a [n,20,4,1] network the cost decreases and converges,
but with [n,20,7,5,1] the cost starts at 0.69, drops to about 0.64, and then stops decreasing. Why is that?
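(For reference: the initial cost of 0.69 is just ln 2, which is what the cross-entropy cost gives when every prediction is 0.5 regardless of the labels, so a cost that only moves from 0.69 to 0.64 means the predictions stay very close to 0.5. A quick check of that number:)

import numpy as np
# Cross-entropy cost when every prediction A equals 0.5, for any labels Y:
# -mean(Y*log(0.5) + (1-Y)*log(0.5)) = log(2)
print(np.log(2))  # 0.6931471805599453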
Here is the code:
import numpy as np
import matplotlib.pyplot as plt
import h5py
#Parameter initialization: all w/b are stored in a single dict
def initialize_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims)
    for i in range(1, L):
        parameters['w' + str(i)] = np.random.randn(layer_dims[i], layer_dims[i-1]) * 0.01
        parameters['b' + str(i)] = np.zeros((layer_dims[i], 1))
        assert parameters['w' + str(i)].shape == (layer_dims[i], layer_dims[i-1])
        assert parameters['b' + str(i)].shape == (layer_dims[i], 1)
    return parameters
#Activation functions
def relu(Z):
    A = (Z + abs(Z)) / 2
    assert A.shape == Z.shape
    return A

def sigmoid(Z):
    A = 1.0 / (1 + np.exp(-Z))
    assert A.shape == Z.shape
    return A
#Forward propagation
def forward_propagation(X, parameters):
    #caches stores the A and Z values computed at every layer
    caches = {}
    L = len(parameters) // 2
    A_prev = X
    for i in range(1, L):
        Z = np.dot(parameters['w' + str(i)], A_prev) + parameters['b' + str(i)]
        A = relu(Z)
        caches['Z' + str(i)] = Z
        caches['A' + str(i)] = A
        #keep this layer's A: the next layer needs it to compute Z
        A_prev = A
    #the output layer's activation function is sigmoid
    Z = np.dot(parameters['w' + str(L)], A_prev) + parameters['b' + str(L)]
    A = sigmoid(Z)
    caches['Z' + str(L)] = Z
    caches['A' + str(L)] = A
    #X is stored as well because back propagation needs it
    caches['A0'] = X
    return A, caches
#Compute the cost
def compute_cost(A, Y):
    m = Y.shape[1]
    cost = -1 / m * np.sum(np.multiply(np.log(A), Y) + np.multiply((1 - Y), np.log(1 - A)))
    cost = np.squeeze(cost)
    return cost
#Derivative of the relu function
def relu_back(Z, dA):
    deri = Z
    deri[Z < 0] = 0
    deri[Z >= 0] = 1
    return deri
#Back propagation
def back_propagation(Y, caches, parameters):
    #all the dw and db gradients
    grads = {}
    L = len(caches) // 2
    m = Y.shape[1]
    #AL is the prediction produced by this iteration
    AL = caches['A' + str(L)]
    #the sigmoid output layer is handled separately, since its backward step differs from relu
    dZ = AL - Y
    dW = np.dot(dZ, caches['A' + str(L-1)].T) / m
    db = np.sum(dZ, axis=1, keepdims=True) / m
    grads['dw' + str(L)] = dW
    grads['db' + str(L)] = db
    for i in reversed(range(1, L)):
        dA = np.dot(parameters['w' + str(i+1)].T, dZ)
        dZ = np.multiply(dA, relu_back(caches['Z' + str(i)], dA))
        dW = 1.0 / m * np.dot(dZ, caches['A' + str(i-1)].T)
        db = 1.0 / m * np.sum(dZ, axis=1, keepdims=True)
        grads['dw' + str(i)] = dW
        grads['db' + str(i)] = db
    return grads
#Update the parameters
def update_parameters(parameters, grads, alpha):
    L = len(parameters) // 2
    for l in range(L):
        parameters['w' + str(l+1)] = parameters['w' + str(l+1)] - alpha * grads['dw' + str(l+1)]
        parameters['b' + str(l+1)] = parameters['b' + str(l+1)] - alpha * grads['db' + str(l+1)]
    return parameters
#Model prediction
def predict(X, parameters):
    A2, caches = forward_propagation(X, parameters)
    temp = A2.shape[1]
    Y_pred = np.zeros([1, temp])
    for i in range(temp):
        if A2[:, i] > 0.5:
            Y_pred[:, i] = 1
        else:
            Y_pred[:, i] = 0
    return Y_pred
#Putting the model together
def model(X, Y, layer_dims, iter_times, alpha, print_flag):
    np.random.seed(1)
    parameters = initialize_parameters(layer_dims)
    for i in range(0, iter_times):
        A, caches = forward_propagation(X, parameters)
        cost = compute_cost(A, Y)
        grads = back_propagation(Y, caches, parameters)
        parameters = update_parameters(parameters, grads, alpha)
        if print_flag and i % 100 == 0:
            print('iteration at ', i, ' cost :', cost)
    return parameters
n=train_data_finalX.shape[0]
layer_dims=[n,20,7,5,1]
parameters=model(train_data_finalX,train_data_finalY,layer_dims,2500,0.0075,True)
y_pred_train=predict(train_data_finalX,parameters)
print('train acc is ',np.mean(y_pred_train == train_data_finalY)*100,'%')
y_pred_test=predict(test_data_finalX,parameters)
print('test acc is ',np.mean(y_pred_test == test_data_finalY)*100,'%')
The output looks like this; later on the cost barely changes, only in the fifth decimal place.
I have already tried increasing the number of iterations and the learning rate, but the problem remains.
iteration at 0 cost : 0.6932015486338629
iteration at 100 cost : 0.6482987506672847
iteration at 200 cost : 0.6443527436694975
iteration at 300 cost : 0.6439059082659386
iteration at 400 cost : 0.6436651460852033
iteration at 500 cost : 0.6431109804509275
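One thing I have not tried yet (just a guess on my part, I am not sure it is related): replacing the fixed 0.01 factor in initialize_parameters with He-style scaling, which I have seen recommended for deeper ReLU networks. A rough, untested sketch of that variant, using the same interface as my initialize_parameters above:

# Hypothetical variant of initialize_parameters with He-style scaling
# (sqrt(2 / fan_in) instead of 0.01); a sketch only, not my current code.
def initialize_parameters_he(layer_dims):
    parameters = {}
    L = len(layer_dims)
    for i in range(1, L):
        scale = np.sqrt(2.0 / layer_dims[i-1])  # He factor for ReLU layers
        parameters['w' + str(i)] = np.random.randn(layer_dims[i], layer_dims[i-1]) * scale
        parameters['b' + str(i)] = np.zeros((layer_dims[i], 1))
    return parameters

Would switching to something like this (or something else entirely) explain why the deeper network stalls at 0.64?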