我在用python实现SVRG算法的时候同时和SGD进行比较,在网络结构与激活函数相同的情况下SGD可以收敛,为什么SVRG不能收敛,我确定是按照论文上的算法进行编写,并且与多人的程序(虽然那些程序从来没有运行成功过)进行比较,但是仍然没有用我都不知道怎么解决了
import numpy as np
for i in range(max_iterations):
batch_mask = np.random.choice(train_size, batch_size)
x_batch = x_train[batch_mask]
t_batch = t_train[batch_mask]
"""随机梯度下降法(Stochastic Gradient Descent)"""
grads = networks['SGD'].gradient(x_batch, t_batch)
optimizers['SGD'].update(networks['SGD'].params, grads)
loss = networks['SGD'].loss(x_batch, t_batch) #获得训练误差
train_loss['SGD'].append(loss) #存储训练误差
maxiter = batch_size*2
"""SVRG"""
fullgrad = networks['SVRG'].gradient(x_batch, t_batch)
w = copy.deepcopy(networks['SVRG'].params)
wtemp = copy.deepcopy(w)
for j in range(100):
index = np.random.choice(batch_size, 1)
x_index = x_batch[index]
t_index = t_batch[index]
networks['SVRG'].params = copy.deepcopy(wtemp)
g1 = networks['SVRG'].gradient(x_index, t_index)
networks['SVRG'].params = copy.deepcopy(w)
g2 = networks['SVRG'].gradient(x_index, t_index)
optimizers['SVRG'].update(networks['SVRG'].params, g1, g2, fullgrad) # 对参数进行更新
#params[key] = params[key] - self.lr * (g1[key] - (g2[key] - fullgrad[key]))
wtemp = copy.deepcopy(networks['SVRG'].params)
w = copy.deepcopy(networks['SVRG'].params)
loss = networks['SVRG'].loss(x_batch, t_batch) # 获得SVRG训练误差
train_loss['SVRG'].append(loss) # 存储SVRG训练误差
能有人帮我彻底解决吗,急需