I downloaded a deep-learning project for multi-task loss-weight gradient normalization from GitHub: https://github.com/brianlan/pytorch-grad-norm
The author appears to have implemented it following the paper "GradNorm: Gradient Normalization for Adaptive Loss Balancing in Deep Multitask Networks".
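For reference, my reading of the paper (this paraphrase is mine, not taken from the repo): the task weights w_i(t) are updated by minimizing

    L_grad(t) = sum_i | G_W^(i)(t) - G_bar_W(t) * r_i(t)^alpha |

where G_W^(i)(t) = || grad_W (w_i(t) * L_i(t)) || is the gradient norm of the weighted task-i loss at the last shared layer W, G_bar_W(t) is the mean of these norms over tasks, r_i(t) is the inverse training rate, and the target G_bar_W(t) * r_i(t)^alpha is treated as a constant when differentiating.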
While debugging, I ran into trouble with the automatic differentiation on the last line of the snippet below:
for t in range(n_iterations):
    # get a single batch
    for (it, batch) in enumerate(data_loader):
        # get the X and the targets values
        X = batch[0]
        ts = batch[1]
        if torch.cuda.is_available():
            X = X.cuda()
            ts = ts.cuda()

        # evaluate each task loss L_i(t)
        task_loss = model(X, ts)  # this will do a forward pass in the model and will also evaluate the loss
        # compute the weighted loss w_i(t) * L_i(t)
        weighted_task_loss = torch.mul(model.weights, task_loss)
        # initialize the initial loss L(0) if t=0
        if t == 0:
            # set L(0)
            if torch.cuda.is_available():
                initial_task_loss = task_loss.data.cpu()
            else:
                initial_task_loss = task_loss.data
            initial_task_loss = initial_task_loss.numpy()

        # get the total loss
        loss = torch.sum(weighted_task_loss)
        # clear the gradients
        optimizer.zero_grad()
        # do the backward pass to compute the gradients for the whole set of weights
        # this is equivalent to computing each \nabla_W L_i(t)
        loss.backward(retain_graph=True)

        # set the gradients of w_i(t) to zero because these gradients have to be updated using the GradNorm loss
        #print('Before turning to 0: {}'.format(model.weights.grad))
        model.weights.grad.data = model.weights.grad.data * 0.0
        #print('Turning to 0: {}'.format(model.weights.grad))

        # switch for each weighting algorithm:
        # --> grad norm
        if args.mode == 'grad_norm':
            # get layer of shared weights
            W = model.get_last_shared_layer()
            print(W.parameters())

            # get the gradient norms for each of the tasks
            # G^{(i)}_w(t)
            norms = []
            for i in range(len(task_loss)):
                # get the gradient of this task loss with respect to the shared parameters
                gygw = torch.autograd.grad(task_loss[i], W.parameters(), retain_graph=True)
                # compute the norm
                norms.append(torch.norm(torch.mul(model.weights[i], gygw[0])))
            norms = torch.stack(norms)
            #print('G_w(t): {}'.format(norms))

            # compute the inverse training rate r_i(t)
            # \tilde{L}_i
            if torch.cuda.is_available():
                loss_ratio = task_loss.data.cpu().numpy() / initial_task_loss
            else:
                loss_ratio = task_loss.data.numpy() / initial_task_loss
            # r_i(t)
            inverse_train_rate = loss_ratio / np.mean(loss_ratio)
            #print('r_i(t): {}'.format(inverse_train_rate))

            # compute the mean norm \tilde{G}_w(t)
            if torch.cuda.is_available():
                mean_norm = np.mean(norms.data.cpu().numpy())
            else:
                mean_norm = np.mean(norms.data.numpy())
            #print('tilde G_w(t): {}'.format(mean_norm))

            # compute the GradNorm loss
            # this term has to remain constant
            constant_term = torch.tensor(mean_norm * (inverse_train_rate ** args.alpha), requires_grad=False)
            if torch.cuda.is_available():
                constant_term = constant_term.cuda()
            #print('Constant term: {}'.format(constant_term))
            # this is the GradNorm loss itself
            grad_norm_loss = torch.tensor(torch.sum(torch.abs(norms - constant_term)))
            #print('GradNorm loss {}'.format(grad_norm_loss))

            # compute the gradient for the weights
            model.weights.grad = torch.autograd.grad(grad_norm_loss, model.weights)[0]
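Before getting to the errors: to check my understanding of what that last line expects, I wrote a tiny standalone sanity check (nothing below comes from the repo, the variable names are my own). torch.autograd.grad(output, inputs) only works when output was produced by differentiable operations on inputs:

import torch

# standalone check: autograd.grad needs the output to be connected to the inputs
w = torch.ones(3, requires_grad=True)
out = torch.sum(torch.abs(w * 2.0))    # built from differentiable ops on w
print(torch.autograd.grad(out, w))     # works: (tensor([2., 2., 2.]),)

rewrapped = torch.tensor(out)          # copies the value but drops the grad_fn
print(rewrapped.requires_grad)         # False, i.e. detached from the graph
# torch.autograd.grad(rewrapped, w)    # raises the same RuntimeError I describe below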
At first this raised RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn. After checking, I guessed that grad_norm_loss had requires_grad=False, so I set it to True. But then it raised RuntimeError: One of the differentiated Tensors appears to not have been used in the graph. Set allow_unused=True if this is the desired behavior. So I also added allow_unused=True, which means the last line ended up like this:
model.weights.grad = torch.autograd.grad(
    Variable(torch.tensor(torch.sum(torch.abs(norms - constant_term))), requires_grad=True),
    model.weights, allow_unused=True)[0]
But the gradient that comes back is None, so model.weights never gets updated. I can't figure out what is going wrong; I'd really appreciate it if someone could take a look.
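My current guess, which I would love someone to confirm or correct: torch.tensor(...) around the sum creates a brand-new leaf tensor with no grad_fn, so even with requires_grad=True it has no path back to model.weights, and allow_unused=True then just suppresses the error and returns None. If that is right, something like the sketch below (keeping grad_norm_loss as a plain expression, with norms and constant_term exactly as computed in the listing above) should keep the graph intact, but is this actually the correct fix?

# tentative rewrite, assuming the torch.tensor() wrapper is what breaks the graph
grad_norm_loss = torch.sum(torch.abs(norms - constant_term))  # no re-wrapping
model.weights.grad = torch.autograd.grad(grad_norm_loss, model.weights)[0]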