Problem description and background
I was pretraining the encoder part of CookGAN, a model that generates food images, but when I run it the loss backpropagation fails with an error I cannot resolve.
The source code is here:
https://github.com/klory/CookGAN
Relevant code
# train_retrieval.py
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch import optim
from tqdm import tqdm
import numpy as np
import os
import pdb
import wandb
from args_retrieval import get_parser
from datasets_retrieval import Dataset, train_transform
from models_retrieval import TextEncoder, ImageEncoder
from triplet_loss import global_loss, TripletLoss
from modules import DynamicSoftMarginLoss
import sys
sys.path.append('../')
from common import param_counter, sample_data
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
"""
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
"""
def create_model(ckpt_args, device='cuda'):
text_encoder = TextEncoder(
# self, data_dir, text_info, hid_dim, emb_dim, z_dim, with_attention, ingr_enc_type
data_dir= 'E:/CookGAN/retrieval_model/models',
emb_dim=ckpt_args.word2vec_dim, # 300
hid_dim=ckpt_args.rnn_hid_dim, # 300
z_dim=ckpt_args.feature_dim, # 1024
# word2vec_file=ckpt_args.word2vec_file,
text_info=ckpt_args.text_info,
with_attention=ckpt_args.with_attention,
ingr_enc_type=ckpt_args.ingrs_enc_type)
image_encoder = ImageEncoder(
z_dim=ckpt_args.feature_dim)
text_encoder, image_encoder = [x.to(device) for x in [text_encoder, image_encoder]]
print('# text_encoder', param_counter(text_encoder.parameters()))
print('# image_encoder', param_counter(image_encoder.parameters()))
if device == 'cuda':
text_encoder, image_encoder = [nn.DataParallel(x) for x in [text_encoder, image_encoder]]
optimizer = torch.optim.Adam([
{'params': text_encoder.parameters()},
{'params': image_encoder.parameters()},
], lr=ckpt_args.lr, betas=(0.5, 0.999))
return text_encoder, image_encoder, optimizer
def load_model(ckpt_path, device='cuda'):
print('load retrieval model from:', ckpt_path)
ckpt = torch.load(ckpt_path)
ckpt_args = ckpt['args']
batch_idx = ckpt['batch_idx']
text_encoder, image_encoder, optimizer = create_model(ckpt_args, device)
if device=='cpu':
text_encoder.load_state_dict(ckpt['text_encoder'])
image_encoder.load_state_dict(ckpt['image_encoder'])
else:
text_encoder.module.load_state_dict(ckpt['text_encoder'])
image_encoder.module.load_state_dict(ckpt['image_encoder'])
optimizer.load_state_dict(ckpt['optimizer'])
return ckpt_args, batch_idx, text_encoder, image_encoder, optimizer
def save_model(args, batch_idx, text_encoder, image_encoder, optimizer, ckpt_path):
print('save retrieval model to:', ckpt_path)
ckpt = {
'args': args,
'batch_idx': batch_idx,
'text_encoder': text_encoder.state_dict(),
'image_encoder': image_encoder.state_dict(),
'optimizer': optimizer.state_dict(),
}
torch.save(ckpt, ckpt_path)
# hinge loss
def compute_loss(txt_feat, img_feat, device='cuda'):
BS = txt_feat.shape[0]
denom = img_feat.norm(p=2, dim=1, keepdim=True) @ txt_feat.norm(p=2, dim=1, keepdim=True).t()
numer = img_feat @ txt_feat.t()
sim = numer / (denom + 1e-12)
margin = 0.3 * torch.ones_like(sim)
mask = torch.eye(margin.shape[0], margin.shape[1]).bool().to(device)
margin.masked_fill_(mask, 0)
pos_sim = (torch.diag(sim) * torch.ones(BS, BS).to(device)).t() # [BS, BS]
loss_retrieve_txt = torch.max(
torch.tensor(0.0).to(device),
margin + sim - pos_sim)
loss_retrieve_img = torch.max(
torch.tensor(0.0).to(device),
margin + sim.t() - pos_sim)
loss = loss_retrieve_img + loss_retrieve_txt
# effective number of pairs is BS*BS-BS; those on the diagonal are never counted and always zero
loss = loss.sum() / (BS*BS-BS) / 2.0
return loss
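# For reference (not in the original script): a quick sanity check of
# compute_loss with random features of a hypothetical shape [4, 1024];
# random vectors have cosine similarity near 0, so the result should be
# a positive scalar close to the 0.3 margin:
#   compute_loss(torch.randn(4, 1024), torch.randn(4, 1024), device='cpu')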
def train(args, start_batch_idx, text_encoder, image_encoder, optimizer, train_loader, device='cuda'):
if args.loss_type == 'hinge':
criterion = compute_loss
elif args.loss_type == 'hardmining+hinge':
triplet_loss = TripletLoss(margin=args.margin)
elif args.loss_type == 'dynamic_soft_margin':
criterion = DynamicSoftMarginLoss(is_binary=False, nbins=args.batch_size // 2)
criterion = criterion.to(device)
#####################
# train
#####################
wandb.init(project="cookgan_retrieval_model")
wandb.config.update(args)
# create process bar
pbar = range(args.batches)
pbar = tqdm(pbar, initial=start_batch_idx, dynamic_ncols=True, smoothing=0.3)
text_encoder.train()
image_encoder.train()
if device == 'cuda':
text_module = text_encoder.module
image_module = image_encoder.module
else:
text_module = text_encoder
image_module = image_encoder
train_loader = sample_data(train_loader)
for batch_idx in pbar:
# test = next(train_loader)
txt, img = next(train_loader)
for i in range(len(txt)):
txt[i] = txt[i].to(device)
img = img.to(device)
# print('shape of txt: ', len(txt))
txt_feat = text_encoder([txt])
img_feat = image_encoder(img)
bs = img.shape[0]
if args.loss_type == 'hinge':
loss = criterion(img_feat, txt_feat, device)
elif args.loss_type == 'hardmining+hinge':
label = list(range(0, bs))
label.extend(label)
label = np.array(label)
label = torch.tensor(label).long().to(device)
loss = global_loss(triplet_loss, torch.cat((img_feat, txt_feat)), label, normalize_feature=True)[0]
elif args.loss_type == 'dynamic_soft_margin':
out = torch.cat((img_feat, txt_feat))
loss = criterion(out)
optimizer.zero_grad() # clear
print(loss)
loss.backward()
# loss.backward(retain_graph=True)
optimizer.step() # update
wandb.log({
'training loss': loss,
'batch_idx': batch_idx
})
if batch_idx % 10_000 == 0:
ckpt_path = f'{wandb.run.dir}/{batch_idx:>08d}.ckpt'
save_model(args, batch_idx, text_module, image_module, optimizer, ckpt_path)
if __name__ == '__main__':
##############################
# setup
##############################
args = get_parser().parse_args()
torch.manual_seed(args.seed)
np.random.seed(args.seed)
torch.backends.cudnn.benchmark = True
device = args.device
##############################
# dataset
##############################
print('loading datasets')
train_set = Dataset(
part='train',
recipe_file=args.recipe_file,
img_dir=args.img_dir,
word2vec_file=args.word2vec_file,
transform=train_transform,
permute_ingrs=args.permute_ingrs)
if args.debug:
print('in debug mode')
train_set = torch.utils.data.Subset(train_set, range(2000))
train_loader = DataLoader(
train_set, batch_size=args.batch_size, shuffle=True,
num_workers=args.workers, pin_memory=True, drop_last=False)
print('train data:', len(train_set), len(train_loader)) # 281161 8787
##########################
# model
##########################
if args.ckpt_path:
ckpt_args, batch_idx, text_encoder, image_encoder, optimizer = load_model(args.ckpt_path, device)
else:
text_encoder, image_encoder, optimizer = create_model(args, device)
batch_idx = 0
train(args, batch_idx, text_encoder, image_encoder, optimizer, train_loader, device='cuda')
# models_retrieval.py
import json
import os
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn.utils import rnn
from torchvision import models
from gensim.models.keyedvectors import KeyedVectors
import pdb
def clean_state_dict(state_dict):
# create new OrderedDict that does not contain `module.`
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
name = k[7:] if k[:min(6,len(k))] == 'module' else k # remove `module.`
new_state_dict[name] = v
return new_state_dict
class AttentionLayer(nn.Module):
def __init__(self, input_dim):
super(AttentionLayer, self).__init__()
self.u = torch.nn.Parameter(torch.randn(input_dim)) # u = [2*hid_dim], a shared contextual vector
# torch.randn returns a tensor filled with samples from the standard normal distribution (mean 0, variance 1)
# torch.nn.Parameter registers the tensor as a learnable parameter of the layer
self.u.requires_grad = True
self.fc = nn.Linear(input_dim, input_dim)
def forward(self, x):
# x = [BS, num_vec, 2*hid_dim]
mask = (x!=0)
# a trick used to find the mask for the softmax
mask = mask[:,:,0].bool()
h = torch.tanh(self.fc(x)) # h = [BS, num_vec, 2*hid_dim]
tmp = h @ self.u # tmp = [BS, num_vec], unnormalized importance
masked_tmp = tmp.masked_fill(~mask, -1e32)
alpha = F.softmax(masked_tmp, dim=1) # alpha = [BS, num_vec], normalized importance
alpha = alpha.unsqueeze(-1) # alpha = [BS, num_vec, 1]
out = x * alpha # out = [BS, num_vec, 2*hid_dim]
out = out.sum(dim=1) # out = [BS, 2*hid_dim]
# pdb.set_trace()
return out
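# Shape example for AttentionLayer (hypothetical sizes):
#   layer = AttentionLayer(input_dim=4)
#   x = torch.randn(2, 3, 4); x[1, 2] = 0  # second sample has one all-zero (padded) row
#   layer(x).shape                         # torch.Size([2, 4]); the padded row gets ~0 weight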
class InstEmbedLayer(nn.Module):
def __init__(self, data_dir, emb_dim):
super(InstEmbedLayer, self).__init__()
self.data_dir = data_dir
path = os.path.join(self.data_dir, 'word2vec_recipes.bin')
# model = KeyedVectors.load_word2vec_format(path, binary=True)
wv = KeyedVectors.load(path, mmap='r')
vec = torch.from_numpy(wv.vectors).float()
# the first three indices have special meanings, see utils.py
emb = nn.Embedding(vec.shape[0]+3, vec.shape[1], padding_idx=0)
emb.weight.data[3:].copy_(vec)
for p in emb.parameters():
p.requires_grad = False
self.embed_layer = emb
print('==> Inst embed layer', emb)
def forward(self, sent_list): # 14557, 300 -> 35549, 300
# sent_list [BS, max_len]
# print(self.embed_layer(sent_list).shape) torch.Size([64, 20, 300])
return self.embed_layer(sent_list) # x=[BS, max_len, emb_dim]
class IngrEmbedLayer(nn.Module):
def __init__(self, data_dir, emb_dim):
super(IngrEmbedLayer, self).__init__()
path = os.path.join(data_dir, 'vocab_ingr.txt')
with open(path, 'r') as f:
num_ingr = len(f.read().split('\n'))
print('num_ingr = ', num_ingr)
# the first three indices have special meanings, see utils.py
emb = nn.Embedding(35549, emb_dim, padding_idx=0)
# emb = nn.Embedding(num_ingr+3, emb_dim, padding_idx=0)
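# note: the repo's version (commented out above) sizes this embedding by the
# ingredient vocab; if any ingredient index is >= num_embeddings, the lookup
# fails on GPU with a device-side assert like the one quoted in train_retrieval.py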
self.embed_layer = emb
print('==> Ingr embed layer', emb)
def forward(self, sent_list): # 1992, 300
# sent_list [BS, max_len] 64, 20
return self.embed_layer(sent_list) # x=[BS, max_len, emb_dim]
class SentEncoder(nn.Module):
def __init__(
self,
data_dir,
emb_dim,
hid_dim,
with_attention=True,
source='inst'):
assert source in ('inst', 'ingr')
super(SentEncoder, self).__init__()
if source=='inst':
self.embed_layer = InstEmbedLayer(data_dir=data_dir, emb_dim=emb_dim)
elif source=='ingr':
self.embed_layer = IngrEmbedLayer(data_dir=data_dir, emb_dim=emb_dim)
self.rnn = nn.LSTM(
input_size=emb_dim,
hidden_size=hid_dim,
bidirectional=True,
batch_first=True)
if with_attention:
self.atten_layer = AttentionLayer(2*hid_dim)
self.with_attention = with_attention
def forward(self, sent_list):
# sent_list [BS, max_len]
x = self.embed_layer(sent_list) # x=[BS, max_len, emb_dim]
# print(sent_list)
# lens = (sent_list==1).nonzero()[:,1] + 1
lens = sent_list.count_nonzero(dim=1) + 1
# IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
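# note: count_nonzero(dim=1) + 1 equals max_len + 1 for any row without
# padding zeros, i.e. one step past the time dimension that
# pack_padded_sequence below can index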
# print(lens.shape)
sorted_len, sorted_idx = lens.sort(0, descending=True) # sorted_idx=[BS], for sorting
_, original_idx = sorted_idx.sort(0, descending=False) # original_idx=[BS], for unsorting
# print(sorted_idx.shape, x.shape)
index_sorted_idx = sorted_idx.view(-1,1,1).expand_as(x) # sorted_idx=[BS, max_len, emb_dim]
sorted_inputs = x.gather(0, index_sorted_idx.long()) # sort by num_words
packed_seq = rnn.pack_padded_sequence(
sorted_inputs, sorted_len.cpu().numpy(), batch_first=True)
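# pack_padded_sequence defaults to enforce_sorted=True, i.e. it expects
# lengths in descending order; hence the manual sort above and the
# unsorting via original_idx after the RNN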
if self.with_attention:
out, _ = self.rnn(packed_seq)
y, _ = rnn.pad_packed_sequence(
out, batch_first=True) # y=[BS, max_len, 2*hid_dim], currently in WRONG order!
unsorted_idx = original_idx.view(-1,1,1).expand_as(y)
output = y.gather(0, unsorted_idx).contiguous() # [BS, max_len, 2*hid_dim], now in correct order
feat = self.atten_layer(output)
else:
_, (h,_) = self.rnn(packed_seq) # [2, BS, hid_dim], currently in WRONG order!
h = h.transpose(0,1) # [BS, 2, hid_dim], still in WRONG order!
# unsort the output
unsorted_idx = original_idx.view(-1,1,1).expand_as(h)
output = h.gather(0, unsorted_idx).contiguous() # [BS, 2, hid_dim], now in correct order
feat = output.view(output.size(0), output.size(1)*output.size(2)) # [BS, 2*hid_dim]
# print('sent', feat.shape) # [BS, 2*hid_dim]
return feat
class SentEncoderFC(nn.Module):
def __init__(
self,
data_dir,
emb_dim,
hid_dim,
with_attention=True,
source='inst'):
assert source in ('inst', 'ingr')
super(SentEncoderFC, self).__init__()
if source=='inst':
self.embed_layer = InstEmbedLayer(data_dir=data_dir, emb_dim=emb_dim)
elif source=='ingr':
self.embed_layer = IngrEmbedLayer(data_dir=data_dir, emb_dim=emb_dim)
self.fc = nn.Linear(emb_dim, 2*hid_dim)
if with_attention:
self.atten_layer = AttentionLayer(2*hid_dim)
self.with_attention = with_attention
def forward(self, sent_list):
# sent_list [BS, max_len]
x = self.embed_layer(sent_list) # x=[BS, max_len, emb_dim]
x = self.fc(x) # [BS, max_len, 2*hid_dim]
if not self.with_attention:
feat = x.sum(dim=1) # [BS, 2*hid_dim]
else:
feat = self.atten_layer(x) # [BS, 2*hid_dim]
# print('ingredients', feat.shape)
return feat
class DocEncoder(nn.Module):
def __init__(self, sent_encoder, hid_dim, with_attention):
super(DocEncoder, self).__init__()
self.sent_encoder = sent_encoder
self.rnn = nn.LSTM(
input_size=2*hid_dim,
hidden_size=hid_dim,
bidirectional=True,
batch_first=True)
self.atten_layer_sent = AttentionLayer(2*hid_dim)
self.with_attention = with_attention
def forward(self, doc_list):
# doc_list=[BS, max_len, max_len]
embs = []
lens = []
for doc in doc_list:
len_doc = doc.nonzero()[:,0].max().item() + 1
lens.append(len_doc)
emb_doc = self.sent_encoder(doc[:len_doc]) # [?, 2*hid_dim]
embs.append(emb_doc)
embs = sorted(embs, key=lambda x: -x.shape[0]) # [BS, [?, 2*hid_dim]]
packed_seq = rnn.pack_sequence(embs)
lens = torch.tensor(lens).long().to(embs[0].device)
_, sorted_idx = lens.sort(0, descending=True) # sorted_idx=[BS], for sorting
_, original_idx = sorted_idx.sort(0, descending=False) # original_idx=[BS], for unsorting
if not self.with_attention:
_, (h,_) = self.rnn(packed_seq) # [2, BS, hid_dim], currently in WRONG order!
h = h.transpose(0,1) # [BS, 2, hid_dim], still in WRONG order!
# unsort the output
unsorted_idx = original_idx.view(-1,1,1).expand_as(h)
output = h.gather(0, unsorted_idx).contiguous() # [BS, 2, hid_dim], now in correct order
feat = output.view(output.size(0), output.size(1)*output.size(2)) # [BS, 2*hid_dim]
else:
out, _ = self.rnn(packed_seq)
y, _ = rnn.pad_packed_sequence(
out, batch_first=True) # y=[BS, max_valid_len, 2*hid_dim], currently in WRONG order!
unsorted_idx = original_idx.view(-1,1,1).expand_as(y)
output = y.gather(0, unsorted_idx).contiguous() # [BS, max_valid_len, 2*hid_dim], now in correct order
feat = self.atten_layer_sent(output)
# print('instructions', feat.shape)
return feat
class TextEncoder(nn.Module):
def __init__(
self, data_dir, text_info, hid_dim, emb_dim, z_dim, with_attention, ingr_enc_type):
super(TextEncoder, self).__init__()
self.text_info = text_info
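# text_info is a 3-character flag string: '111' uses title + ingredients +
# instructions, '010' uses ingredients only (see the branches below)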
if self.text_info == '111':
self.sent_encoder = SentEncoder(
data_dir,
emb_dim, # 300
hid_dim, # 300
with_attention,
source='inst')
self.doc_encoder = DocEncoder(
self.sent_encoder,
hid_dim,
with_attention
)
if ingr_enc_type=='rnn':
self.ingr_encoder = SentEncoder(
data_dir,
emb_dim,
hid_dim,
with_attention,
source='ingr')
elif ingr_enc_type == 'fc':
self.ingr_encoder = SentEncoderFC(
data_dir,
emb_dim,
hid_dim,
with_attention,
source='ingr')
self.bn = nn.BatchNorm1d((2+2+2)*hid_dim)
self.fc = nn.Linear((2+2+2)*hid_dim, z_dim)
elif self.text_info == '010':
if ingr_enc_type=='rnn':
self.ingr_encoder = SentEncoder(
data_dir,
emb_dim,
hid_dim,
with_attention,
source='ingr')
elif ingr_enc_type == 'fc':
self.ingr_encoder = SentEncoderFC(
data_dir,
emb_dim,
hid_dim,
with_attention,
source='ingr')
self.bn = nn.BatchNorm1d(2*hid_dim)
self.fc = nn.Linear(2*hid_dim, z_dim)
def forward(self, recipe_list):
title_list = recipe_list[0][0]
ingredients_list = recipe_list[0][2]
instructions_list = recipe_list[0][4]
# title_list, ingredients_list, instructions_list = recipe_list
if self.text_info == '111':
feat_title = self.sent_encoder(title_list)
feat_ingredients = self.ingr_encoder(ingredients_list)
feat_instructions = self.doc_encoder(instructions_list)
feat = torch.cat([feat_title, feat_ingredients, feat_instructions], dim=1)
feat = torch.tanh(self.fc(self.bn(feat)))
elif self.text_info == '010':
feat_ingredients = self.ingr_encoder(ingredients_list)
feat = torch.tanh(self.fc(self.bn(feat_ingredients)))
# print('recipe', feat.shape)
return feat
class Resnet(nn.Module):
def __init__(self, ckpt_path=None):
super(Resnet, self).__init__()
resnet = models.resnet50(pretrained=False)
num_feat = resnet.fc.in_features
resnet.fc = nn.Linear(num_feat, 101)
if ckpt_path:
resnet.load_state_dict(clean_state_dict(torch.load(ckpt_path)))
modules = list(resnet.children())[:-1] # we do not use the last fc layer.
self.encoder = nn.Sequential(*modules)
def forward(self, image_list):
BS = image_list.shape[0]
return self.encoder(image_list).view(BS, -1)
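# resnet50's pooled features are 2048-d, matching the BatchNorm1d(2048) at the
# start of ImageEncoder's bottleneck below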
class ImageEncoder(nn.Module):
def __init__(self, z_dim, ckpt_path=None):
super(ImageEncoder, self).__init__()
self.resnet = Resnet(ckpt_path)
self.bottleneck = nn.Sequential(
nn.BatchNorm1d(2048),
nn.Linear(2048, z_dim),
nn.Tanh()
)
def forward(self, image_list):
feat = self.resnet(image_list)
feat = self.bottleneck(feat)
# print('image', feat.shape)
return feat
TextEncoder has been modified: because the input recipe_list has length 7, the original unpacking was replaced with direct indexing of the title, ingredients, and instructions:
# title_list, ingredients_list, instructions_list = recipe_list
title_list = recipe_list[0][0]
ingredients_list = recipe_list[0][2]
instructions_list = recipe_list[0][4]
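One way to see what each of the seven elements holds (a debugging sketch, assuming the train_loader built in train_retrieval.py):
import torch
txt, img = next(iter(train_loader))
for i, t in enumerate(txt):
    print(i, t.shape if torch.is_tensor(t) else type(t))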
Output and error message
0%| | 0/400000 [00:00<?, ?it/s]tensor(0.5797, device='cuda:0', grad_fn=<MeanBackward0>)
save retrieval model to: E:\CookGAN\retrieval_model\wandb\run-20220824_153726-s0ojwz5g\files/00000000.ckpt
0%| | 1/400000 [00:08<930:13:11, 8.37s/it]tensor(0.4852, device='cuda:0', grad_fn=<MeanBackward0>)
Traceback (most recent call last):
File "E:/CookGAN/retrieval_model/train_retrieval.py", line 206, in <module>
train(args, batch_idx, text_encoder, image_encoder, optimizer, train_loader, device='cuda')
File "E:/CookGAN/retrieval_model/train_retrieval.py", line 153, in train
loss.backward()
File "C:\Users\10706\AppData\Local\Programs\Python\Python37\lib\site-packages\torch\_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "C:\Users\10706\AppData\Local\Programs\Python\Python37\lib\site-packages\torch\autograd\__init__.py", line 175, in backward
allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass
IndexError: select(): index 20 out of range for tensor of size [20, 32, 300] at dimension 0
Here the three dimensions of [20, 32, 300] are:
- the maximum length of the embedding input
- the batch size
- the embedding dimension (emb_dim)
I tried shrinking the batch size while keeping everything else unchanged; the number of batches that run before the crash grows as the batch size shrinks, so I can only guess the problem is in the input data...
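If it really is the input data, the lens computation in SentEncoder.forward looks suspicious: for a row of sent_list with no padding zeros, count_nonzero(dim=1) + 1 gives max_len + 1, which matches the out-of-range index 20 for the size-20 time dimension in the traceback. A minimal check plus one possible guard (a sketch with hypothetical sizes taken from the error message):
import torch

sent_list = torch.randint(1, 100, (32, 20))   # hypothetical batch, no padding zeros
lens = sent_list.count_nonzero(dim=1) + 1
print(lens.max().item(), sent_list.shape[1])  # 21 vs. max_len 20: one past the end
lens = lens.clamp(max=sent_list.shape[1])     # possible guard before pack_padded_sequence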
Can anyone help?