qq_56900003 2023-12-28 13:59 · acceptance rate: 0%
10 views

Training with MAML, but the accuracy does not change

I want to use ALBERT together with a Siamese network to train a scoring model for subjective (free-text) questions; the Siamese network consists of a bidirectional LSTM and fully connected layers. During training I noticed that the accuracy never improves and stays exactly the same. It feels as if the weights are not being updated, perhaps because the gradients are too small to change them noticeably, or perhaps there is a problem with the training strategy, but I am not sure of the exact cause. Here is the accuracy during training:

[image: accuracy during training]
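
One way to check the suspicion that the weights are not moving is to snapshot the meta-parameters before and after a single meta-update and compare them. A minimal diagnostic sketch (maml, support_x, support_y, query_x, query_y are placeholder names for the objects in my training loop):

import torch

# placeholder names: `maml` is a MetaTask instance, the batch variables
# come from the data loader used in the training loop
before = {n: p.detach().clone() for n, p in maml.net.named_parameters()}
accs = maml(support_x, support_y, query_x, query_y)   # one meta-update
delta = max((p.detach() - before[n]).abs().max().item()
            for n, p in maml.net.named_parameters())
print(f"largest parameter change after the meta step: {delta:.3e}")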

Here is my training code:

# (assumed imports; the original post did not include them)
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import OrderedDict
from torch.autograd import Variable
from transformers import AlbertModel


class MetaTask(nn.Module):
    def __init__(self, args):
        super(MetaTask, self).__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.loss_fn = nn.CrossEntropyLoss()
        self.update_lr = args.update_lr
        self.meta_lr = args.meta_lr
        self.finetunning_lr = args.finetunning_lr
        self.n_way = args.n_way
        self.k_spt = args.k_spt
        self.k_qry = args.k_qry
        self.task_num = args.task_num
        self.update_step = args.update_step
        self.update_step_test = args.update_step_test
        self.net = SubjectiveGradingModel().to(self.device)
        self.meta_optim = optim.Adam(self.net.parameters(), lr=self.meta_lr)


    def forward(self, support_x, support_y, query_x, query_y):
        task_num = len(support_x)
        querysz = len(query_x[0])
        losses_q = [0 for _ in range(self.update_step + 1)]
        corrects = [0 for _ in range(self.update_step + 1)]
        for i in range(task_num):
            self.net.train()
            # 1. run the i-th task and compute loss for k=0
            logits = self.net(support_x[i])
            loss = self.loss_fn(logits, torch.cat(support_y[i], dim=0).long())
            fast_weights = OrderedDict(self.net.named_parameters())
            grad = torch.autograd.grad(loss, fast_weights.values(), retain_graph=True)
            # print parameters whose gradient is None
            # for (name, param), gra in zip(self.net.named_parameters(), grad):
            #     if gra is None:
            #         print("parameter with None gradient:", name)
            fast_weights = OrderedDict(
                (name, param - self.update_lr * grad)
                for ((name, param), grad) in zip(fast_weights.items(), grad)
            )
            # this is the loss and accuracy before first update
            with torch.no_grad():
                self.net.eval()
                logits_q = self.net(query_x[i])
                loss_q = self.loss_fn(logits_q, torch.cat(query_y[i], dim=0).long())
                losses_q[0] += loss_q
                pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
                correct = torch.eq(pred_q, torch.cat(query_y[i], dim=0).long()).sum().item()
                corrects[0] = corrects[0] + correct

            # this is the loss and accuracy after the first update
            with torch.no_grad():
                self.net.eval()
                self.net.load_state_dict(fast_weights, strict=False)
                logits_q = self.net(query_x[i])
                loss_q = self.loss_fn(logits_q, torch.cat(query_y[i], dim=0).long())
                losses_q[1] += loss_q

                pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
                correct = torch.eq(pred_q, torch.cat(query_y[i], dim=0).long()).sum().item()
                corrects[1] = corrects[1] + correct
            self.net.train()
            for k in range(1, self.update_step):
                # 1. run the i-th task and compute loss for k=1~K-1
                self.net.load_state_dict(fast_weights, strict=False)
                logits = self.net(support_x[i])
                loss = self.loss_fn(logits, torch.cat(support_y[i], dim=0).long())
                # 2. compute grad on theta_pi
                fast_weights = OrderedDict(self.net.named_parameters())
                grad = torch.autograd.grad(loss, fast_weights.values(), retain_graph=True)
                # 3. theta_pi = theta_pi - train_lr * grad
                fast_weights = OrderedDict(
                    (name, param - self.update_lr * grad)
                    for ((name, param), grad) in zip(fast_weights.items(), grad)
                )
                self.net.load_state_dict(fast_weights, strict=False)
                logits_q = self.net(query_x[i])
                # loss_q will be overwritten and just keep the loss_q on last update step.
                loss_q = self.loss_fn(logits_q, torch.cat(query_y[i], dim=0).long())
                losses_q[k + 1] += loss_q

                with torch.no_grad():
                    pred_q = F.softmax(logits_q, dim=1).argmax(dim=1)
                    correct = torch.eq(pred_q, torch.cat(query_y[i], dim=0).long()).sum().item()  # .item() returns a Python int
                    corrects[k + 1] = corrects[k + 1] + correct

        loss_q = losses_q[-1] / task_num
        # optimize theta parameters
        self.meta_optim.zero_grad()
        loss_q.backward(retain_graph=True)
        # print('meta update')
        self.meta_optim.step()
        accs = np.array(corrects) / (querysz * task_num)
        return accs
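
For reference, a common pattern in MAML implementations is to keep the fast weights inside the autograd graph by running the forward pass functionally instead of copying them into the module with load_state_dict (load_state_dict copies values in place and drops the graph of the inner updates, so the meta-gradient cannot flow through them). A minimal sketch of that pattern, assuming PyTorch >= 2.0 (torch.func.functional_call); this is not my code above, just the standard scheme for comparison:

from collections import OrderedDict
import torch
from torch.func import functional_call

def inner_loop(net, loss_fn, support_x, support_y, update_lr, update_step):
    # start from the current meta-parameters
    fast_weights = OrderedDict(net.named_parameters())
    for _ in range(update_step):
        # run the module's forward with an explicit parameter dict
        logits = functional_call(net, fast_weights, (support_x,))
        loss = loss_fn(logits, support_y)
        # create_graph=True keeps second-order gradients for the meta-update
        grads = torch.autograd.grad(loss, fast_weights.values(), create_graph=True)
        fast_weights = OrderedDict(
            (name, param - update_lr * g)
            for (name, param), g in zip(fast_weights.items(), grads)
        )
    # evaluate the query set with functional_call(net, fast_weights, (query_x,))
    return fast_weights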

class SubjectiveGradingModel(nn.Module):
    def __init__(self, hidden_size=384):
        super(SubjectiveGradingModel, self).__init__()

        # load the pretrained ALBERT encoder
        self.bert = AlbertModel.from_pretrained('src/datamoudle/model/albert_chinese_small')
        # Siamese network
        self.siamese_network = Siamese(max_length=378, embedding_size=hidden_size)


    def forward(self, input_data, weights=None):
        # split the fields of each sample dict into separate lists
        input_ids_list = [item['input_ids'].squeeze(0).squeeze(0) for item in input_data]
        token_type_ids_list = [item['token_type_ids'].squeeze(0).squeeze(0) for item in input_data]
        attention_mask_list = [item['attention_mask'].squeeze(0).squeeze(0) for item in input_data]
        answer_input_ids_list = [item['answer_input_ids'].squeeze(0).squeeze(0) for item in input_data]
        answer_token_type_ids_list = [item['answer_token_type_ids'].squeeze(0).squeeze(0) for item in input_data]
        answer_attention_mask_list = [item['answer_attention_mask'].squeeze(0).squeeze(0) for item in input_data]

        # convert to PyTorch tensors
        input_ids = torch.stack(input_ids_list)
        token_type_ids = torch.stack(token_type_ids_list)
        attention_mask = torch.stack(attention_mask_list)
        answer_input_ids = torch.stack(answer_input_ids_list)
        answer_token_type_ids = torch.stack(answer_token_type_ids_list)
        answer_attention_mask = torch.stack(answer_attention_mask_list)


        outputs = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state
        cls_output = outputs.pooler_output
        outputs_answer = self.bert(input_ids=answer_input_ids, token_type_ids=answer_token_type_ids, attention_mask=answer_attention_mask)
        pooled_output_answer = outputs_answer.last_hidden_state
        cls_output_answer = outputs_answer.pooler_output

        siamese_output = self.siamese_network(pooled_output, pooled_output_answer, cls_output, cls_output_answer)

        return siamese_output

class LSTMEncoder(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers, bidir, dropout):
        super(LSTMEncoder, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidir = bidir
        self.direction = 2 if self.bidir else 1
        self.dropout = dropout
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.lstm = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size, dropout=self.dropout,
                            num_layers=self.num_layers, bidirectional=self.bidir)

    def initHiddenCell(self, batch_size):
        rand_hidden = Variable(torch.randn(self.direction * self.num_layers, batch_size, self.hidden_size, requires_grad=True)).to(self.device)
        rand_cell = Variable(torch.randn(self.direction * self.num_layers, batch_size, self.hidden_size, requires_grad=True)).to(self.device)
        return rand_hidden, rand_cell

    def forward(self, input, hidden, cell):
        output, (hidden, cell) = self.lstm(input, (hidden, cell))
        return output, hidden, cell
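
For comparison, LSTM initial states are more commonly set to zeros (which is also nn.LSTM's default when no (h0, c0) is passed), whereas re-drawing random states on every forward pass injects extra noise. A minimal standalone sketch of a zero-initialized variant of initHiddenCell (Variable is deprecated in current PyTorch, plain tensors are enough):

import torch

def init_hidden_cell(direction, num_layers, batch_size, hidden_size, device):
    # zero-initialized (h0, c0); omitting the initial state in the nn.LSTM
    # call has the same effect
    shape = (direction * num_layers, batch_size, hidden_size)
    return torch.zeros(shape, device=device), torch.zeros(shape, device=device)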


class Siamese(nn.Module):
    def __init__(self, max_length, embedding_size):
        super(Siamese, self).__init__()
        self.max_length = max_length

        # LSTM encoder shared by both branches
        self.encoder = LSTMEncoder(embed_size=embedding_size, hidden_size=64, num_layers=1, bidir=True, dropout=0.2)

        self.input_dim = 5 * self.encoder.direction * self.encoder.hidden_size

        # classifier head: a stack of linear layers
        self.classifier = nn.Sequential(
            nn.Linear(896, self.input_dim // 2),
            nn.Linear(self.input_dim // 2, 9)
        )




    def forward(self, student_answer_emb, model_answer_emb, v1, v2):
        #model_answer_support = model_answer_emb.repeat(student_answer_emb.size(0), 1, 1)

        # use the [CLS]-token outputs of BERT as the representations
        # v1 = student_answer_emb[:, 0, :]
        # v2 = model_answer_emb[:, 0, :]

        # initialize the LSTM hidden and cell states
        h1, c1 = self.encoder.initHiddenCell(batch_size=student_answer_emb.size(0))
        h2, c2 = self.encoder.initHiddenCell(batch_size=model_answer_emb.size(0))

        # encode both sequences with the LSTM
        _, h1, c1 = self.encoder(student_answer_emb.permute(1, 0, 2), h1, c1)
        _, h2, c2 = self.encoder(model_answer_emb.permute(1, 0, 2), h2, c2)

        # take the LSTM's final hidden state as the sequence representation
        lstm_v1 = h1[-1, :, :]
        lstm_v2 = h2[-1, :, :]

        # concatenate the pooled outputs and the two LSTM encodings
        features = torch.cat((v1, v2, lstm_v1, lstm_v2), 1)
        # feed the features to the classifier
        output = self.classifier(features)
        output = F.softmax(output, dim=1)
        return output
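
A note for context (this is the conventional usage, not what the classifier above does): nn.CrossEntropyLoss already applies log-softmax internally and expects raw logits, so an extra F.softmax in the model's forward is usually left out during training and only applied when probabilities are needed for reporting. A minimal self-contained illustration with made-up shapes:

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(4, 9)                       # e.g. a batch of 4, 9 score classes
targets = torch.randint(0, 9, (4,))
loss = nn.CrossEntropyLoss()(logits, targets)    # log-softmax + NLL happen inside the loss
probs = F.softmax(logits, dim=1)                 # only for inspection / argmax
preds = probs.argmax(dim=1)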




1 answer

  • CSDN-Ada助手 (CSDN-AI official account) 2023-12-28 16:40

    [Related recommendation]



    • The section "3.2 Constructing the time-series inputs and outputs, and standardizing the data" of the blog post 利用LSTM长短期记忆算法分析猪肉价格未来走势 ("Using the LSTM long short-term memory algorithm to analyse the future trend of pork prices") may help with your problem. You can read the excerpt below carefully (a short reshape note follows it) or jump to the source post:
      # parameter initialization
      # time-step length (sliding-window size)
      steps = 30
      # number of features
      features = 1
      # build the time-series inputs and outputs: the previous n values form X,
      # and the (n+1)-th value is the prediction target y
      def create_dataset(data, steps=1):
          dataX, dataY = [], []
          for i in range(len(data)-steps-1):
              a = data[i:(i+steps), 0]
              dataX.append(a)
              dataY.append(data[i + steps, 0])
          return np.array(dataX), np.array(dataY)
      # standardize the data so that differing scales do not distort the prediction

      scaler = MinMaxScaler(feature_range=(0, 1))
      # do not fit_transform on the full data set ---------- data = scaler.fit_transform(data)
      # print(data)
      # split into a training set (70%) and a test set (30%)
      train_size = int(len(data) * 0.7)
      test_size = len(data) - train_size
      train, test = data[0:train_size,:], data[train_size:len(data),:]
      # fit_transform on the training data: fit the scaler, then standardize
      train = scaler.fit_transform(train)
      # the test data may only be transformed, because the test set evaluates the fitted model
      test = scaler.transform(test)


      # build the inputs and outputs of `data`; with a sliding window of length `steps`,
      # the current value is assumed to depend only on the previous `steps` values
      # steps = 10
      trainX, trainY = create_dataset(train, steps)
      testX, testY = create_dataset(test, steps)
      # display(testX, testY)
      # show the result of the univariate split
      # for i in range(len(trainX)):
      #     print(trainX[i], trainY[i])
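
      As a follow-up to the excerpt (an assumed next step, not part of the quoted blog code): recurrent layers such as Keras' LSTM expect input of shape [samples, time steps, features], so the windowed arrays are usually reshaped before training, which is where the otherwise unused features = 1 comes in:

      # assumed follow-up: reshape the windowed data to [samples, steps, features]
      trainX = np.reshape(trainX, (trainX.shape[0], steps, features))
      testX = np.reshape(testX, (testX.shape[0], steps, features))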
      

    If you have already solved this problem, it would be great if you could share your solution as a blog post and put the link in the comments, to help more people ^-^
