用深度学习lstm模型做两类微博文本数据的情感分析,模型训练时为什么准确率一直在50%上下波动,损失值在0.69?下面是定义的模型:
class SentimentNet(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout, pad_idx):
super(SentimentNet, self).__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
self.fc = nn.Linear(hidden_dim, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, text, hidden):
embedded = self.embedding(text)
lstm_output, hidden = self.lstm(embedded, hidden)
avg_pool = lstm_output.mean(dim=1)
out = self.dropout(avg_pool)
out = self.fc(out)
sig_out = torch.sigmoid(out)
return sig_out, hidden
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
hidden = (weight.new(1, batch_size, self.lstm.hidden_size).zero_(),
weight.new(1, batch_size, self.lstm.hidden_size).zero_())
return hidden
def load_pretrained_embeddings(self, word2vec_model, word2idx):
with open('word2idx.json', 'r', encoding='utf-8') as f:
word2idx = json.load(f)
pretrained_weights = torch.zeros(len(word2idx), self.embedding.embedding_dim)
for idx, word in word2idx.items():
word = word2idx.get(word)
if word in word2vec_model:
pretrained_weights[idx, :] = torch.from_numpy(word2vec_model[word])
self.embedding.weight.data.copy_(pretrained_weights)
self.embedding.weight.requires_grad = False
**以下是做的数据处理部分:**
# 1. 读取数据
df = pd.read_csv('data/raw_data/weibo_senti_100k.csv')
# 2. 去除缺失值和重复数据
df.dropna(inplace=True) # 删除包含NaN的行
df.drop_duplicates(inplace=True) # 删除重复行
# 2. 文本清洗
def clean_text(text):
# 去除HTML标签(如果有的话,可以使用正则表达式或第三方库如BeautifulSoup)
text = re.sub('<[^>]*>', '', text)
# 去除特殊字符和标点符号(可以自定义需要保留的字符)
text = re.sub(r'[^\u4e00-\u9fa5\w\s]', '', text) # 只保留中文字符、字母、数字和空格
# 去除多余空格和换行符
text = text.replace('\n', '').replace('\r', '').replace(' ', ' ').strip()
if '广告' in text or '推销' in text:
return None
return text
df['review'] = df['review'].apply(clean_text)
df = df.dropna(subset=['review']) # 再次删除因为清洗而出现的NaN值
# 加载停用词列表
with open('data/tool/stoplist.txt', 'r', encoding='utf-8') as f:
stopwords = set(f.read().splitlines())
# 去除停用词的函数
def remove_stopwords(text):
words = [word for word in jieba.cut(text) if word not in stopwords]
return words
df['review'] = df['review'].apply(remove_stopwords)
# 4. 构建词汇表
max_words = 3000 # 假设我们只保留最常见的3000个词
tokenizer = Tokenizer(num_words=max_words, filters='') # filters='' 是为了防止jieba分词后的空格被过滤掉
tokenizer.fit_on_texts(df['review'].apply(lambda x: ' '.join(x)))
# 5. 文本数值化
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(df['review'].apply(lambda x: ' '.join(x)))
# 6. 序列填充或截断
max_sequence_length = 64 # 假设所有文本序列都将被截断或填充到这个长度
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)
texts = np.array(padded_sequences)
labels = np.array(df['label'])
# 保存到文件
np.save('data/npy_data/processed_texts.npy', texts)
np.save('data/npy_data/processed_labels.npy', labels)
# 已经下载了Word2Vec模型
word2vec_model = KeyedVectors.load_word2vec_format('path/word2vec/word2vec_779845.bin', binary=True)
# 使用defaultdict来构建word2idx字典
word2idx = defaultdict(lambda: len(word2idx))
word2idx = {'<pad>': 0,'<UNK>':1}
# 遍历分词后的数据,并将单词添加到word2idx字典中
for sentence in padded_sequences:
for word in sentence:
word2idx[word] = len(word2idx)
# 获取词汇表大小(即单词到索引映射中的单词数量)
vocab_size = len(word2idx)
word2idx_as_python_ints = {str(k): v for k, v in word2idx.items()}
# 将 word2idx 字典保存到 JSON 文件中
with open('word2idx.json', 'w', encoding='utf-8') as f:
json.dump(word2idx_as_python_ints, f, ensure_ascii=False, indent=4)
**下面是进行模型训练的部分:**
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader,random_split, TensorDataset
from sklearn.model_selection import train_test_split
from model import SentimentNet
from gensim.models import KeyedVectors
from model_2 import SentimentNet
import json
import pandas as pd
with open('word2idx.json', 'r', encoding='utf-8') as f:
word2idx = json.load(f)
# 加载预处理后的数据
np_texts = np.load('data/npy_data/processed_texts.npy')
np_labels = np.load('data/npy_data/processed_labels.npy')
# 将NumPy数组转换为PyTorch张量
texts = torch.from_numpy(np_texts).long() # 假设texts是整数索引
labels = torch.from_numpy(np_labels).float() # 假设labels是浮点数
# 创建一个TensorDataset
dataset = TensorDataset(texts, labels)
# 数据集划分逻辑
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# 使用random_split划分数据集
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
# 加载预训练的Word2Vec模型
word2vec_model = KeyedVectors.load_word2vec_format('path/word2vec/word2vec_779845.bin', binary=True)
vocab_size = len(word2idx) # 词汇表大小
embedding_dim = 400 # 嵌入维度
hidden_dim = 128 # LSTM隐藏层维度
output_dim = 1 # 输出维度
dropout = 0.5 # dropout率
pad_idx = word2idx['<pad>']
# 初始化模型
model = SentimentNet(vocab_size, embedding_dim, hidden_dim, output_dim, dropout, pad_idx, bn=True)
# 加载预训练的词嵌入
model.load_pretrained_embeddings(word2vec_model, word2idx)
# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# 训练模型
num_epochs = 10
for epoch in range(num_epochs):
model.train() # 设置模型为训练模式
running_loss = 0.0
corrects = 0
total = 0
for texts, labels in train_loader:
# 初始化隐藏状态
hidden = model.init_hidden(texts.size(0))
# 前向传播
outputs, hidden = model(texts, hidden)
# 阈值化预测
preds = (outputs.squeeze() >= 0.5).float()
# 计算损失
loss = criterion(outputs.squeeze(), labels)
# 反向传播和优化
optimizer.zero_grad()
loss.backward()
optimizer.step()
# 累积统计信息
running_loss += loss.item() * texts.size(0)
corrects += torch.sum(preds == labels.data)
total += labels.size(0)
# 计算epoch的平均损失和准确率
epoch_loss = running_loss / total
epoch_acc = corrects.double() / total
# 打印统计信息
print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Acc: {epoch_acc * 100:.2f}%')
打印出的结果为什么准确率一直在50%上下波动,损失值在0.69?(下图是打印结果)怎么解决这个问题呢?要修改上面的代码吗?救命!快来个人帮帮我吧!非常感谢!(我的原始数据标签为1的有59993条,标签为0的有59995条,经数据预处理后的标签为1的数据数量: 59650,标签为0的数据数量: 59662)