在深度学习IMDB情感分析中,我搭建了LSTM神经网络,但是预测值全为0,检查了好几天没有发现是什么原因。。
我认为数据预处理的部分应该是没有太大问题的,应该是神经网络搭建出了问题,但是我不清楚问题出在哪
以下是代码,请大家帮忙查看是哪里出了问题:
import torch
import torchtext
import numpy as np
import matplotlib.pyplot as plt
import os
import nltk
import torch.utils.data as data
import torch.optim as optim
from torch import nn
#加载数据集并分词
def openFiles(path,stopwords):#取前1000个数据训练,标签正面情绪为1负面情绪为0
files=os.listdir(path)[:1000]
res=[]
label = 0 if path.split('/')[-1]=='neg' else 1
for file in files:
'''添加标签,分词并去除停用词后加入res中'''
temp=[label]
with open(path+'/'+file,encoding='utf-8') as f:
words=nltk.word_tokenize(f.read())
words=[word for word in words if word.lower() not in stopwords]
temp.append(words)
res.append(temp)
return res
stopwords = set(nltk.corpus.stopwords.words('english'))#加载停用词
train=openFiles('./aclImdb/train/neg',stopwords)+openFiles('./aclImdb/train/pos',stopwords)
test=openFiles('./aclImdb/test/neg',stopwords)+openFiles('./aclImdb/test/pos',stopwords)
print('done')
#词序列化并填充
temp={}
word_dic={}
word_dic['<PAD>']=0
word_dic['<UNK>']=1
for i in train:
for word in i[1]:
if word not in temp and len(word)>1 and word.isalnum() and word!='br':
temp[word]=1
elif word in temp and len(word)>1 and word.isalnum() and word!='br':
temp[word]+=1
temp=dict(sorted(temp.items(),key=lambda x:x[1],reverse=True))
for i in temp.keys():
if temp[i]>10:
word_dic[i]=len(word_dic)
max_len=50
print(f'len dic={len(word_dic)}')
def padding(data,pad_num):
for i in range(len(data)):
if len(data[i][1])<pad_num:
data[i][1]+=(['<PAD>']*(pad_num-len(data[i][1])))
else:
data[i][1]=data[i][1][:pad_num]
return data
train_data=padding(train,max_len)
test_data=padding(test,max_len)
print('padding done')
def word2num(data,word_dic):
for i in range(len(data)):
for j in range(len(data[i][1])):
if data[i][1][j] in word_dic.keys():
data[i][1][j]=word_dic[data[i][1][j]]
else:
data[i][1][j]=word_dic['<UNK>']
return data
train_data=word2num(train_data,word_dic)
test_data=word2num(test_data,word_dic)
print('word2num done')
#划分数据集
class MyDataset(data.Dataset):
def __init__(self, data):
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, index):
label, feature = self.data[index]
return torch.tensor(feature), torch.tensor(label)
train_dataset = MyDataset(train_data)
test_dataset = MyDataset(test_data)
#建立模型并训练
class ComEmoClass(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embedding_dim,padding_idx=0)
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
bidirectional=bidirectional, dropout=dropout)
self.fc = nn.Linear(hidden_dim*2, output_dim)
def forward(self, text):
embedded = self.embedding(text).squeeze(1)
print(embedded.shape)
output,_= self.lstm(embedded)
output=output[:,-1,:]
return self.fc(output.squeeze(0))
vocab_size = len(word_dic)
embedding_dim = 10
hidden_dim = 50
output_dim = 1
n_layers = 1
bidirectional = True
# 初始化模型
model = ComEmoClass(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional,dropout=0)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# 实例化数据加载器
train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=True)
num_epochs=10
for epoch in range(num_epochs):
for i, (inputs, labels) in enumerate(train_loader):
# 将数据送入模型进行训练
labels=labels.unsqueeze(dim=1).float()
#print(labels)
inputs=inputs.unsqueeze(dim=1)#.permute(0,2,1)
#print(inputs.shape)
outputs = model(inputs)
print(outputs)
loss = criterion(outputs, labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()
# 打印每一次迭代的训练结果
print('Epoch [{}/{}], Step [{}/{}], Loss: {}'.format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))
训练跑出来这样的结果: