While training an LSTM classifier, train_accuracy and val_accuracy both reached 0.95, but when I load the saved model afterwards and run it over the very training set I trained on, accuracy drops to 0.55. If this were overfitting, shouldn't accuracy on the training set be high, not low?
This is the training script:
import sys
sys.path.append('../input')
import torch
import torch.optim as optim
import torch.nn as nn
from bighomework.bighomework.data_utils import load_data, TextDataset, collate_fn
from bighomework.bighomework.model import LSTMClassifier
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
from bighomework.bighomework.draw_picture import draw_val_accuracies, draw_train_accuracies, draw_train_losses, draw_val_losses
from sklearn.model_selection import train_test_split
import pickle
# Hyperparameters
embedding_size = 100
hidden_size = 256
output_size = 2
num_layers = 2
dropout = 0.5
batch_size = 128
epochs = 3
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')
# Load the data
train_texts, train_labels = load_data('/kaggle/input/datasets4/datasets4/full_ai.txt',
                                      '/kaggle/input/datasets4/datasets4/full_human.txt')

# Simple whitespace tokenizer
def tokenizer(text):
    return text.split()

# Build the vocabulary over the full corpus (before the train/val split)
def yield_tokens(data_iter, tokenizer):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_texts, tokenizer), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)
with open('/kaggle/working/vocab.pkl', 'wb') as f:
    pickle.dump(vocab, f)
# Split into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2, random_state=42)

# Datasets and data loaders for training and validation
train_dataset = TextDataset(train_texts, train_labels, vocab, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)  # shuffle training batches each epoch
val_dataset = TextDataset(val_texts, val_labels, vocab, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
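# Note: TextDataset and collate_fn come from data_utils, which is not shown here.
# Judging by how the loop below unpacks each batch, collate_fn presumably pads a
# batch to a common length and returns (padded_texts, labels, lengths). A
# hypothetical sketch of such a function (an assumption, not the actual
# data_utils code):
#
#   def collate_fn(batch):
#       texts, labels = zip(*batch)                                  # batch of (tensor, label) pairs
#       lengths = torch.tensor([len(t) for t in texts])              # true lengths before padding
#       padded = nn.utils.rnn.pad_sequence(texts, batch_first=True)  # pad to the longest sequence
#       return padded, torch.tensor(labels), lengths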
# Model, loss, and optimizer
model = LSTMClassifier(vocab_size, embedding_size, hidden_size, output_size, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Training and validation loop
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []
print(f'Number of batches per epoch: {len(train_loader)}')
for epoch in range(epochs):
    # Training phase
    model.train()
    epoch_train_loss = 0
    correct_train_predictions = 0
    total_train_samples = 0
    for texts, labels, lengths in train_loader:
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
        optimizer.zero_grad()  # clear gradients from the previous step
        predictions = model(texts, lengths).squeeze(1)  # forward pass
        loss = criterion(predictions, labels)  # loss between predictions and labels
        loss.backward()  # backpropagate to compute gradients
        # Clip gradients after backward() and before the update, to prevent exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()  # update the parameters
        epoch_train_loss += loss.item()
        _, predicted_labels = torch.max(predictions, 1)
        correct_train_predictions += (predicted_labels == labels).sum().item()
        total_train_samples += labels.size(0)
    avg_train_loss = epoch_train_loss / len(train_loader)
    train_accuracy = correct_train_predictions / total_train_samples
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)
    print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}')

    # Validation phase
    model.eval()
    epoch_val_loss = 0
    correct_val_predictions = 0
    total_val_samples = 0
    with torch.no_grad():
        for texts, labels, lengths in val_loader:
            texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
            predictions = model(texts, lengths).squeeze(1)
            loss = criterion(predictions, labels)
            epoch_val_loss += loss.item()
            _, predicted_labels = torch.max(predictions, 1)
            correct_val_predictions += (predicted_labels == labels).sum().item()
            total_val_samples += labels.size(0)
    avg_val_loss = epoch_val_loss / len(val_loader)
    val_accuracy = correct_val_predictions / total_val_samples
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)
    print(f'Epoch {epoch + 1}/{epochs}, Val Loss: {avg_val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
# Save the model
torch.save(model.state_dict(), '/kaggle/working/lstm_classifier6.pth')

# Plot loss and accuracy curves
draw_train_losses(epochs, train_losses)
draw_val_losses(epochs, val_losses)
draw_train_accuracies(epochs, train_accuracies)
draw_val_accuracies(epochs, val_accuracies)
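A quick way to rule out a corrupted checkpoint is to reload the weights just saved and re-run the same batched evaluation on train_loader in the same session. A minimal sketch, assuming the variables defined above are still in scope; if it prints roughly the 0.95 seen during training, the checkpoint is fine and the discrepancy must come from the detection script:

check_model = LSTMClassifier(vocab_size, embedding_size, hidden_size, output_size, num_layers, dropout).to(device)
check_model.load_state_dict(torch.load('/kaggle/working/lstm_classifier6.pth', map_location=device))
check_model.eval()
correct, total = 0, 0
with torch.no_grad():
    for texts, labels, lengths in train_loader:
        texts, labels, lengths = texts.to(device), labels.to(device), lengths.to(device)
        predictions = check_model(texts, lengths).squeeze(1)
        correct += (predictions.argmax(1) == labels).sum().item()
        total += labels.size(0)
print('Reloaded checkpoint, accuracy on train_loader:', correct / total)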
This is the detection script:
import sys
sys.path.append('../input')
import torch
from bighomework.bighomework.model import LSTMClassifier
from bighomework.bighomework.data_utils import TextDataset, collate_fn, load_data
from torch.utils.data import DataLoader
import pandas as pd
import pickle
# Hyperparameters (must match the values used for training)
embedding_size = 100
hidden_size = 256
output_size = 2
num_layers = 2
dropout = 0.5
batch_size = 128
# Check whether a GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')
# Simple whitespace tokenizer (must match the one used for training)
def tokenizer(text):
    return text.split()

# Load the saved vocabulary
def load_vocab(vocab_path):
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    return vocab
# Load the model
def load_model(vocab_size, embedding_size, hidden_size, output_size, num_layers, dropout, model_path):
    model = LSTMClassifier(vocab_size, embedding_size, hidden_size, output_size, num_layers, dropout)
    model.load_state_dict(torch.load(model_path, map_location=device))  # map_location so a GPU-trained checkpoint also loads on CPU
    model.to(device)  # move the model to the selected device
    return model
def detect_AI(text, model, vocab, tokenizer):
    # Tokenize and index the text
    tokens = tokenizer(text)
    indexed = [vocab[token] for token in tokens]
    tensor = torch.LongTensor(indexed).to(device)  # move the data to the same device as the model
    tensor = tensor.unsqueeze(0)  # add a batch dimension -> shape [1, seq_len]
    # Run the model; pass the real sequence length (len(indexed)), not len(tensor),
    # which after unsqueeze(0) is the batch size 1
    model.eval()
    with torch.no_grad():
        output = model(tensor, torch.tensor([len(indexed)], device=device))
    # Softmax over the two classes; index 0 is assumed to be the AI class here
    probabilities = torch.nn.functional.softmax(output, dim=1)[0]
    return probabilities[0]
if __name__ == "__main__":
    vocab_path = "/kaggle/working/vocab2.pkl"  # replace with the actual path
    vocab = load_vocab(vocab_path)
    vocab_size = len(vocab)
    # Load the model
    model_path = "/kaggle/working/lstm_classifier8.pth"
    model = load_model(vocab_size, embedding_size, hidden_size, output_size, num_layers, dropout, model_path)
    def evaluate_files(ai_path, human_path):
        # Score one AI file and one human file line by line with detect_AI.
        with open(ai_path, 'r', encoding='utf-8') as file1, open(human_path, 'r', encoding='utf-8') as file2:
            ai_lines = file1.readlines()
            human_lines = file2.readlines()
        ai_total, ai_correct = 0, 0
        for line in ai_lines:
            if detect_AI(line, model, vocab, tokenizer) > 0.5:  # predicted AI
                ai_correct += 1
            ai_total += 1
        print(ai_total, ai_correct)
        human_total, human_correct = 0, 0
        for line in human_lines:
            if detect_AI(line, model, vocab, tokenizer) < 0.5:  # predicted human
                human_correct += 1
            human_total += 1
        print(human_total, human_correct)
        print("Overall accuracy:", (ai_correct + human_correct) / (ai_total + human_total))

    # Held-out test files
    evaluate_files('/kaggle/input/datasets4/datasets4/test_ai.txt',
                   '/kaggle/input/datasets4/datasets4/test_human.txt')
    # The training files themselves (this is where accuracy drops to ~0.55)
    evaluate_files('/kaggle/input/datasets4/datasets4/train_ai.txt',
                   '/kaggle/input/datasets4/datasets4/train_human.txt')
    # Read the competition test file
    test_df = pd.read_csv("/kaggle/input/llm-detect-ai-generated-text/test_essays.csv")
    results = []
    for index, row in test_df.iterrows():
        text = row['text']
        result = detect_AI(text, model, vocab, tokenizer)
        results.append(result)
    # Move the tensors to the CPU and convert to NumPy scalars
    results_np = [r.cpu().numpy() for r in results]
    # test_df is assumed to contain an 'id' column
    submission_df = pd.DataFrame({'id': test_df['id'], 'label': results_np})
    # Write the CSV file
    submission_df.to_csv("/kaggle/working/submission.csv", index=False)
    print("Results saved to submission.csv")