import os
import re
import string
import math
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
DATA_DIR = r'C:\Users\***\Desktop\team7'  # root directory of the data set
target_names = ['ham', 'spam']  # index 0 = ham (legitimate mail), index 1 = spam

# Load the stop-word list, one word per line, from the current working directory.
# The file is closed deterministically by the ``with`` block. Entries are
# stripped and lower-cased because tokens are lower-cased before they are
# compared against this set; blank lines are dropped.
with open('stopwords.txt', 'r') as _stopwords_file:
    stopwords = {
        line.strip().lower()
        for line in _stopwords_file.read().splitlines()
        if line.strip()
    }
def get_data(DATA_DIR):
    """Load every e-mail file from the mail folder under ``DATA_DIR``.

    Each regular file in ``<DATA_DIR>/待分类邮件`` is read as latin-1 text and
    labelled from its file name: names containing ``'spam'`` get label 1,
    everything else gets label 0.

    NOTE(review): if none of the file names contain ``'spam'`` every label is
    0, and a model trained on this data can only ever predict class 0.

    Args:
        DATA_DIR: Directory that contains the mail folder.

    Returns:
        A ``(data, target)`` tuple: the list of file contents and the list of
        integer labels, in matching order (sorted by file name).

    Raises:
        OSError: If the mail folder or one of its files cannot be read.
    """
    # Folder assumed to hold all of the e-mails.
    mail_folder = '待分类邮件'
    mail_dir = os.path.join(DATA_DIR, mail_folder)
    data = []
    target = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded train/test split) reproducible.
    for mail_file in sorted(os.listdir(mail_dir)):
        mail_path = os.path.join(mail_dir, mail_file)
        # Skip sub-directories and other non-file entries; open() would fail on them.
        if not os.path.isfile(mail_path):
            continue
        # The label is derived from the file name only.
        label = 1 if 'spam' in mail_file else 0
        with open(mail_path, encoding="latin-1") as f:
            data.append(f.read())
        target.append(label)
    return data, target
# Translation table that maps every ASCII punctuation character to a space.
# Built once at import time. Unlike a hand-built regex character class, it
# needs no escaping, so the backslash is handled like any other punctuation mark.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def preprocess(text):
    """Tokenize ``text`` into lower-case words with stop words removed.

    Args:
        text: Raw text to tokenize.

    Returns:
        A list of whitespace-separated tokens, in order of appearance, after
        lower-casing, replacing ASCII punctuation with spaces and dropping
        every token found in the module-level ``stopwords`` set.
    """
    text = text.lower()
    text = text.translate(_PUNCT_TO_SPACE)
    return [word for word in text.split() if word not in stopwords]
class NaiveBayesClassifier():
    """Word-count naive Bayes text classifier with add-one smoothing.

    Training accumulates per-class document and word counts; prediction picks
    the class with the highest log prior plus summed log word likelihoods.
    """

    def __init__(self):
        """Create an empty, unfitted classifier."""
        self.vocabulary = set()  # distinct words seen during training
        self.class_total = defaultdict(int)  # number of documents per class
        self.word_total = defaultdict(int)  # total word occurrences per class
        # occurrences of each word within each class
        self.word_given_class = defaultdict(lambda: defaultdict(int))

    def fit(self, X, y):
        """Accumulate counts from the training texts.

        Counts are added to whatever is already stored, so calling ``fit``
        again extends the existing model rather than replacing it.

        Args:
            X: Iterable of raw texts.
            y: Iterable of class labels, paired with ``X`` via ``zip``.
        """
        for text, label in zip(X, y):
            words = preprocess(text)
            self.class_total[label] += 1
            class_counts = self.word_given_class[label]
            for word in words:
                self.vocabulary.add(word)
                class_counts[word] += 1
                self.word_total[label] += 1

    def predict(self, X):
        """Return the most probable class label for each text in ``X``.

        The stored counts are read without being modified.

        Args:
            X: Iterable of raw texts.

        Returns:
            A list with one predicted label per input text.

        Raises:
            ValueError: If a text must be classified before ``fit`` has seen
                any training document.
        """
        total_docs = sum(self.class_total.values())
        log_priors = {
            c: math.log(count / total_docs)
            for c, count in self.class_total.items()
        }
        vocab_size = len(self.vocabulary)
        predictions = []
        for text in X:
            if not log_priors:
                raise ValueError('NaiveBayesClassifier has not been fitted yet')
            words = preprocess(text)
            log_probs = {}
            for c, log_prior in log_priors.items():
                # .get() instead of indexing: indexing a defaultdict would
                # insert a zero-count entry for every unseen word and so
                # mutate the model on each prediction.
                word_counts = self.word_given_class.get(c, {})
                # Add-one smoothing denominator; constant for the class.
                denominator = self.word_total.get(c, 0) + vocab_size
                score = log_prior
                for word in words:
                    score += math.log((word_counts.get(word, 0) + 1) / denominator)
                log_probs[c] = score
            predictions.append(max(log_probs, key=log_probs.get))
        return predictions
# Load the data set.
X, y = get_data(DATA_DIR)
# With a single class in the labels the classifier can only ever predict that
# class, so make the situation visible instead of silently reporting it.
if len(set(y)) < 2:
    print('Warning: the loaded labels contain a single class; '
          'every prediction will be that class.')
# Split the data set: 80 % for training, 20 % held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create the classifier and train it.
clf = NaiveBayesClassifier()
clf.fit(X_train, y_train)
# Classify one new e-mail.
new_email = 'Subject: et & s photo contest - announcing the winners\nCongratulations to the following winners of the 2001 ET & S photo contest. Over 200 entries were submitted! The winning photos will be displayed in the 2001 ET & S public education calendar.'
prediction = clf.predict([new_email])[0]
# Predict on the held-out test split and compute the accuracy.
# NOTE: only the test split (20 % of the loaded e-mails) is predicted and
# printed below, not every e-mail in the folder.
predictions = clf.predict(X_test)
accuracy = np.sum(np.array(predictions) == np.array(y_test)) / len(y_test)
# Pair each test e-mail's 1-based position with its predicted class name.
predicted_emails = [
    (email_no, target_names[prediction_index])
    for email_no, prediction_index in enumerate(predictions, start=1)
]
# Print the per-e-mail predictions. The loop variable must not be called
# `prediction`: that would overwrite the integer label of the new e-mail with a
# string and make the `target_names[prediction]` lookup below raise TypeError.
for email_no, predicted_name in predicted_emails:
    print(f'Email {email_no}: Prediction: {predicted_name}')
print(f'Prediction for new email: {target_names[prediction]}')
print(f'Accuracy: {accuracy:.2f}')
目的是对“待分类邮件”文件夹中的邮件进行预测:一共有 2000 封邮件,但程序只输出了 500 多封的预测结果,而且结果全部是正常邮件,运行时还会报错。请问代码哪里出错了?