# 分词之后 去除停用词运行结果出问题了 有人知道是什么原因吗?
# (Q: after jieba segmentation, the stop-word removal step produces wrong results — why?)
import pandas as pd # pandas模块用于读取和处理数据
import jieba # jieba模块用于对短信内容进行分词
from sklearn.feature_extraction.text import CountVectorizer # 文本特征抽取
from sklearn.model_selection import train_test_split # 划分训练集和测试集
def swindle():
# 读取数据和处理数据
data = pd.read_table("swindle_text.txt", sep="\t", header=None, nrows=10000, names=["标签", "短信内容"])
new_data = []
# 进行分词
data['分词后数据'] = data["短信内容"].apply(lambda x: ' '.join(jieba.cut(x)))
# 去除停用词
# stpwrdpath = "/Users/mustafa-de/PycharmProjects/pythonProject/mechine_swindle/stop_words.txt"
with open('stop_words.txt', 'rb') as f:
stopword = f.read().decode('utf-8') # 停用词提取
stpwrdlst = stopword.splitlines() # 将停用词表转换为list
for word in data:
if word not in stpwrdlst:
new_data.append(word)
print(new_data)
# 提取特征
# 分割训练集和测试集
return None
if __name__ == "__main__":
swindle()