问题相关代码,请勿粘贴截图
import os
import jieba
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
output_dir = r'output'
if not os.path.exists(output_dir):
os.mkdir(output_dir)
# 查看训练数据
train_data = pd.read_csv('data/classify_train.csv', encoding='gbk')
print(train_data.head())
# 载入停用词
stopwords = pd.read_csv("data/stopwords.txt", index_col=False, sep="\t", quoting=3, names=['stopword'], encoding='utf-8')
# stopwords=set()
# with open('data/stopwords.txt','r') as infile:
# for line in infile:
# line = line.rstrip('\n')
# if line:
# stopwords.add(line.lower())
# min_df去掉df值小的词(这样的词一般是非常专业的名词或则是生僻词,是噪音)max_df是去掉df很大的词,这样的词是常用词去掉不要
tfidf = TfidfVectorizer(tokenizer=jieba.lcut, stop_words=stopwords, min_df=50, max_df=0.3)
# 编码x变量
x=tfidf.fit_transform(train_data[u'内容'])
train_data[u'内容']:
运行结果及报错内容
报如下错误
Traceback (most recent call last):
File "D:/PyCharm/flaskProject/BOW.py", line 36, in <module>
x=tfidf.fit_transform(train_data[u'内容'])
File "D:\PyCharm\flaskProject\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 2077, in fit_transform
X = super().fit_transform(raw_documents)
File "D:\PyCharm\flaskProject\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1330, in fit_transform
vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
File "D:\PyCharm\flaskProject\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 1193, in _count_vocab
analyze = self.build_analyzer()
File "D:\PyCharm\flaskProject\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 446, in build_analyzer
stop_words = self.get_stop_words()
File "D:\PyCharm\flaskProject\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 368, in get_stop_words
return _check_stop_list(self.stop_words)
File "D:\PyCharm\flaskProject\venv\lib\site-packages\sklearn\feature_extraction\text.py", line 185, in _check_stop_list
if stop == "english":
File "D:\PyCharm\flaskProject\venv\lib\site-packages\pandas\core\generic.py", line 1527, in __nonzero__
raise ValueError(
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
我的解答思路和尝试过的方法
尝试过查看是否是空值,去掉空值后发现仍然不对