用CountVectorizer做情感分析,报错为:InvalidParameterError: The 'stop_words' parameter of CountVectorizer must be a str among {'english'}, an instance of 'list' or None. 求大神指点!
from sklearn.feature_extraction.text import CountVectorizer
def get_custom_stopwords(stop_words_file):
with open(stop_words_file) as f:
stopwords = f.read()
stopwords_list = stopwords.split('\n')
custom_stopwords_list = [i for i in stopwords_list]
return custom_stopwords_list
stop_words_file = '哈工大停用词表.txt' #设置停用词
stopwords = get_custom_stopwords(stop_words_file)
vect = CountVectorizer(max_df = 0.8,
min_df = 3,
token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
stop_words=frozenset(stopwords))
#划分数据集
X = data['cut_comment']
y = data.sentiment
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)
test = pd.DataFrame(vect.fit_transform(X_train).toarray(), columns=vect.get_feature_names())
test.head()