问题遇到的现象和发生背景
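在使用 scikit-learn 做文本主题建模(LDA、NMF、LSI)的过程中,调用 vectorizer.fit_transform 时出现了报错:ValueError: max_df corresponds to < documents than min_df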
问题相关代码:
# Fit LDA, NMF and LSI topic models once per group of files, where a group is
# every entry of `files` whose name starts with the given prefix.
#
# Why this differs from the original snippet:
#   * 'D23' and 'D24' were missing the leading '#' that the four working
#     prefixes have. The match is anchored at the start of the file name, so
#     those two prefixes presumably matched no file (the reported run prints
#     '#B4'..'#D6' and then fails on the fifth group). The corpus was then the
#     single empty string [''], and fit_transform raised
#     "max_df corresponds to < documents than min_df".
#     NOTE(review): confirm the files for these groups are really named
#     '#D23...' / '#D24...'.
#   * Joining with a leading '#' and splitting on '#' always produced an empty
#     first "document"; empty documents are now dropped so they no longer
#     distort the document count used for the max_df / min_df thresholds.
#   * A group with no usable text is reported and skipped instead of crashing
#     the whole loop.
for prefix in ['#B4', '#C5', '#C8', '#D6', '#D23', '#D24']:
    data = []
    for file in files:
        # Literal prefix test; equivalent to re.match(prefix, file) for these
        # prefixes because they contain no regex metacharacters.
        if not file.startswith(prefix):
            continue
        # The with-block closes the file; no explicit close() is needed.
        with open(os.path.join(path, file), 'r', errors='ignore') as f:
            # '#' separates documents, exactly as in the original
            # concatenate-then-split approach.
            data.extend(f.read().split('#'))

    # Drop empty / whitespace-only fragments: they carry no terms.
    data = [doc for doc in data if doc.strip()]
    if not data:
        print(f'{prefix}: no non-empty documents found, skipping this group')
        continue

    # The vectorizer is re-fitted per group, so each group gets its own
    # vocabulary.
    data_vectorized = vectorizer.fit_transform(data)

    # Build a Latent Dirichlet Allocation Model
    lda_model = LatentDirichletAllocation(
        n_components=NUM_TOPICS, max_iter=10, learning_method='online'
    )
    lda_Z = lda_model.fit_transform(data_vectorized)

    # Build a Non-Negative Matrix Factorization Model
    nmf_model = NMF(n_components=NUM_TOPICS)
    nmf_Z = nmf_model.fit_transform(data_vectorized)

    # Build a Latent Semantic Indexing Model
    lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
    lsi_Z = lsi_model.fit_transform(data_vectorized)

    print_topics(lda_model, vectorizer)
    print_topics(nmf_model, vectorizer)
    print_topics(lsi_model, vectorizer)
    print(prefix)
运行结果及报错内容
#B4
#C5
#C8
#D6
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/85/h9zn7p197rv7q9qq734vvvsr0000gn/T/ipykernel_4497/1169374454.py in <module>
10 data=raw_data.split('#')
11
---> 12 data_vectorized = vectorizer.fit_transform(data)
13
14 # Build a Latent Dirichlet Allocation Model
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
1342 )
1343 if max_doc_count < min_doc_count:
-> 1344 raise ValueError("max_df corresponds to < documents than min_df")
1345 if max_features is not None:
1346 X = self._sort_features(X, vocabulary)
ValueError: max_df corresponds to < documents than min_df
问题所在:
看不懂这个报错的含义,也不知道该怎么解决:从输出看,前四个分组(#B4、#C5、#C8、#D6)都能正常运行,到第五个分组 D23 时才报错。