Batches 进度到 100% 时就出现以下报错信息:TypeError: 'numpy.float64' object cannot be interpreted as an integer
数据是中文期刊的摘要,不知道是哪里出了问题。感谢!
import pandas as pd
import jieba
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
# Location of the Excel workbook to analyse (an empty placeholder here).
file_path = ''
data = pd.read_excel(file_path)

# Read the stop-word lists and merge them into a single set.
stopwords = set()
stopword_files = (
    'hit_stopwords.txt',
    'scu_stopwords.txt',
    'baidu_stopwords.txt',
    'cn_stopwords.txt',
)
for stopword_path in stopword_files:
    with open(stopword_path, 'r', encoding='utf-8') as handle:
        # One entry per line, with surrounding whitespace stripped.
        stopwords.update(entry.strip() for entry in handle)
# Tokenise and remove stop words
def preprocess(text):
    """Segment *text* with jieba and drop every token found in ``stopwords``.

    Args:
        text: The raw document. Values that are neither ``str`` nor
            ``bytes`` (for example ``None`` or a float ``NaN`` coming from
            an empty spreadsheet cell) are treated as an empty document.

    Returns:
        The remaining tokens joined by single spaces, or ``''`` when
        *text* is not text at all.
    """
    # Guard against non-text cells: handing them to the tokenizer would
    # abort the whole ``.apply(preprocess)`` pass instead of yielding an
    # empty document for that row.
    if not isinstance(text, (str, bytes)):
        return ''
    return ' '.join(token for token in jieba.cut(text) if token not in stopwords)
# Tokenise every abstract and keep the result next to the raw text.
data['processed'] = data['摘要'].apply(preprocess)

# Materialise the documents once so that the encoder and BERTopic receive
# the exact same list, in the same order.
docs = data['processed'].tolist()

# Load the pretrained multilingual sentence-embedding model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings for the preprocessed documents
embeddings = model.encode(docs, show_progress_bar=True)

# Create the BERTopic model
topic_model = BERTopic(language="multilingual", calculate_probabilities=True)

# Train the model on the documents together with their precomputed embeddings.
# NOTE(review): the reported "TypeError: 'numpy.float64' object cannot be
# interpreted as an integer" shows up once the embedding batches reach 100%,
# i.e. presumably inside fit_transform rather than inside encode. Nothing in
# this script passes a float where an integer is expected, so the cause is
# likely in the installed clustering dependencies -- check the full traceback
# and the hdbscan / numpy versions before changing this code.
topics, probabilities = topic_model.fit_transform(docs, embeddings)

# Print the names from the first ten rows of the topic overview
for topic in topic_model.get_topic_info().head(10)['Name']:
    print(topic)