我最后得出的主题-时间矩阵全是0,主题在不同时间点的热度全是NaN,这是为什么啊?我想用它呈现主题阶段热力图。明明前面DTM跑出来得到的“时间——主题矩阵、主题——时间矩阵以及文档——主题矩阵”,都有值啊,为什么最后算出来是0
```python
import os
import re
import numpy as np
import math
import nltk
import spacy ####安装英文包!!!!
from nltk.corpus import wordnet
from nltk import MWETokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
wnl = WordNetLemmatizer()
def get_stop_dict(file):
content = open(file,encoding="utf-8")
word_list = []
for c in content:
c = re.sub('\n|\r','',c)
word_list.append(c)
return word_list
def get_wordnet_pos(tag):
if tag.startswith('N'):
return wordnet.NOUN
else:
return None
#os.chdir("D:/jupyter_file/work/DTM/DTM_一致性/")
os.chdir("C:/Users/123/Documents/Python Scripts/4nltk dtm")###修改文件路径!!!!!!!!!!!
text = open("0113.txt",encoding='utf-8').read()
stop_file = "stopwords.txt"#!!!!!
dic_file = "dict.txt"#!!!!!!!
synonym_file = "synonym.xlsx"#!!!!!!!!!
stop_list = get_stop_dict(stop_file)
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 8000000
doc = nlp(text)
# 自动提取短语
phrases = []
for ent in doc.ents:
if ' ' in ent.text:
phrases.append(ent.text)
f = open('短语.txt','w',encoding='utf-8')
for p in phrases:
f.write(p+'\n')
f.close()
## 自定义词组
dicts = open(dic_file,encoding='utf-8').readlines()
dict_tuple = []
dicts.extend(phrases)
for d in dicts:
d = d.replace('\n','')
d = tuple(d.split(' '))
if d not in dict_tuple:
dict_tuple.append(d)
def english_word_cut(t):
#去除标点符号
t = t.lower()
for c in string.punctuation:
if c !='-':
t = t.replace(c,' ')
if c =='-':
t = t.replace(c,'_')
#分词,添加自定义词组
tokenizer = MWETokenizer(dict_tuple, separator = '_')
wordlist = tokenizer.tokenize(nltk.word_tokenize(t))
#wordlist = nltk.word_tokenize(t)
filtered = [w for w in wordlist if w not in stop_list and w not in stopwords.words('english')]
refiltered =nltk.pos_tag(filtered)
#词形还原
lemmas_sent = []
for wordtag in refiltered:
wordnet_pos = get_wordnet_pos(wordtag[1]) or wordnet.NOUN
word = wnl.lemmatize(wordtag[0], pos=wordnet_pos)
if word in synonym_origin:
index = synonym_origin.index(word)
word = synonym_new[index]
lemmas_sent.append(word)
return lemmas_sent
c_list = []
synonym = pd.read_excel(synonym_file)
synonym_origin = list(synonym['origin'])
synonym_new = list(synonym['new'])
for c in text.split('\n'):
if len(c)>0:
c_list.append(english_word_cut(c))
print("##处理后:",c_list)
#方法1
import logging
from gensim import corpora #4.0.0版本
from six import iteritems
from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary, bleicorpus
#c_list = c_list[0:150]
id2word = corpora.Dictionary(c_list)
corpus = [id2word.doc2bow(sentence) for sentence in c_list]
time_slice=[18,28,33]
num_topics = 15
ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, time_slice=time_slice,id2word=id2word, num_topics=num_topics,passes=50)
corpusTopic = ldaseq.print_topics(time=0) # 输出指定时期主题分布,此处第一个时期主题分布
print(corpusTopic)
topicEvolution = ldaseq.print_topic_times(topic=14) # 查询指定主题在不同时期的演变,此处为第一个主题的
print(topicEvolution)
doc = ldaseq.doc_topics(0) # 查询指定文档的主题分布,此处为第一篇文档的主题分布
print (doc)
# 一致性得分
from gensim.models.coherencemodel import CoherenceModel
# 获取每个时间片的主题-词语分布
topics_per_time_slice = []
number_of_time_slices = len(time_slice)
for t in range(number_of_time_slices):
#topics = ldaseq.print_topics(time=t)
topics = ldaseq.dtm_coherence(time=t)
topics_per_time_slice.append(topics)
# 对每个时间片计算一致性得分并取平均值
dictionary = id2word
coherence_scores = []
for topics in topics_per_time_slice:
coherence_model = CoherenceModel(topics=topics, texts=c_list, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()
coherence_scores.append(coherence_score)
avg_coherence = sum(coherence_scores) / len(coherence_scores)
print("一致性得分:", avg_coherence)
import pandas as pd
# 获取每个时间片的主题-词语分布
topics_at_each_time = [ldaseq.print_topics(time=t, top_terms=10) for t in range(ldaseq.num_time_slices)]
# 构建主题-时间矩阵
num_topics = ldaseq.num_topics
num_time_slices = ldaseq.num_time_slices
topic_time = np.zeros((num_topics, num_time_slices))
# 遍历每个时间片,填充主题-时间矩阵
for t, topics_at_time_t in enumerate(topics_at_each_time):
for topic_info in topics_at_time_t:
if not topic_info or not isinstance(topic_info[0], str): # 跳过空字符串和非字符串的情况
continue
topic_id_str, distribution_str = topic_info[0], str(topic_info[1]) # 主题标识符转换为字符串,分布信息
topic_id = int(re.search(r'\d+', topic_id_str).group()) # 从字符串中提取整数
distribution_values = [float(value.split('*')[0]) for value in re.findall(r"[-+]?\d*\.\d+|\d+", distribution_str)]
topic_time[topic_id, t] = sum(distribution_values)
# 构建 DataFrame
topic_time_df = pd.DataFrame(topic_time, columns=[f"Time_{t}" for t in range(num_time_slices)], index=[f"Topic_{topic}" for topic in range(num_topics)])
# 打印主题-时间矩阵
print("主题-时间矩阵:")
print(topic_time_df)
# 计算每个主题在不同时间点的热度
topic_heatmap = topic_time_df.apply(lambda x: x / x.sum(), axis=0)
# 打印结果
print("\n主题在不同时间点的热度:")
print(topic_heatmap)