When running the following code:
pyLDAvis.enable_notebook()
pic = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda_pass'+str(n_topics)+'.html')
pyLDAvis.show(pic,local=False)
an error is raised.
LDA model visualization error
3 answers
- Here is a similar question that may help: https://ask.csdn.net/questions/4646672
- You can also refer to this article: Classifying emails with an LDA classifier
- In addition, the "2. Complete model code" section of the blog post "LDA study notes" may solve your problem; you can read the excerpt below or jump to the original post:
import re
import time
import jieba
import jieba.posseg as pseg
import numpy as np
import numpy
from sklearn.externals import joblib  # in newer scikit-learn, use `import joblib` instead
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV


def read_data(fname):
    '''
    The input file stores each line as: title, content, answer, tag, tag1 (tab separated).
    Only the title and answer are needed for training the LDA model.
    :return: list, each item is one question plus all of its answers, joined by '.'
    '''
    dic = {}
    with open(fname, 'r', encoding='utf8') as f:
        for i in f:
            lst = i.strip().split('\t')
            if lst[0] not in dic:
                dic[lst[0]] = re.sub('[。?!,、….?!:]$', '', lst[2])
            else:
                dic[lst[0]] += '.' + re.sub('[。?!,、….?!:]$', '', lst[2])
    corpus = [re.sub('[。?!,、….?!:]$', '', key) + '.' + dic[key] for key in dic]
    return corpus


def jieba_cut(corpus, cut_file, stopwords_file):
    # Segment with jieba and drop stopwords (a Chinese stopword list found online),
    # then write the result to cut_file
    stopwords = []
    with open(stopwords_file, 'r', encoding='utf8') as f:
        for i in f:
            stopwords.append(i.strip())
    corpus_cut = []
    n = 0
    for s in corpus:
        s_cut = [w for w in jieba.cut(s) if w not in stopwords]
        corpus_cut.append(' '.join(s_cut))
        n += 1
        if n % 10000 == 0:
            print(n)
    f1 = open(cut_file, 'a', encoding='utf8')
    for i in corpus_cut:
        f1.write(i + '\n')
    f1.close()
    return corpus_cut


def vec_model(cut_file):
    # Fit the term-frequency (bag-of-words) model and persist it
    with open(cut_file, 'r', encoding='utf8') as f:
        corpus_cut = [i.strip() for i in f.readlines()]
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    x = tf_vectorizer.fit_transform(corpus_cut)
    joblib.dump(tf_vectorizer, tf_ModelPath)
    return x, tf_vectorizer, corpus_cut


def read_vec_model(cut_file, tf_ModelPath):
    # Load the saved vectorizer directly
    with open(cut_file, 'r', encoding='utf8') as f:
        corpus_cut = [i.strip() for i in f.readlines()]
    tf_vectorizer = joblib.load(tf_ModelPath)
    x = tf_vectorizer.fit_transform(corpus_cut)
    return x, tf_vectorizer, corpus_cut


def train(vec_data, tf_model, n_topics=14, max_iter=10, learning_method='batch'):
    '''
    Train the LDA model and persist it.
    :param vec_data:
    :param n_topics:
    :param max_iter:
    :param learning_method:
    :return: the trained lda model
    '''
    # note: `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in 0.21
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=max_iter,
                                    learning_method=learning_method, max_doc_update_iter=5)
    print('train')
    a = time.time()
    lda.fit(vec_data)
    print(time.time() - a)
    n_top_words = 20
    tf_feature_names = tf_model.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    joblib.dump(lda, lda_ModelPath)
    return lda


def print_top_words(model, feature_names, n_top_words):
    # Print the highest-weighted terms of every topic
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    print(model.components_)


def grid_search(vec_data, tf_vectorizer, parameters):
    # Left as a stub in the original post (see the GridSearchCV sketch at the end of this answer)
    GridSearchCV


def jieba_cut_transform(file):
    # Read the segmented data and strip out all non-Chinese characters
    with open(file, 'r', encoding='utf8') as f:
        ret_list = f.readlines()
    f1 = open('./jieba_cut_all_drop.txt', 'a', encoding='utf8')
    lst = []
    for i in [[re.sub(r'[^\u4e00-\u9fa5]+', '', j) for j in i.strip().split(' ') if re.sub(r'[a-zA-Z0-9]+', '', j)] for i in ret_list]:
        f1.write(' '.join(i) + '\n')
    f1.close()
Call the functions to train the model:
lda_ModelPath = './lda_model2_all'
tf_ModelPath = './tf_model1_all'
fname = '../train_data'
cut_file = './jieba_cut_all.txt'
stopwords_file = './stopwords'

# Read the data
corpus = read_data(fname)
# Word segmentation
cut_data = jieba_cut(corpus, cut_file, stopwords_file)
# Fit the term-frequency model
vec_data, tf_vectorizer, cut_data = vec_model(cut_file)
## This is the function that loads a previously saved vectorizer instead
# vec_data, tf_vectorizer, cut_data = read_vec_model(cut_file, tf_ModelPath)
lda = train(vec_data, tf_vectorizer, max_iter=50)

# Compute perplexity
# lda = joblib.load(lda_ModelPath)
p = lda.perplexity(vec_data)
print(p)

test_data = vec_data[:10]
ret = lda.transform(vec_data[:10])
print(ret)
print(ret.argmax(1))
print(cut_data[:10])
print(len(cut_data[:10]))
- You could also review the "Mathematical derivation of LDA" section of 唐宇迪's 30-day machine learning course (机器学习30天进阶实战) to consolidate the relevant concepts.
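Coming back to the error in the question: `pyLDAvis.sklearn` is a submodule that must be imported explicitly (a plain `import pyLDAvis` does not expose it), and newer pyLDAvis releases renamed it to `pyLDAvis.lda_model`, so `pyLDAvis.sklearn.prepare(...)` can fail with an AttributeError or a ModuleNotFoundError depending on the installed version. Below is a minimal sketch of the visualization step, assuming the `lda`, `vec_data` and `tf_vectorizer` objects produced by the training code above; the 3.4 version boundary is stated from memory and should be checked against the installed package.

import pyLDAvis

# The sklearn bridge must be imported explicitly; its name depends on the
# installed pyLDAvis version (assumption: the rename happened in 3.4.0)
try:
    import pyLDAvis.lda_model as lda_vis   # newer pyLDAvis
except ImportError:
    import pyLDAvis.sklearn as lda_vis     # older pyLDAvis

pyLDAvis.enable_notebook()
# `lda`, `vec_data` and `tf_vectorizer` come from the training script above
pic = lda_vis.prepare(lda, vec_data, tf_vectorizer)
pyLDAvis.save_html(pic, 'lda_vis.html')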
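The `grid_search` function in the quoted code is only a stub. Here is a rough sketch of how it could be filled in; this is an assumption on my part, not the blog author's implementation, and it assumes scikit-learn 0.21 or later, where the LDA parameter is `n_components` rather than the older `n_topics` used above.

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

def grid_search(vec_data, parameters):
    # Each candidate is scored with LDA's default score(), i.e. the
    # approximate log-likelihood of the held-out fold
    lda = LatentDirichletAllocation(max_iter=10, learning_method='batch')
    search = GridSearchCV(lda, param_grid=parameters, cv=3)
    search.fit(vec_data)
    print('best params:', search.best_params_)
    print('best score:', search.best_score_)
    return search.best_estimator_

# Hypothetical usage with the document-term matrix from vec_model():
# best_lda = grid_search(vec_data, {'n_components': [10, 14, 18]})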