CONGCONGAIMIAOWU 2022-05-28 13:40 采纳率: 66.7%
浏览 172
已结题

PyCharm上运行计算最佳CoherenceModel的代码时,会出现代码反复从头开始运行的情况,但是用jupyter运行就很正常,只会循环需要循环运行的部分。

PyCharm上运行CoherenceModel的代码时,会出现代码反复从头开始运行的情况,但是用jupyter运行就很正常,只会循环需要循环运行的部分。
代码如下:

import pandas as pd
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Module-level statements: they run every time this file is executed or imported.
print('start')
# Load the review table; its 'ConsumerReviews' column is tokenized further down.
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
    """Lazily preprocess each item of ``sentences``.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per input item, in input order.
    """
    for raw in sentences:
        as_text = str(raw)
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

# Preprocess every review once and keep the results in a list.
data_words = list(sent_to_words(TEXT['ConsumerReviews']))

# Phrase models built from the tokenized reviews (the trigram model is built on bigram output).
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
# NOTE(review): bigram_mod / trigram_mod are not referenced again in this script.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Dictionary and bag-of-words corpus are built from data_words directly;
# the phrase models above are not applied to the texts.
id2word = corpora.Dictionary(data_words)
texts =data_words
corpus = [id2word.doc2bow(text) for text in texts]


def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train an LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Corpus passed to ``gensim.models.LdaMulticore``.
        dictionary: id-to-word mapping used for training and for scoring.
        texts: Tokenized documents passed to ``CoherenceModel``.
        k: Number of topics.
        a: Value for the model's ``alpha`` parameter.
        b: Value for the model's ``eta`` parameter.

    Returns:
        The result of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the arguments supplied by the caller instead of the
    # module-level ``data_words`` / ``id2word`` names, so the function does
    # not silently ignore its ``texts`` and ``dictionary`` parameters.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

# Hyper-parameter grid for the coherence search.
# NOTE(review): `grid` is not read anywhere else in this script.
grid = {}
grid['Validation_Set'] = {}
# Topics range: range(4, 6, 1), i.e. topic counts 4 and 5
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 0.02, 0.01))

# Beta parameter
beta = list(np.arange(0.01, 0.1, 0.05))

# Validation sets: only the full corpus is active; the clipped variants are commented out
num_of_docs = len(corpus)
corpus_sets = [  # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
    # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
    # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
    corpus]
corpus_title = ['100% Corpus']
# One list per output column; every grid point appends one entry to each list.
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                 }
# Can take a long time to run
# Only the search loop is guarded. Every statement above this line, and the
# final print('end'), sit at module level and therefore also run whenever this
# file is imported.
# NOTE(review): the repeated 'start'/'end' lines in the PyCharm output are
# consistent with worker processes re-importing this module -- presumably
# multiprocessing's spawn start method on Windows; confirm.
if __name__ == '__main__':
    # Always-true condition; it only adds one level of nesting.
    if 1 == 1:
        # NOTE(review): total is hard-coded to 4 -- presumably the number of
        # grid combinations; confirm it still matches if the ranges change.
        pbar = tqdm.tqdm(total=4)
        # iterate through validation corpuses
        for i in range(len(corpus_sets)):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        # (the reviews are re-tokenized on every iteration,
                        # from the same input as the module-level data_words)
                        data_words = list(sent_to_words(TEXT['ConsumerReviews']))
                        cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, texts=data_words,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)
        # Write all collected rows to CSV once the grid is exhausted.
        pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
        pbar.close()
# Outside the guard: printed on every execution or import of this file.
print('end')

pycharm的结果:

start
  0%|          | 0/4 [00:00<?, ?it/s]start
end
start
end
start
end
start
end
start
end
start
end
 25%|██▌       | 1/4 [00:19<00:59, 19.72s/it]start
end
start
end
start
end
start
end
start
end
start
end
 50%|█████     | 2/4 [00:39<00:39, 19.87s/it]start
end
start
end
start
end
start
end
start
end
start
end
 75%|███████▌  | 3/4 [00:58<00:19, 19.27s/it]start
end
start
end
start
end
start
end
start
end
start
end
100%|██████████| 4/4 [01:14<00:00, 18.72s/it]
end

Process finished with exit code 0


jupyter的结果:

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]
start
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:19<00:00,  4.95s/it]
end

可以看到pycharm每次都会从第一行代码开始运行,而jupyter就只会循环需要的部分。
不懂为什么pycharm和jupyter运行的结果不一样,我希望让pycharm也能运行像jupyter一样的结果,希望可以帮我解答一下

然后我自己实验了,只要把关于计算Coherence的代码删掉,pycharm就能正常运行,比如这样:


# Module-level statements: they run every time this file is executed or imported.
print('start')
# Load the review table; its 'ConsumerReviews' column is tokenized further down.
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
    """Lazily preprocess each item of ``sentences``.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
    yielded per input item, in input order.
    """
    for raw in sentences:
        as_text = str(raw)
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

# Preprocess every review once and keep the results in a list.
data_words = list(sent_to_words(TEXT['ConsumerReviews']))

# Phrase models built from the tokenized reviews (the trigram model is built on bigram output).
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
# NOTE(review): bigram_mod / trigram_mod are not referenced again in this script.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Dictionary and bag-of-words corpus are built from data_words directly;
# the phrase models above are not applied to the texts.
id2word = corpora.Dictionary(data_words)
texts =data_words
corpus = [id2word.doc2bow(text) for text in texts]


def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train an LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Corpus passed to ``gensim.models.LdaMulticore``.
        dictionary: id-to-word mapping used for training and for scoring.
        texts: Tokenized documents passed to ``CoherenceModel``.
        k: Number of topics.
        a: Value for the model's ``alpha`` parameter.
        b: Value for the model's ``eta`` parameter.

    Returns:
        The result of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the arguments supplied by the caller instead of the
    # module-level ``data_words`` / ``id2word`` names, so the function does
    # not silently ignore its ``texts`` and ``dictionary`` parameters.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

# Hyper-parameter grid for the coherence search.
# NOTE(review): `grid` is not read anywhere else in this script.
grid = {}
grid['Validation_Set'] = {}
# Topics range: range(4, 6, 1), i.e. topic counts 4 and 5
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 0.02, 0.01))

# Beta parameter
beta = list(np.arange(0.01, 0.1, 0.05))

# Validation sets: only the full corpus is active; the clipped variants are commented out
num_of_docs = len(corpus)
corpus_sets = [  # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
    # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
    # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
    corpus]
corpus_title = ['100% Corpus']
# One list per output column; the 'Coherence' column is disabled in this variant.
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 #'Coherence': []
                 }
# Can take a long time to run
# Only the loop is guarded; the statements above and the final print('end')
# sit at module level and run whenever this file is executed or imported.
if __name__ == '__main__':
    # Always-true condition; it only adds one level of nesting.
    if 1 == 1:
        # NOTE(review): total is hard-coded to 4 -- presumably the number of
        # grid combinations; confirm it still matches if the ranges change.
        pbar = tqdm.tqdm(total=4)
        # iterate through validation corpuses
        for i in range(len(corpus_sets)):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        data_words = list(sent_to_words(TEXT['ConsumerReviews']))
                        # Coherence computation disabled for this experiment,
                        # so compute_coherence_values is never called here.
                        #cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, texts=data_words,
                        #                              k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        #model_results['Coherence'].append(cv)

                        pbar.update(1)
        # Write all collected rows to CSV once the grid is exhausted.
        pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
        pbar.close()
# Outside the guard: printed on every execution or import of this file.
print('end')
  • 写回答

5条回答 默认 最新

  • hyh123a 全栈领域新星创作者 2022-05-28 16:52
    关注

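    修改了下代码顺序,你试下这样可以不。原因:在 Windows 上 multiprocessing 以 spawn 方式启动子进程,每个子进程都会重新导入主脚本;LdaMulticore 和 CoherenceModel 内部会创建子进程,所以写在模块顶层的代码会被反复执行。把这些代码放到 if __name__ == '__main__': 下面之后,子进程导入脚本时就不会再运行它们(Jupyter 里主模块不是脚本文件,所以没有这个现象)。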

    另外
    你上边是所有代码吗?本地跑了下会报错KeyError: 'ConsumerReviews'
    方便说下你的python版本和其他库的版本吗

    import pandas as pd
    from warnings import simplefilter
    
    simplefilter(action='ignore', category=FutureWarning)
    import numpy as np
    import tqdm
    import gensim
    import gensim.corpora as corpora
    from gensim.models import CoherenceModel
    
    
    
    
    def sent_to_words(sentences):
        """Lazily preprocess each item of ``sentences``.

        Each item is converted with ``str()`` and handed to
        ``gensim.utils.simple_preprocess`` with ``deacc=True``. One result is
        yielded per input item, in input order.
        """
        for raw in sentences:
            as_text = str(raw)
            tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
            yield tokens
    
    
    
    def compute_coherence_values(corpus, dictionary, texts, k, a, b):
        """Train an LDA model and return its ``c_v`` coherence score.

        Args:
            corpus: Corpus passed to ``gensim.models.LdaMulticore``.
            dictionary: id-to-word mapping used for training and for scoring.
            texts: Tokenized documents passed to ``CoherenceModel``.
            k: Number of topics.
            a: Value for the model's ``alpha`` parameter.
            b: Value for the model's ``eta`` parameter.

        Returns:
            The result of ``CoherenceModel.get_coherence()`` for the trained model.
        """
        lda_model = gensim.models.LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=k,
            random_state=100,
            chunksize=100,
            passes=10,
            alpha=a,
            eta=b,
        )

        # Score with the arguments supplied by the caller. The names
        # ``data_words`` / ``id2word`` are only assigned inside the
        # ``__main__`` guard, so relying on them as globals would make this
        # function unusable from any other entry point.
        coherence_model_lda = CoherenceModel(
            model=lda_model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )

        return coherence_model_lda.get_coherence()
    
    # Can take a long time to run
    if __name__ == '__main__':
        # All work lives under this guard: when multiprocessing starts workers
        # with the spawn method (the default on Windows) each worker re-imports
        # this file, and guarded code is skipped on that re-import instead of
        # running again from the top.
        print('start')
        TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')

        # Tokenize once; the same list is reused for every grid point below.
        data_words = list(sent_to_words(TEXT['ConsumerReviews']))

        bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100)  # higher threshold fewer phrases.
        trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

        # Faster way to get a sentence clubbed as a trigram/bigram
        # NOTE(review): bigram_mod / trigram_mod are not used further down.
        bigram_mod = gensim.models.phrases.Phraser(bigram)
        trigram_mod = gensim.models.phrases.Phraser(trigram)

        id2word = corpora.Dictionary(data_words)
        texts = data_words
        corpus = [id2word.doc2bow(text) for text in texts]

        grid = {}
        grid['Validation_Set'] = {}
        # Topics range
        min_topics = 4
        max_topics = 6
        step_size = 1
        topics_range = range(min_topics, max_topics, step_size)
        # Alpha parameter
        alpha = list(np.arange(0.01, 0.02, 0.01))

        # Beta parameter
        beta = list(np.arange(0.01, 0.1, 0.05))

        # Validation sets
        num_of_docs = len(corpus)
        corpus_sets = [  # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
            # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
            # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
            corpus]
        corpus_title = ['100% Corpus']
        model_results = {'Validation_Set': [],
                         'Topics': [],
                         'Alpha': [],
                         'Beta': [],
                         'Coherence': []
                         }

        # Derive the progress total from the grid so the bar stays correct
        # when any of the ranges above is changed.
        total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

        # The context manager closes the bar even if a model run raises.
        with tqdm.tqdm(total=total_runs) as pbar:
            # iterate through validation corpuses
            for i, corpus_set in enumerate(corpus_sets):
                # iterate through number of topics
                for k in topics_range:
                    # iterate through alpha values
                    for a in alpha:
                        # iterate through beta values
                        for b in beta:
                            # get the coherence score for the given parameters
                            cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word, texts=data_words,
                                                          k=k, a=a, b=b)
                            # Save the model results
                            model_results['Validation_Set'].append(corpus_title[i])
                            model_results['Topics'].append(k)
                            model_results['Alpha'].append(a)
                            model_results['Beta'].append(b)
                            model_results['Coherence'].append(cv)

                            pbar.update(1)

        pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
        print('end')
    
    本回答被题主选为最佳回答 , 对您是否有帮助呢?
    评论 编辑记录
查看更多回答(4条)

报告相同问题?

问题事件

  • 系统已结题 6月5日
  • 已采纳回答 5月28日
  • 创建了问题 5月28日

悬赏问题

  • ¥15 流式socket文件传输答疑
  • ¥20 keepalive配置业务服务双机单活的方法。业务服务一定是要双机单活的方式
  • ¥50 关于多次提交POST数据后,无法获取到POST数据参数的问题
  • ¥15 win10,这种情况怎么办
  • ¥15 如何在配置使用Prettier的VSCode中通过Better Align插件来对齐等式?(相关搜索:格式化)
  • ¥100 在连接内网VPN时,如何同时保持互联网连接
  • ¥15 MATLAB中使用parfor,矩阵Removal的有效索引在parfor循环中受限制
  • ¥20 Win 10 LTSC 1809版本如何无损提升到20H1版本
  • ¥50 win10 LTSC 虚拟键盘不弹出
  • ¥15 寻找能匹配的液晶显示屏。