PyCharm上运行计算最佳CoherenceModel的代码时，会出现代码反复从头开始运行的情况，但是用jupyter运行就很正常，只会循环需要循环运行的部分。

PyCharm上运行CoherenceModel的代码时，会出现代码反复从头开始运行的情况，但是用jupyter运行就很正常，只会循环需要循环运行的部分。
代码如下：

import pandas as pd
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Module-level side effects: emit a start marker and load the review table.
# NOTE(review): these statements are outside the `if __name__ == '__main__':`
# guard, so they execute on every import of this script. The repeated
# 'start'/'end' pairs in the PyCharm log quoted further down are presumably
# worker processes re-importing the module -- confirm against the
# multiprocessing documentation for the platform's start method.
print('start')
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
    """Lazily yield one token list per item of *sentences*.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``. ``deacc=True`` asks gensim to strip
    accent marks from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )

# Tokenize every entry of the 'ConsumerReviews' column into a list of token lists.
data_words = list(sent_to_words(TEXT['ConsumerReviews']))

# Phrase (bigram / trigram) models built from the tokenized reviews.
# NOTE(review): none of bigram / trigram / bigram_mod / trigram_mod is
# referenced again in the visible script -- confirm whether they are needed.
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Build the dictionary from the tokens and convert each document with doc2bow.
id2word = corpora.Dictionary(data_words)
texts =data_words
corpus = [id2word.doc2bow(text) for text in texts]


def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaMulticore``.
        dictionary: Dictionary used as ``id2word`` for training and as the
            ``dictionary`` of the coherence model.
        texts: Tokenized documents handed to the coherence model.
        k: Number of topics.
        a: Value forwarded as the LDA ``alpha`` argument.
        b: Value forwarded as the LDA ``eta`` argument.

    Returns:
        The result of ``CoherenceModel.get_coherence()``.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the arguments that were passed in. The previous version
    # ignored `texts` / `dictionary` and silently read the module-level
    # `data_words` / `id2word` instead, so callers could not control what the
    # model was scored against.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

# Hyper-parameter grid and result containers for the coherence search.
# NOTE(review): `grid` is not read anywhere else in the visible script.
grid = {}
grid['Validation_Set'] = {}
# Topics range: range(4, 6, 1), i.e. topic counts 4 and 5
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter candidates (forwarded as `a` to compute_coherence_values)
alpha = list(np.arange(0.01, 0.02, 0.01))

# Beta parameter candidates (forwarded as `b` to compute_coherence_values)
beta = list(np.arange(0.01, 0.1, 0.05))

# Validation sets: only the full corpus is active; the clipped variants are
# left commented out.
num_of_docs = len(corpus)
corpus_sets = [  # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
    # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
    # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
    corpus]
# Label for each entry of corpus_sets, looked up by the same index.
corpus_title = ['100% Corpus']
# Column-oriented results; one value is appended to every list per run.
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                 }
# Can take a long time to run
# Only the grid search below is protected by the __main__ guard; every
# statement above it, and the final print('end'), runs on any import.
if __name__ == '__main__':
    if 1 == 1:
        # NOTE(review): total is hard-coded to 4; presumably it is meant to
        # equal the number of parameter combinations -- confirm.
        pbar = tqdm.tqdm(total=4)
        # iterate through validation corpuses
        for i in range(len(corpus_sets)):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        # (data_words is rebuilt from TEXT on every iteration)
                        data_words = list(sent_to_words(TEXT['ConsumerReviews']))
                        cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, texts=data_words,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)
        # Write all collected rows to CSV, then close the progress bar.
        pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
        pbar.close()
# Outside the guard: printed every time the module finishes executing.
print('end')

pycharm的结果：

start
  0%|          | 0/4 [00:00<?, ?it/s]start
end
start
end
start
end
start
end
start
end
start
end
 25%|██▌       | 1/4 [00:19<00:59, 19.72s/it]start
end
start
end
start
end
start
end
start
end
start
end
 50%|█████     | 2/4 [00:39<00:39, 19.87s/it]start
end
start
end
start
end
start
end
start
end
start
end
 75%|███████▌  | 3/4 [00:58<00:19, 19.27s/it]start
end
start
end
start
end
start
end
start
end
start
end
100%|██████████| 4/4 [01:14<00:00, 18.72s/it]
end

Process finished with exit code 0

jupyter的结果:

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]
start
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:19<00:00,  4.95s/it]
end

可以看到，PyCharm 中每次都会从第一行代码重新开始运行，而 Jupyter 只会循环需要循环的部分。
不明白为什么 PyCharm 和 Jupyter 的运行结果不一样。我希望 PyCharm 也能得到和 Jupyter 一样的结果，希望有人能帮忙解答一下。

然后我自己做了实验：只要把计算 Coherence 的相关代码注释掉，PyCharm 就能正常运行，比如这样：


# Module-level side effects: emit a start marker and load the review table.
# These two statements are not under an `if __name__ == '__main__':` guard,
# so they run whenever this script is executed or imported.
print('start')
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
    """Lazily yield one token list per item of *sentences*.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``. ``deacc=True`` asks gensim to strip
    accent marks from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )

# Tokenize every entry of the 'ConsumerReviews' column into a list of token lists.
data_words = list(sent_to_words(TEXT['ConsumerReviews']))

# Phrase (bigram / trigram) models built from the tokenized reviews.
# NOTE(review): none of bigram / trigram / bigram_mod / trigram_mod is
# referenced again in the visible script -- confirm whether they are needed.
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Build the dictionary from the tokens and convert each document with doc2bow.
id2word = corpora.Dictionary(data_words)
texts =data_words
corpus = [id2word.doc2bow(text) for text in texts]


def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaMulticore``.
        dictionary: Dictionary used as ``id2word`` for training and as the
            ``dictionary`` of the coherence model.
        texts: Tokenized documents handed to the coherence model.
        k: Number of topics.
        a: Value forwarded as the LDA ``alpha`` argument.
        b: Value forwarded as the LDA ``eta`` argument.

    Returns:
        The result of ``CoherenceModel.get_coherence()``.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the arguments that were passed in. The previous version
    # ignored `texts` / `dictionary` and silently read the module-level
    # `data_words` / `id2word` instead, so callers could not control what the
    # model was scored against.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

# Hyper-parameter grid and result containers for the reduced experiment.
# NOTE(review): `grid` is not read anywhere else in the visible script.
grid = {}
grid['Validation_Set'] = {}
# Topics range: range(4, 6, 1), i.e. topic counts 4 and 5
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter candidates
alpha = list(np.arange(0.01, 0.02, 0.01))

# Beta parameter candidates
beta = list(np.arange(0.01, 0.1, 0.05))

# Validation sets: only the full corpus is active; the clipped variants are
# left commented out.
num_of_docs = len(corpus)
corpus_sets = [  # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
    # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
    # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
    corpus]
# Label for each entry of corpus_sets, looked up by the same index.
corpus_title = ['100% Corpus']
# Column-oriented results; the 'Coherence' column is disabled in this variant
# because no coherence score is computed.
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 #'Coherence': []
                 }
# Can take a long time to run
# Reduced variant of the grid search: the coherence computation is commented
# out, so the loop only records the parameter combinations.
if __name__ == '__main__':
    if 1 == 1:
        # NOTE(review): total is hard-coded to 4; presumably it is meant to
        # equal the number of parameter combinations -- confirm.
        pbar = tqdm.tqdm(total=4)
        # iterate through validation corpuses
        for i in range(len(corpus_sets)):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        # (disabled here; data_words is still rebuilt each iteration)
                        data_words = list(sent_to_words(TEXT['ConsumerReviews']))
                        #cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, texts=data_words,
                        #                              k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        #model_results['Coherence'].append(cv)

                        pbar.update(1)
        # Write all collected rows to CSV, then close the progress bar.
        pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
        pbar.close()
# Outside the guard: printed every time the module finishes executing.
print('end')

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

5条回答默认最新

hyh123a 全栈领域新星创作者 2022-05-28 16:52

关注

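调整了一下代码顺序，把所有会直接执行的语句都放到了 `if __name__ == '__main__':` 下面，你试试这样行不行。原因是：gensim 的 LdaMulticore 和 CoherenceModel 会通过 multiprocessing 启动子进程，而在 Windows 上子进程会重新导入主脚本，没有放在这个判断之下的模块级代码就会被每个子进程从头再执行一遍。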

另外，
你上面贴的是全部代码吗？我在本地运行时会报错 KeyError: 'ConsumerReviews'。
方便说一下你的 Python 版本和其他库的版本吗？

import pandas as pd
from warnings import simplefilter

simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel




def sent_to_words(sentences):
    """Lazily yield one token list per item of *sentences*.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess``. ``deacc=True`` asks gensim to strip
    accent marks from the tokens.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )



def compute_coherence_values(corpus, dictionary, texts, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence score.

    Args:
        corpus: Bag-of-words corpus passed to ``LdaMulticore``.
        dictionary: Dictionary used as ``id2word`` for training and as the
            ``dictionary`` of the coherence model.
        texts: Tokenized documents handed to the coherence model.
        k: Number of topics.
        a: Value forwarded as the LDA ``alpha`` argument.
        b: Value forwarded as the LDA ``eta`` argument.

    Returns:
        The result of ``CoherenceModel.get_coherence()``.
    """
    lda_model = gensim.models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )

    # Score with the arguments that were passed in. The previous version
    # ignored `texts` / `dictionary` and read the globals `data_words` /
    # `id2word`, which in this script only exist when it is run as __main__.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

# Can take a long time to run
# Everything with side effects lives under the __main__ guard, so importing
# this module only defines the functions above and does not load data or
# train models.
if __name__ == '__main__':

    print('start')
    TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')

    # Tokenize once: the result does not change between grid-search runs, so
    # it is no longer rebuilt inside the innermost loop.
    data_words = list(sent_to_words(TEXT['ConsumerReviews']))

    # NOTE(review): the phrase models below are not used by the grid search in
    # the visible code; they are kept in case later code relies on them.
    bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100)  # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    id2word = corpora.Dictionary(data_words)
    texts = data_words
    corpus = [id2word.doc2bow(text) for text in texts]

    grid = {}
    grid['Validation_Set'] = {}
    # Topics range
    min_topics = 4
    max_topics = 6
    step_size = 1
    topics_range = range(min_topics, max_topics, step_size)
    # Alpha parameter
    alpha = list(np.arange(0.01, 0.02, 0.01))

    # Beta parameter
    beta = list(np.arange(0.01, 0.1, 0.05))

    # Validation sets
    num_of_docs = len(corpus)
    corpus_sets = [  # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
        # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
        # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
        corpus]
    corpus_title = ['100% Corpus']
    model_results = {'Validation_Set': [],
                     'Topics': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                     }

    # Derive the progress-bar total from the grid instead of hard-coding it,
    # so the bar stays correct when the ranges above are edited.
    total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

    # The context manager closes the bar even if a run raises.
    with tqdm.tqdm(total=total_runs) as pbar:
        # iterate through validation corpuses together with their labels
        for title, corpus_set in zip(corpus_title, corpus_sets):
            # iterate through number of topics
            for k in topics_range:
                # iterate through alpha values
                for a in alpha:
                    # iterate through beta values
                    for b in beta:
                        # get the coherence score for the given parameters
                        cv = compute_coherence_values(corpus=corpus_set, dictionary=id2word, texts=data_words,
                                                      k=k, a=a, b=b)
                        # Save the model results
                        model_results['Validation_Set'].append(title)
                        model_results['Topics'].append(k)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence'].append(cv)

                        pbar.update(1)
        # Written before the bar is closed, matching the original ordering.
        pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
    print('end')