PyCharm上运行CoherenceModel的代码时,会出现代码反复从头开始运行的情况,但是用jupyter运行就很正常,只会循环需要循环运行的部分。
代码如下:
import pandas as pd
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
print('start')
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
for sentence in sentences:
yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations
data_words = list(sent_to_words(TEXT['ConsumerReviews']))
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
id2word = corpora.Dictionary(data_words)
texts =data_words
corpus = [id2word.doc2bow(text) for text in texts]
def compute_coherence_values(corpus, dictionary, texts, k, a, b):
lda_model = gensim.models.LdaMulticore(corpus=corpus,
id2word=dictionary,
num_topics=k,
random_state=100,
chunksize=100,
passes=10,
alpha=a,
eta=b)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
return coherence_model_lda.get_coherence()
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 0.02, 0.01))
# Beta parameter
beta = list(np.arange(0.01, 0.1, 0.05))
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [ # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
# gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
corpus]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
'Topics': [],
'Alpha': [],
'Beta': [],
'Coherence': []
}
# Can take a long time to run
if __name__ == '__main__':
if 1 == 1:
pbar = tqdm.tqdm(total=4)
# iterate through validation corpuses
for i in range(len(corpus_sets)):
# iterate through number of topics
for k in topics_range:
# iterate through alpha values
for a in alpha:
# iterare through beta values
for b in beta:
# get the coherence score for the given parameters
data_words = list(sent_to_words(TEXT['ConsumerReviews']))
cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, texts=data_words,
k=k, a=a, b=b)
# Save the model results
model_results['Validation_Set'].append(corpus_title[i])
model_results['Topics'].append(k)
model_results['Alpha'].append(a)
model_results['Beta'].append(b)
model_results['Coherence'].append(cv)
pbar.update(1)
pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
pbar.close()
print('end')
pycharm的结果:
start
0%| | 0/4 [00:00<?, ?it/s]start
end
start
end
start
end
start
end
start
end
start
end
25%|██▌ | 1/4 [00:19<00:59, 19.72s/it]start
end
start
end
start
end
start
end
start
end
start
end
50%|█████ | 2/4 [00:39<00:39, 19.87s/it]start
end
start
end
start
end
start
end
start
end
start
end
75%|███████▌ | 3/4 [00:58<00:19, 19.27s/it]start
end
start
end
start
end
start
end
start
end
start
end
100%|██████████| 4/4 [01:14<00:00, 18.72s/it]
end
Process finished with exit code 0
jupyter的结果:
0%| | 0/4 [00:00<?, ?it/s]
star
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:19<00:00, 4.95s/it]
end
可以看到pycharm每次都会从第一行代码开始运行,而jupyter就只会循环需要的部分。
不懂为什么pycharm和jupyter运行的结果不一样,我希望让pycharm也能运行像jupyter一样的结果,希望可以帮我解答一下
然后我自己实验了,只要把关于计算Coherence的代码删掉,pycharm就能正常运行,比如这样:
print('start')
TEXT = pd.read_table('d:/0528/reviews.TXT', encoding='utf-8')
def sent_to_words(sentences):
for sentence in sentences:
yield(gensim.utils.simple_preprocess(str(sentence), deacc=True)) # deacc=True removes punctuations
data_words = list(sent_to_words(TEXT['ConsumerReviews']))
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
id2word = corpora.Dictionary(data_words)
texts =data_words
corpus = [id2word.doc2bow(text) for text in texts]
def compute_coherence_values(corpus, dictionary, texts, k, a, b):
lda_model = gensim.models.LdaMulticore(corpus=corpus,
id2word=dictionary,
num_topics=k,
random_state=100,
chunksize=100,
passes=10,
alpha=a,
eta=b)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_words, dictionary=id2word, coherence='c_v')
return coherence_model_lda.get_coherence()
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 4
max_topics = 6
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 0.02, 0.01))
# Beta parameter
beta = list(np.arange(0.01, 0.1, 0.05))
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [ # gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.25)),
# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5),
# gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)),
corpus]
corpus_title = ['100% Corpus']
model_results = {'Validation_Set': [],
'Topics': [],
'Alpha': [],
'Beta': [],
#'Coherence': []
}
# Can take a long time to run
if __name__ == '__main__':
if 1 == 1:
pbar = tqdm.tqdm(total=4)
# iterate through validation corpuses
for i in range(len(corpus_sets)):
# iterate through number of topics
for k in topics_range:
# iterate through alpha values
for a in alpha:
# iterare through beta values
for b in beta:
# get the coherence score for the given parameters
data_words = list(sent_to_words(TEXT['ConsumerReviews']))
#cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, texts=data_words,
# k=k, a=a, b=b)
# Save the model results
model_results['Validation_Set'].append(corpus_title[i])
model_results['Topics'].append(k)
model_results['Alpha'].append(a)
model_results['Beta'].append(b)
#model_results['Coherence'].append(cv)
pbar.update(1)
pd.DataFrame(model_results).to_csv('D:/0528/lda_tuning_results.csv', index=False)
pbar.close()
print('end')