问题遇到的现象和发生背景
代码已经写好了,一开始运行时提示缺少 gensim、pandas 和 nltk 包,后来我都安装上了,但是运行时还是报错。
问题相关代码,请勿粘贴截图
# -*- coding: utf-8 -*-
"""
Dataset Input
"""
import pandas as pd

# Raw string literals keep the Windows backslashes literal. Sequences such as
# "\研" or "\A" are not valid escapes, and Python warns about them (and plans to
# reject them) when they appear in a normal string literal.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper
# is needed.
listings_raw = pd.read_csv(r'D:\研一\大数据\A\数据来源/listings.csv')
reviews_raw = pd.read_csv(r'D:\研一\大数据\A\数据来源/reviews.csv')

# Rename the listings key column to 'listing_id', the column name the reviews
# table is joined on further down.
listings_raw.rename(columns={'id': 'listing_id'}, inplace=True)

# Work on copies so later filtering does not touch the loaded tables.
listings = listings_raw.copy()
# Drop every review row that contains a missing value.
reviews = reviews_raw.copy().dropna()
"""
Filter listings
Keep 'room_type' == 'Private room'
40629->19532
Keep 'beds' == 1
40629->13602
Keep 'number_of_reviews' > 10
40629->27625
All three conditions together
40629->5024
"""
# Keep only private rooms with more than 10 reviews and exactly one bed.
# The three conditions are combined into a single boolean mask; a row must
# satisfy all of them to be kept.
listings = listings[
    (listings['room_type'] == 'Private room')
    & (listings['number_of_reviews'] > 10)
    & (listings['beds'] == 1)
]
"""
Map listings onto reviews
"""
# Look-up table keyed by listing_id, built once from the filtered listings.
_listings_by_id = listings.set_index('listing_id')

# Copy the listing attributes onto each review row. Series.map leaves NaN for
# reviews whose listing_id is not present in the look-up table.
for _column in ('room_type', 'number_of_reviews'):
    reviews[_column] = reviews['listing_id'].map(_listings_by_id[_column])

# Independent working copy for the text pre-processing.
df = reviews.copy()
"""
Pre-process Phase
"""
import string
import time

# NOTE(review): the traceback reported for this script fails on the nltk import
# below, inside scipy.stats, with "OSError: Failed to open file" for a file in
# the user's Temp directory. That path contains non-ASCII characters, which
# looks like the trigger -- pointing the TEMP/TMP environment variables at an
# ASCII-only directory before running is worth trying; confirm on the
# affected machine.
from nltk.corpus import stopwords
from nltk.stem import RegexpStemmer

# A missing value is a float NaN, not a string, so rows with missing values
# are dropped before any string processing.
df = df.dropna()
pre_start = time.time()

# Remove punctuation: a translation table deletes every punctuation character
# in a single pass per comment.
_punctuation_table = str.maketrans('', '', string.punctuation)
df['comments'] = [s.translate(_punctuation_table) for s in df['comments']]
print("Remove Punctuations : ")
# A bare expression shows nothing when run as a script, so print explicitly.
print(df['comments'].head(10))

# Lower-case each comment and split it into a list of tokens.
df['comments'] = df['comments'].str.lower().str.split()
print("lowcase and split : ")
print(df['comments'].head(10))

# Remove stopwords. `stop` stays a list as before; the frozenset copy is only
# there to make the per-token membership test constant-time.
stop = stopwords.words('english')
_stop_lookup = frozenset(stop)
df['comments'] = df['comments'].apply(
    lambda tokens: [token for token in tokens if token not in _stop_lookup]
)
print("Remove stopwords : ")
print(df['comments'].head(10))

# Stemming: strip the listed suffixes from tokens of at least 4 characters.
# The stemmed tokens are written back to the column; assigning to a loop
# variable would leave the stored lists unchanged.
st = RegexpStemmer('ing$|s$|e$|able$', min=4)
df['comments'] = df['comments'].apply(
    lambda tokens: [st.stem(token) for token in tokens]
)
print("Stemming : ")
print(df['comments'].head(10))

# Keep only tokens longer than 3 characters (i.e. drop tokens of length <= 3).
# The printed label below is left unchanged from the original script.
df['comments'] = df['comments'].apply(
    lambda tokens: [token for token in tokens if len(token) > 3]
)
print("Remove Strings which length > 3 : ")
print(df['comments'].head(10))

pre_end = time.time()
print("It cost %f sec" % (pre_end - pre_start))
"""
Group Comments by the column of 'listing_id'
"""
from itertools import chain

# Build one document per listing by concatenating the token lists of all of
# its reviews.
# Only the 'comments' column is aggregated, so the listing_id key is used
# purely for grouping and is not summed along with the text.
# chain.from_iterable joins the lists in one linear pass instead of repeated
# list + list additions.
# to_frame() keeps df2 a DataFrame indexed by listing_id with a single
# 'comments' column.
df2 = (
    df.groupby('listing_id')['comments']
    .agg(lambda token_lists: list(chain.from_iterable(token_lists)))
    .to_frame()
)
"""
LDA Phase
"""
# Imports are done before the timer starts so the measured time covers only
# the modelling work.
from gensim import corpora, models
from nltk.probability import FreqDist

# Establish dictionary and corpus
lda_start = time.time()
dictionary = corpora.Dictionary(df2['comments'])
corpus = [dictionary.doc2bow(text) for text in df2['comments']]

# Transform Bag-of-Words to TF/IDF
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Token frequencies over all documents. The tokens themselves are counted:
# iterating a gensim Dictionary yields each token id exactly once, which would
# give every entry a count of 1.
fdist = FreqDist(token for text in df2['comments'] for token in text)
# Name kept for compatibility; it holds up to 1000 (token, count) pairs.
top_ten = fdist.most_common(1000)

lda = models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=20)
# Multi-core alternative on the raw bag-of-words corpus:
#lda = models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=20, workers=3)
lda_end = time.time()
print("It cost %f sec" % (lda_end - lda_start))

# Print Top20 topics. print_topics() returns (topic id, description) pairs;
# as a script nothing is shown unless they are printed.
for topic_id, topic_terms in lda.print_topics(20):
    print(topic_id, topic_terms)

# Print the dist. of 20th topic (topic ids are zero-based, hence 19).
print(lda.print_topic(19))
运行结果及报错内容
#百度了一下没有相关结果,报错内容如下
PS C:\Users\月落青山外> & D:/python/python.exe d:/研一/大数据/代码/差不多代码/Airbnb-comments-LDA-master/LDA修改.py
Traceback (most recent call last):
File "d:/研一/大数据/代码/差不多代码/Airbnb-comments-LDA-master/LDA修改.py", line 45, in <module>
from nltk.corpus import stopwords
from nltk.collocations import *
File "D:\python\lib\site-packages\nltk\collocations.py", line 36, in <module>
from nltk.metrics import (
File "D:\python\lib\site-packages\nltk\metrics\__init__.py", line 18, in <module>
from nltk.metrics.association import (
File "D:\python\lib\site-packages\nltk\metrics\association.py", line 26, in <module>
from scipy.stats import fisher_exact
File "D:\python\lib\site-packages\scipy\stats\__init__.py", line 468, in <module>
from ._rvs_sampling import rvs_ratio_uniforms, NumericalInverseHermite # noqa
File "D:\python\lib\site-packages\scipy\stats\_rvs_sampling.py", line 3, in <module>
from ._unuran import unuran_wrapper
File "unuran_wrapper.pyx", line 221, in init scipy.stats._unuran.unuran_wrapper
File "unuran_wrapper.pyx", line 200, in scipy.stats._unuran.unuran_wrapper._setup_unuran
File "messagestream.pyx", line 36, in scipy._lib.messagestream.MessageStream.__cinit__
OSError: Failed to open file b'C:\\Users\\\xe6\x9c\x88\xe8\x90\xbd\xe9\x9d\x92~1\\AppData\\Local\\Temp\\scipy-kzl3auzb'
我的解答思路和尝试过的方法
我尝试将 Visual Studio Code 卸载重装,但还是报同样的错误。
我想要达到的结果
请大家看一看如何解决,在此万分感谢