不会打代码的计算机学习人
2022-05-25 09:03
采纳率: 0%
浏览 19

画词云图时,出现问题,之前可以运行成功,这一次运行就报错

问题遇到的现象和发生背景

画词云图时,出现问题,之前可以运行成功,这一次运行就报错了

问题相关代码,请勿粘贴截图
import os
import numpy as np
import pandas as pd
import re
import jieba.posseg as psg
import matplotlib.pyplot as plt
from gensim import corpora,models #主题挖掘,提取关键信息
from wordcloud import WordCloud,ImageColorGenerator
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Load the raw review data.
# The path is a raw string so the Windows backslashes stay literal; in a normal
# string, sequences such as "\文" are invalid escapes that Python warns about.
raw_data = pd.read_csv(r'D:\文本挖掘\期末作业\京东商品评论.csv', encoding='gbk')
print(raw_data.head())

# Step 2: preprocessing.
# (1) De-duplication on the two columns used below.
# Original author's note: this removes the reviews the system posts
# automatically on behalf of customers (presumably these show up as exact
# duplicates -- confirm against the data).
reviews = raw_data[['content', 'content_type']].copy()
print('去重之前:', reviews.shape[0])
reviews = reviews.drop_duplicates()
print('去重之后:', reviews.shape[0])

# Preview a few reviews before cleaning.
# drop_duplicates() keeps the original index labels, so labels 5..9 may no
# longer exist; a positional slice never raises KeyError and simply yields
# fewer rows when the data is short.
content = reviews['content']
for text in content.iloc[5:10]:
    print(text)
    print('-----------')

# Cleaning: strip digits, ASCII letters and the product/brand words that occur
# in nearly every review. The trailing empty alternative of the original
# pattern was dropped: it matched the empty string at every position and
# contributed nothing to the result.
info = re.compile(r'[0-9a-zA-Z]|京东|欧莱雅|紫熨斗眼霜|眼霜')
content = content.apply(lambda x: info.sub('', str(x)))  # replace every match
print(content.head())

# Preview the same positions after cleaning.
for text in content.iloc[5:10]:
    print(text)
    print('-----------')

# Step 3: word segmentation, part-of-speech tagging, stop-word removal, word cloud.
# (1) Segmentation: every review becomes a list of (word, POS flag) tuples.
seg_content = content.apply(lambda s: [(x.word, x.flag) for x in psg.cut(s)])
print(seg_content.shape)
# Positional access: the index still carries the pre-deduplication labels, so
# a label lookup such as seg_content[5] can raise KeyError.
if len(seg_content) > 5:
    print(seg_content.iloc[5])

# Number of tokens in each review.
n_word = seg_content.apply(len)

# For every token, the (index label + 1) of the review it came from.
# n_content is a list of lists: [label + 1] repeated once per token.
n_content = [[x + 1] * y for x, y in zip(list(seg_content.index), list(n_word))]
# Flatten with a single pass; sum(list_of_lists, []) copies the accumulated
# list on every step and becomes quadratic in the total token count.
index_content_long = [idx for group in n_content for idx in group]

# Flatten the per-review (word, flag) lists into one long list.
seg_content_long = [pair for pairs in seg_content for pair in pairs]
print(seg_content_long)
if seg_content_long:  # avoid IndexError when no tokens were produced
    print(seg_content_long[0])

# Split the long list into parallel word / POS-flag columns.
word_long = [pair[0] for pair in seg_content_long]
nature_long = [pair[1] for pair in seg_content_long]

# Repeat each review's content_type once per token so it lines up with the
# token columns (reviews and n_word are in the same row order).
n_content_type = [[x] * y for x, y in zip(list(reviews['content_type']), list(n_word))]
content_type_long = [label for group in n_content_type for label in group]

# One row per token.
review_long = pd.DataFrame({'index_content': index_content_long,
                            'word': word_long,
                            'nature': nature_long,
                            'content_type': content_type_long})
print(review_long.shape)
print(review_long.head())
# (2) Remove punctuation and stop words.
# Per the original author's note, the POS flag 'x' marks punctuation.
review_long_clean = review_long[review_long['nature'] != 'x']

# Load the stop-word list, one word per line. The context manager closes the
# file handle, which the original left open for the rest of the run.
with open('./data/stoplist.txt', 'r', encoding='UTF-8') as stop_file:
    # Only the newline is stripped, so entries keep any other whitespace.
    stop_words = [line.strip('\n') for line in stop_file]

# Vocabulary that survives stop-word removal.
word_long_clean = list(set(word_long) - set(stop_words))

# Keep only the token rows whose word is in the surviving vocabulary.
review_long_clean = review_long_clean[review_long_clean['word'].isin(word_long_clean)]
print(review_long_clean.shape)
print('----------------------------------------')
# (3)在原df中,再增加一列,该分词在本条评论的位置
# 再次统计每条评论的分词数量
# n_word=review_long_clean.groupby('index_content').count()['word']
# # n_word
#
# index_word=[ list(np.arange(1,x+1)) for x in list(n_word)]
# index_word_long=sum(index_word,[]) #表示去掉[],拉平
#
# # len(index_word_long)
# review_long_clean['index_word']=index_word_long
# review_long_clean.head()
# review_long_clean.to_csv('./1_review_long_clean.csv')
# n_review_long_clean=review_long_clean[[ 'n' in nat for nat in review_long_clean.nature]]
# n_review_long_clean.shape
# n_review_long_clean.head()
# n_review_long_clean.nature.value_counts()
# n_review_long_clean.to_csv('./1_n_review_long_clean.csv')
# import collections
# word_counts = collections.Counter(review_long_clean)
# word_counts_top10 = word_counts.most_common(10)
# print('********************',word_counts_top10)#词频统计
from PIL import Image

# Font file handed to WordCloud so the Chinese words can be rendered.
# Defined once and reused below instead of repeating the path literal.
font = r"C:\Windows\Fonts\msyh.ttc"

# Load the mask image into an array; the context manager releases the file
# handle once the pixel data has been copied.
with Image.open('./jdicon.jpg') as background:
    graph = np.array(background)

wordcloud = WordCloud(font_path=font,
                      mask=graph,
                      background_color='white',
                      max_font_size=150,
                      random_state=30)  # fixed seed -> reproducible layout

# Word frequencies over the cleaned tokens; computed once and reused for both
# the debug print and the cloud.
word_count = Counter(review_long_clean.word.values)
print('*******************', word_count)
ciyuntu = wordcloud.fit_words(word_count)

# To save the picture instead of (or in addition to) showing it:
#   ciyuntu.to_file('1_分词后的词云图.png')

plt.figure(figsize=(20, 10))
plt.imshow(ciyuntu)
plt.axis('off')
plt.show()

运行结果及报错内容

img

我的解答思路和尝试过的方法
我想要达到的结果

1条回答 默认 最新

相关推荐 更多相似问题