zhuzhuzhanshi 2019-05-02 11:17 采纳率: 0%
浏览 4004
已结题

python运行有错误:这是对数据进行分析生成可视化界面的程序(我是小白,请说下解决方法)

运行错误:
C:\Users\Administrator\PycharmProjects\untitled\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py
一共有:16590个
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.808 seconds.
Prefix dict has been built succesfully.
Traceback (most recent call last):
File "C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py", line 252, in
jiebaclearText(content)
File "C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py", line 97, in jiebaclearText
f_stop_text = f_stop.read()
File "D:\python111\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 3: invalid start byte

Process finished with exit code 1

代码如下:
'''
data : 2019.3.28
goal : 可视化分析获取到的数据
'''
import csv

# Module-level accumulators filled by read_csv() and consumed by the
# visualization functions below.
# NOTE(review): `time` shadows the stdlib `time` module — renaming would
# be safer, but every function below refers to these exact names.
time = []       # comment timestamps (CSV column 0)
nickName = []   # reviewer nicknames (CSV column 1)
gender = []     # gender codes as strings: '0' unknown, '1' male, '2' female
cityName = []   # reviewer city names (CSV column 3)
userLevel = []  # Maoyan user levels (CSV column 4)
score = []      # review scores (CSV column 5)
content = ''    # all review texts concatenated (CSV column 6)

# Read the data

def read_csv(path=r'D:\maoyan.csv'):
    """Load the scraped Maoyan review CSV into the module-level lists.

    Appends one entry per data row to ``time``, ``nickName``, ``gender``,
    ``cityName``, ``userLevel`` and ``score`` (all module globals), and
    returns every review text joined into one string.

    :param path: CSV file to read; the default keeps the original
                 hard-coded location, so existing callers are unaffected.
    :return: all review texts concatenated into a single string.
    """
    # Collect texts in a list and join once at the end — the original
    # `content = content + row[6]` loop is quadratic in total text size.
    texts = []
    # utf_8_sig transparently strips a BOM if the scraper wrote one.
    with open(path, 'r', encoding='utf_8_sig', newline='') as file_test:
        reader = csv.reader(file_test)
        count = 0
        for i, row in enumerate(reader):
            if i == 0:
                continue  # skip the header row
            time.append(row[0])
            nickName.append(row[1])
            gender.append(row[2])
            cityName.append(row[3])
            userLevel.append(row[4])
            score.append(row[5])
            texts.append(row[6])
            count += 1
    print('一共有:' + str(count) + '个')
    return ''.join(texts)

import re, jieba

# Word-cloud generation tools
from wordcloud import WordCloud, ImageColorGenerator

# Chinese text needs a Chinese-capable font in matplotlib
import matplotlib.font_manager as fm
from pylab import *

# SimHei renders Chinese glyphs instead of empty boxes
mpl.rcParams['font.sans-serif'] = ['SimHei']
from os import path

# Directory of this script; all output files are written relative to it
d = path.dirname(__file__)

# Stop-word list used by word_cloud() and jiebaclearText().
# NOTE(review): per the traceback, this file is NOT valid UTF-8 —
# it is most likely GBK/GB18030 encoded; the readers below must handle that.
stopwords_path = r'D:\ku\chineseStopWords.txt'

# Comment word-cloud analysis (pyecharts HTML version)

def word_cloud(content):
    """Render the 500 most frequent comment words as a pyecharts word cloud.

    :param content: all review texts concatenated into one string.
    Output: ``<script dir>\\picture\\c_wordcloud.html``
    """
    import jieba, re, numpy
    from pyecharts import WordCloud
    import pandas as pd

    # Strip separators and punctuation left over from scraping
    content = content.replace(" ", ",")
    content = content.replace(" ", "、")
    content = re.sub('[,,。. \r\n]', '', content)

    # Cut into words, then drop stop words
    segment = jieba.lcut(content)
    words_df = pd.DataFrame({'segment': segment})
    # quoting=3 (QUOTE_NONE): treat quote characters in the file literally.
    # The stop-word file is not UTF-8 (see the UnicodeDecodeError in the
    # traceback); fall back to GB18030, a superset of GBK.
    try:
        stopwords = pd.read_csv(stopwords_path, index_col=False, quoting=3,
                                sep="\t", names=['stopword'], encoding='utf-8')
    except UnicodeDecodeError:
        stopwords = pd.read_csv(stopwords_path, index_col=False, quoting=3,
                                sep="\t", names=['stopword'], encoding='gb18030')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # value_counts() gives the same per-word counts, sorted descending,
    # and replaces groupby(...).agg({name: numpy.size}), a form removed
    # from modern pandas.
    words_stat = words_df['segment'].value_counts().head(500)
    codes = words_stat.index.tolist()
    counts = words_stat.values.tolist()

    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("影评词云", codes, counts, word_size_range=[20, 100])
    wordcloud.render(d + "\picture\c_wordcloud.html")

# Helper that segments the text and removes stop words

def jiebaclearText(text):
    """Segment *text* with jieba and remove stop words.

    :param text: raw concatenated review text.
    :return: space-separated string of the remaining words.
    """
    mywordList = []
    # Remove punctuation and line breaks before segmenting
    text = re.sub('[,,。. \r\n]', '', text)
    # Precise-mode segmentation; join the generator with '/'
    seg_list = jieba.cut(text, cut_all=False)
    listStr = '/'.join(seg_list)
    # Scraping artifacts ("class", "span") and the movie title dominate
    # the text, so drop them
    listStr = listStr.replace("class", "")
    listStr = listStr.replace("span", "")
    listStr = listStr.replace("悲伤逆流成河", "")
    # BUG FIX: this read raised UnicodeDecodeError (byte 0xa1) — the
    # stop-word file is not UTF-8, almost certainly GBK; retry with
    # GB18030 (a superset of GBK).  `with` guarantees the file is closed.
    try:
        with open(stopwords_path, encoding="utf8") as f_stop:
            f_stop_text = f_stop.read()
    except UnicodeDecodeError:
        with open(stopwords_path, encoding="gb18030") as f_stop:
            f_stop_text = f_stop.read()
    # One stop word per line; a set gives O(1) membership tests
    f_stop_seg_list = set(f_stop_text.split("\n"))
    for myword in listStr.split('/'):
        # BUG FIX: the original tested `myword.split() in f_stop_seg_list`,
        # which compares a *list* against strings and never matches, so
        # stop words were never removed.  Compare the stripped word itself.
        if myword.strip() not in f_stop_seg_list and len(myword.strip()) > 1:
            mywordList.append(myword)
    return ' '.join(mywordList)

# Render the word-cloud picture (matplotlib/wordcloud version)

def make_wordcloud(text1):
    """Draw a word-cloud image shaped and coloured by ``static/znn1.jpg``.

    :param text1: whitespace-separated words (output of jiebaclearText).
    Output: ``<script dir>/picture/word_cloud.png``
    """
    # The movie title would dominate the cloud, so drop it
    text1 = text1.replace("悲伤逆流成河", "")
    bg = plt.imread(d + "/static/znn1.jpg")
    wc = WordCloud(
        background_color="white",            # default is black
        width=890,
        height=600,
        mask=bg,                             # shape the cloud like the image
        max_font_size=150,
        random_state=50,                     # deterministic layout/colours
        font_path=d + '/static/simkai.ttf'   # Chinese-capable font
    ).generate_from_text(text1)
    # Recolour the words from the background image
    bg_color = ImageColorGenerator(bg)
    plt.imshow(wc.recolor(color_func=bg_color))
    plt.axis("off")  # a picture needs no axes
    # Save the cloud (the original also built an unused FontProperties)
    wc.to_file(d + r"/picture/word_cloud.png")

# Gender distribution of reviewers

def sex_distribution(gender):
    """Render a pie chart of reviewer gender.

    :param gender: list of gender codes as strings
                   ('0' unknown, '1' male, '2' female).
    Output: ``<script dir>\\picture\\sex_pie.html``
    """
    from pyecharts import Pie
    # Count each code; order matches the labels below (其他/男/女)
    list_num = [gender.count(code) for code in ('0', '1', '2')]
    attr = ["其他", "男", "女"]
    pie = Pie("性别饼图")
    pie.add("", attr, list_num, is_label_show=True)
    pie.render(d + r"\picture\sex_pie.html")

# City distribution of reviewers

def city_distribution(cityName):
    """Render a bar chart of how many comments came from each city.

    :param cityName: list of city names, one per comment.
    Output: ``<script dir>\\picture\\city_bar.html``
    """
    from collections import Counter
    from pyecharts import Bar

    # Counter replaces the original O(n^2) list.count() loop, and
    # most_common() already sorts by count, descending.
    # (The unused `import random` from the original is dropped.)
    counts = Counter(cityName).most_common()
    city_name = [city for city, _ in counts]
    city_num = [num for _, num in counts]

    bar = Bar("评论者城市分布")
    bar.add("", city_name, city_num, is_label_show=True, is_datazoom_show=True)
    bar.render(d + r"\picture\city_bar.html")

# Comments-per-day analysis

def time_num_visualization(time):
    """Render a line chart of comment count per date.

    :param time: list of date strings, one per comment.
    Output: ``<script dir>\\picture\\c_num_line.html``
    """
    from collections import Counter
    from pyecharts import Line

    # Counter replaces the original O(n^2) list.count() loop (which also
    # initialised `time_num` twice); sort chronologically by date string.
    sort_dict = sorted(Counter(time).items(), key=lambda kv: kv[0])
    print(sort_dict)
    time_name = [day for day, _ in sort_dict]
    time_num = [num for _, num in sort_dict]

    line = Line("评论数量日期折线图")
    line.add(
        "日期-评论数",
        time_name,
        time_num,
        is_fill=True,
        area_color="#000",
        area_opacity=0.3,
        is_smooth=True,
    )
    line.render(d + r"\picture\c_num_line.html")

# Reviewer Maoyan level and score distributions

def level_score_visualization(userLevel, score):
    """Render two pie charts: user level (ring) and score (rose).

    :param userLevel: list of user-level strings, one per comment.
    :param score: list of score strings, one per comment.
    Output: ``level_pie.html`` and ``score_pie.html`` under
    ``<script dir>\\picture``.
    """
    from collections import Counter
    from pyecharts import Pie

    # Counter replaces the original O(n^2) list.count() loops; key order
    # is first-occurrence (the original's set() order was arbitrary too).
    level_counts = Counter(userLevel)
    userLevel_list = list(level_counts)
    userLevel_num = list(level_counts.values())

    score_counts = Counter(score)
    score_list = list(score_counts)
    score_num = list(score_counts.values())

    pie01 = Pie("等级环状饼图", title_pos='center', width=900)
    pie01.add(
        "等级",
        userLevel_list,
        userLevel_num,
        radius=[40, 75],
        label_text_color=None,
        is_label_show=True,
        legend_orient="vertical",
        legend_pos="left",
    )
    pie01.render(d + r"\picture\level_pie.html")

    pie02 = Pie("评分玫瑰饼图", title_pos='center', width=900)
    pie02.add(
        "评分",
        score_list,
        score_num,
        center=[50, 50],
        is_random=True,
        radius=[30, 75],
        rosetype="area",
        is_legend_show=False,
        is_label_show=True,
    )
    pie02.render(d + r"\picture\score_pie.html")

# ---- main script ----------------------------------------------------
# Reset the accumulators, load the CSV, then run every visualization.
# (The bare section headers like `1 词云` in the original paste were
# syntax errors — restored as comments.)
time = []
nickName = []
gender = []
cityName = []
userLevel = []
score = []
content = read_csv()

# 1. word clouds
# NOTE(review): the cleaned text returned here is discarded and
# make_wordcloud receives the raw content — possibly intended to be
# make_wordcloud(jiebaclearText(content)); kept as-is to preserve behavior.
jiebaclearText(content)
make_wordcloud(content)

# pyecharts word cloud
word_cloud(content)

# 2. gender distribution
sex_distribution(gender)

# 3. city distribution
city_distribution(cityName)

# 4. comments per day
time_num_visualization(time)

# 5. level and score
level_score_visualization(userLevel, score)

  • 写回答

1条回答

  • wangshubin5 2019-05-02 15:03
    关注

    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 0

    Python 编码中编码解码的问题,我这个错误就是‘utf-8’不能解码位置0的那个字节(0xa1),也就是这个字节超出了utf-8的表示范围了

    解决办法:

    with open(r'D:\maoyan.csv', 'r', encoding='utf_8_sig', newline='') as file_test:

    修改:**with open(r'D:\maoyan.csv', 'r', encoding='utf-8', newline='') as file_test:**

    也就是在读取数据的时候,显式指定正确的编码方式。另外注意:从报错栈来看,出错位置是 analysis_data.py 第 97 行读取停用词文件(chineseStopWords.txt)的 `f_stop.read()`,而不是 maoyan.csv(它已经成功读出了 16590 条),所以真正要改的是 `open(stopwords_path, encoding="utf8")` 这一处的编码,别的编码也可以试试哟。如:gb18030

    评论

报告相同问题?

悬赏问题

  • ¥15 js调用html页面需要隐藏某个按钮
  • ¥15 ads仿真结果在圆图上是怎么读数的
  • ¥20 Cotex M3的调试和程序执行方式是什么样的?
  • ¥20 java项目连接sqlserver时报ssl相关错误
  • ¥15 一道python难题3
  • ¥15 牛顿斯科特系数表表示
  • ¥15 arduino 步进电机
  • ¥20 程序进入HardFault_Handler
  • ¥15 oracle集群安装出bug
  • ¥15 关于#python#的问题:自动化测试