zhuzhuzhanshi 2019-05-02 11:17 采纳率: 0%

已结题

python运行有错误：这是对数据进行分析生成可视化界面的程序(我是小白，请说下解决方法)

运行错误：
C:\Users\Administrator\PycharmProjects\untitled\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py
一共有：16590个
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.808 seconds.
Prefix dict has been built succesfully.
Traceback (most recent call last):
File "C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py", line 252, in <module>
jiebaclearText(content)
File "C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py", line 97, in jiebaclearText
f_stop_text = f_stop.read()
File "D:\python111\lib\codecs.py", line 321, in decode
(result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 3: invalid start byte

Process finished with exit code 1

代码如下：
'''
data : 2019.3.28
goal : 可视化分析获取到的数据
'''
import csv

# Column-wise storage for the CSV data; read_csv() appends one entry to each
# list per data row.
time = []  # values from row[0]
nickName = []  # values from row[1]
gender = []  # values from row[2]
cityName = []  # values from row[3]
userLevel = []  # values from row[4]
score = []  # values from row[5]
content = ''  # placeholder; the driver code rebinds it to read_csv()'s result

# 读数据

def read_csv(csv_path=r'D:\maoyan.csv'):
    """Load the comment CSV into the module-level column lists.

    The first row is treated as a header and skipped. For every following
    non-empty row, columns 0-5 are appended to the module-level lists
    ``time``, ``nickName``, ``gender``, ``cityName``, ``userLevel`` and
    ``score``; column 6 (the comment text) is collected separately.

    Args:
        csv_path: Location of the CSV file. Defaults to the path the script
            originally hard-coded, so existing ``read_csv()`` calls behave
            as before.

    Returns:
        All comment texts (column 6) concatenated into one string, with no
        separator between them.
    """
    comments = []
    # 'utf_8_sig' decodes UTF-8 and drops a leading BOM when one is present.
    with open(csv_path, 'r', encoding='utf_8_sig', newline='') as file_test:
        reader = csv.reader(file_test)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; indexing them fails.
                continue
            time.append(row[0])
            nickName.append(row[1])
            gender.append(row[2])
            cityName.append(row[3])
            userLevel.append(row[4])
            score.append(row[5])
            comments.append(row[6])
    # Report the number of data rows (never negative, even for an empty file).
    print('一共有：' + str(len(comments)) + '个')
    # Join once instead of growing a string with += inside the loop.
    return ''.join(comments)

import re, jieba

# 词云生成工具

from wordcloud import WordCloud, ImageColorGenerator

# 需要对中文进行处理

import matplotlib.font_manager as fm
from pylab import *

mpl.rcParams['font.sans-serif'] = ['SimHei']
from os import path

# Directory that contains this script; asset and output paths are built from
# it. abspath() guards against a relative __file__, for which dirname() alone
# would return '' and turn the later "\picture\..." paths into root-relative
# ones.
d = path.dirname(path.abspath(__file__))

# Raw string: '\k' and '\c' are not valid escape sequences, so the non-raw
# form only worked by accident and triggers an invalid-escape warning.
stopwords_path = r'D:\ku\chineseStopWords.txt'

# 评论词云分析

def word_cloud(content):
    """Render an HTML word cloud of the 500 most frequent comment words.

    The text is stripped of punctuation/whitespace, segmented with jieba,
    filtered against the stop-word file at ``stopwords_path`` and counted.
    The result is written to ``picture\\c_wordcloud.html`` under the script
    directory ``d``.

    Args:
        content: All comment texts as one string.
    """
    import re

    import jieba
    import pandas as pd
    from pyecharts import WordCloud

    # Remove filler characters from the comments.
    content = content.replace(" ", ",")
    # NOTE(review): as written this replace can never match, because the line
    # above already consumed every space. The original probably targeted a
    # different whitespace character (e.g. a full-width space) - confirm.
    content = content.replace(" ", "、")
    content = re.sub('[,，。. \r\n]', '', content)

    segment = jieba.lcut(content)
    words_df = pd.DataFrame({'segment': segment})

    # quoting=3 is csv.QUOTE_NONE: quote characters in the file are data.
    read_kwargs = dict(index_col=False, quoting=3, sep="\t", names=['stopword'])
    try:
        stopwords = pd.read_csv(stopwords_path, encoding='utf-8', **read_kwargs)
    except UnicodeDecodeError:
        # Chinese stop-word lists are frequently saved as GBK/GB18030, which
        # is not decodable as UTF-8; retry with the GB18030 superset.
        stopwords = pd.read_csv(stopwords_path, encoding='gb18030', **read_kwargs)

    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # size() + reset_index(name=...) replaces the dict-renaming form
    # .agg({"计数": numpy.size}), which current pandas rejects.
    words_stat = (
        words_df.groupby('segment')
        .size()
        .reset_index(name='计数')
        .sort_values(by=['计数'], ascending=False)
    )

    top_words = words_stat.head(500)
    codes = top_words['segment'].tolist()
    counts = top_words['计数'].tolist()

    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("影评词云", codes, counts, word_size_range=[20, 100])
    # Raw string: '\p' and '\c' are invalid escapes in a normal literal.
    wordcloud.render(d + r"\picture\c_wordcloud.html")

# 定义一个函数用于分词

def jiebaclearText(text):
    """Segment *text* with jieba and drop stop words and one-character tokens.

    Args:
        text: Raw comment text.

    Returns:
        The remaining words joined by single spaces.

    Raises:
        UnicodeDecodeError: If the stop-word file at ``stopwords_path`` is
            decodable neither as UTF-8 nor as GB18030.
    """
    text = re.sub('[,，。. \r\n]', '', text)
    # Precise-mode segmentation; the generator is joined with '/' so the
    # unwanted fragments below can be removed in one pass over the string.
    seg_list = jieba.cut(text, cut_all=False)
    listStr = '/'.join(seg_list)
    listStr = listStr.replace("class", "")
    listStr = listStr.replace("span", "")
    listStr = listStr.replace("悲伤逆流成河", "")

    # Load the stop-word list. The file may not be UTF-8 (Chinese word lists
    # are commonly saved as GBK), which previously aborted the script with
    # UnicodeDecodeError; fall back to GB18030, a superset of GBK.
    try:
        with open(stopwords_path, encoding="utf8") as f_stop:
            f_stop_text = f_stop.read()
    except UnicodeDecodeError:
        with open(stopwords_path, encoding="gb18030") as f_stop:
            f_stop_text = f_stop.read()

    # One stop word per line; a set gives O(1) membership tests.
    stop_words = {line.strip() for line in f_stop_text.split("\n")}

    # Keep words that are not stop words and are longer than one character.
    # The membership test uses strip(); the earlier split() produced a list,
    # which never equals a stop-word string, so nothing was filtered out.
    mywordList = [
        myword
        for myword in listStr.split('/')
        if myword.strip() not in stop_words and len(myword.strip()) > 1
    ]
    return ' '.join(mywordList)

# 生成词云图

def make_wordcloud(text1):
    """Draw a word cloud from *text1* and save it as a PNG.

    The image ``static/znn1.jpg`` under the script directory ``d`` is used
    both as the mask and as the colour source. The result is written to
    ``picture/word_cloud.png``.

    Args:
        text1: Text to draw; occurrences of "悲伤逆流成河" are removed first.
    """
    cleaned_text = text1.replace("悲伤逆流成河", "")
    mask_image = plt.imread(d + "/static/znn1.jpg")
    font_file = d + '/static/simkai.ttf'

    cloud_options = dict(
        background_color="white",
        width=890,
        height=600,
        mask=mask_image,
        max_font_size=150,
        random_state=50,
        font_path=font_file,  # font file able to render Chinese glyphs
    )
    cloud = WordCloud(**cloud_options).generate_from_text(cleaned_text)

    # Font properties object for the same font file; it is created here but
    # not referenced again in this function.
    my_font = fm.FontProperties(fname=font_file)

    # Recolour the cloud with colours sampled from the mask image.
    image_colours = ImageColorGenerator(mask_image)
    plt.imshow(cloud.recolor(color_func=image_colours))
    plt.axis("off")

    cloud.to_file(d + r"/picture/word_cloud.png")

# 评论者性别分布可视化

def sex_distribution(gender):
    """Render a pie chart of the gender codes to ``picture\\sex_pie.html``.

    Args:
        gender: Sequence of gender codes as strings: '0' (unknown), '1'
            (male) or '2' (female). Other values are not counted.
    """
    from pyecharts import Pie

    labels = ["其他", "男", "女"]
    # The codes line up positionally with the labels above.
    tallies = [gender.count(code) for code in ('0', '1', '2')]

    chart = Pie("性别饼图")
    chart.add("", labels, tallies, is_label_show=True)
    chart.render(d + r"\picture\sex_pie.html")

# 评论者所在城市分布可视化

def city_distribution(cityName):
    """Render a bar chart of comment counts per city, most frequent first.

    The chart is written to ``picture\\city_bar.html`` under the script
    directory ``d``.

    Args:
        cityName: Sequence of city names, one entry per comment.
    """
    from collections import Counter

    from pyecharts import Bar

    # Count in a single pass instead of calling list.count() once per
    # distinct city. most_common() orders by descending count and keeps
    # first-seen order among ties.
    ranked = Counter(cityName).most_common()
    city_name = [city for city, _ in ranked]
    city_num = [count for _, count in ranked]

    bar = Bar("评论者城市分布")
    bar.add("", city_name, city_num, is_label_show=True, is_datazoom_show=True)
    bar.render(d + r"\picture\city_bar.html")

# 每日评论总数可视化分析

def time_num_visualization(time):
    """Render a line chart of how many comments share each time value.

    The (value, count) pairs are printed and the chart is written to
    ``picture\\c_num_line.html`` under the script directory ``d``.

    Args:
        time: Sequence of time values, one entry per comment. Entries are
            ordered by plain ascending comparison, so the x axis is
            chronological only if the values sort that way - presumably
            date strings; verify against the CSV.
    """
    from collections import Counter

    from pyecharts import Line

    # Count in a single pass instead of calling list.count() once per
    # distinct value, then order the pairs by the time value itself.
    sort_dict = sorted(Counter(time).items(), key=lambda item: item[0])
    print(sort_dict)
    time_name = [stamp for stamp, _ in sort_dict]
    time_num = [count for _, count in sort_dict]

    line = Line("评论数量日期折线图")
    line.add(
        "日期-评论数",
        time_name,
        time_num,
        is_fill=True,
        area_color="#000",
        area_opacity=0.3,
        is_smooth=True,
    )
    line.render(d + r"\picture\c_num_line.html")

# 评论者猫眼等级、评分可视化

def level_score_visualization(userLevel, score):
    """Render two pie charts: one for user levels and one for scores.

    The level chart is written to ``picture\\level_pie.html`` and the score
    chart to ``picture\\score_pie.html`` under the script directory ``d``.

    Args:
        userLevel: Sequence of user-level values, one entry per comment.
        score: Sequence of score values, one entry per comment.
    """
    from pyecharts import Pie

    def _distinct_with_counts(values):
        # Distinct values (in set iteration order) and how often each occurs.
        distinct = list(set(values))
        return distinct, [values.count(item) for item in distinct]

    userLevel_list, userLevel_num = _distinct_with_counts(userLevel)
    score_list, score_num = _distinct_with_counts(score)

    # Ring-shaped pie for the user levels.
    level_chart = Pie("等级环状饼图", title_pos='center', width=900)
    level_chart.add(
        "等级",
        userLevel_list,
        userLevel_num,
        radius=[40, 75],
        label_text_color=None,
        is_label_show=True,
        legend_orient="vertical",
        legend_pos="left",
    )
    level_chart.render(d + r"\picture\level_pie.html")

    # Rose-type pie for the scores.
    score_chart = Pie("评分玫瑰饼图", title_pos='center', width=900)
    score_chart.add(
        "评分",
        score_list,
        score_num,
        center=[50, 50],
        is_random=True,
        radius=[30, 75],
        rosetype="area",
        is_legend_show=False,
        is_label_show=True,
    )
    score_chart.render(d + r"\picture\score_pie.html")

# Reset the column lists so read_csv() appends into empty containers.
time = []
nickName = []
gender = []
cityName = []
userLevel = []
score = []
content = ''
content = read_csv()

# 1. Word cloud image.
# Segment and stop-word-filter the comments first, then draw from the cleaned,
# space-separated text. Earlier the result of jiebaclearText() was discarded
# and the raw, unsegmented text was drawn instead.
make_wordcloud(jiebaclearText(content))

# pyecharts word cloud (performs its own segmentation on the raw text).
word_cloud(content)

# 2. Gender distribution.
sex_distribution(gender)

# 3. City distribution.
city_distribution(cityName)

# 4. Number of comments over time.
time_num_visualization(time)

# 5. User level and score.
level_score_visualization(userLevel, score)

写回答
好问题 0 提建议
追加酬金
关注问题
分享
邀请回答
编辑收藏删除
收藏举报

1条回答默认最新

关注

码龄粉丝数原力等级 --

被采纳

被点赞

采纳率
wangshubin5 2019-05-02 15:03
关注
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 0

这是 Python 读取文件时的编码/解码问题：我这个错误的意思是 'utf-8' 编解码器无法解码位置 0 的那个字节（0xa1）。0xa1 不是合法的 UTF-8 起始字节，说明被读取的文件并不是以 UTF-8 编码保存的（中文文本文件常见的是 GBK/GB18030 编码）。

解决办法：

with open(r'D:\maoyan.csv', 'r', encoding='utf_8_sig', newline='') as file_test:

修改：**with open(r'D:\maoyan.csv', 'r', encoding='utf-8', newline='') as file_test:**

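也就是在读取数据的时候，显式指定编码方式 encoding='utf-8'，别的编码也可以试试哟，如：gb18030。注意：题目中的 Traceback 指向的是 jiebaclearText 里读取停用词文件的那一行（f_stop.read()），所以同样要检查 open(stopwords_path, encoding="utf8") 这一处的编码，例如改为 encoding='gb18030'。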

解决无用
评论打赏
分享
举报

评论

按下Enter换行，Ctrl+Enter发表内容

报告相同问题？

关注问题

windows下bat运行python的exe程序报“Error Loading python DLL”错误 python
2022-04-26 23:33

回答 2 已采纳 1、先解决你原先报错的问题-D, --onedir 创建一个包含可执行文件的单文件夹捆绑包（默认）-F, --onefile 创建一个文件捆绑可执行文件。不加参数默认是-D，打包成一个文件
pycharm运行四个程序，两个可以正常运行，两个显示No Python at 'D:\Python\python.exe' pycharm python 有问必答
2022-01-02 17:13

回答 4 已采纳这是解释器路径的原因，因为你改了解释器，但是你之前的两个程序，已经存储了之前的Python解释器路径，和现有的不匹配，所以才会报错。如果答案，您满意，请采纳意见和点赞关注，支持一下，谢谢！！！
Python的可视化：类型错误：add_xaxis() 需要 2 个位置参数，但给出了 4 个 python 大数据
2021-11-16 10:22

回答 1 已采纳 .add_xaxis(['中','美','日']) 这样
基于Python的中国城市轨道交通数据可视化分析源码+项目说明.zip
2023-08-10 10:56

通过这个项目可以练习使用Python数据可视化分析相关的强大的库和模块，练习绘制简单的GUI界面并且连接数据库，更加深了对Python语言的学习和拓展。本项目也可作为学校的大作业、大实验实践或者课程设计等的选题项目...
python运行程序时IndexError: index 3 is out of bounds for axis 1 with size 3如何解决 python 开发语言有问必答
2022-02-28 00:31

回答 2 已采纳你train中索引index访问超出最大索引了
python运行问题IndexError: tuple index out of range python 有问必答
2022-03-18 17:09

回答 3 已采纳 data1.append((start_num[i], end_num[i], data[i][75]))中data[i][75]是取每行的第76个字段, 你数据库中有76个字段吗没有就是tuple
python多进程程序打包后，运行出现多个窗口，应该如果解决 python
2021-04-13 21:29

回答 1 已采纳 multiprocessing.freeze_support() 放在入口py文件（你的主程序）的if __name__=="__main__":的后面
Python数据可视化：可视化数据分析插件D-Tale(2)
2024-04-21 11:42

2401_83704252的博客接下来可以对浏览器中的可视化图表做任意的修改操作，调整可视化的宽度、字段调整、图表分析等等。● 如果你是一名java程序员，面对已经写好的python脚本该如何调用，其实很简单！点击左上角的开始按钮可以调出所有...
python中{:2d}是什么意思 python 有问必答
2021-07-07 02:04

回答 2 已采纳 d表示要输出一个整数，2表示这个整数至少要占2个字符，如果这个整数只有一位数不足2个字符，会在前面补充空格。
在 python scrapy爬虫框架：response.xpath（）的返回值是[ ],这个怎么解决？ python
2020-07-03 11:16

回答 4 已采纳考虑网页的内容使用了ajax，使用右键-》查看网页源代码，看是否仍然能获得指定的内容
请编写python程序解决以下问题： python
2020-03-03 18:29

回答 2 已采纳试试如下代码 ``` #磅和千克的互换 print('1.磅转千克\n2.千克转磅') choice=input() print('输入转换量:') inp=float(input())
1行代码实现Python数据分析：图表美观清晰，自带对比功能丨开源_python 数据可视化对比
2024-04-20 21:59

夏侯学Android的博客如果对Python感兴趣的话，可以试试我的学习方法以及相关的学习资料。
python 随机生成10个数并升序排序升序排序的那部分代码不太明白想知道是怎么实现的 python
2022-03-23 23:55

回答 2 已采纳【若能有所悟，望给个采纳，谢谢】1、enumerate：这个是与for循环配合实用，用于遍历数据对象（列表、元组或字符串均可）2、for i,j enumerate(x)这里的i=指的是数组的下标值，
一文带你读懂PyQt：用Python做出与C++一样的GUI界面应用程序
2021-05-09 16:49

LaoYuanPython的博客本文介绍了Python的图形化界面应用开发工具PyQt的功能和开发框架，通过PyQt的这些重要的工具、功能和框架机制，开发人员可以设计对应的GUI图形化界面、定义不同部件的操作及响应、捕获部件或应用的消息以及实现界面...
1行代码实现Python数据分析：图表美观清晰，自带对比功能丨开源_python 数据可视化对比(1)
2024-04-23 16:37

2401_84121798的博客如果对Python感兴趣的话，可以试试我的学习方法以及相关的学习资料。
没有解决我的问题, 去提问

悬赏问题

¥50 永磁型步进电机PID算法
¥15 sqlite 附加（attach database）加密数据库时，返回26是什么原因呢？
¥88 找成都本地经验丰富懂小程序开发的技术大咖
¥15 如何处理复杂数据表格的除法运算
¥15 如何用stc8h1k08的片子做485数据透传的功能？(关键词-串口)
¥15 有兄弟姐妹会用word插图功能制作类似citespace的图片吗？
¥200 uniapp长期运行卡死问题解决
¥15 latex怎么处理论文引理引用参考文献
¥15 请教：如何用postman调用本地虚拟机区块链接上的合约？
¥15 为什么使用javacv转封装rtsp为rtmp时出现如下问题：[h264 @ 000000004faf7500]no frame？