Runtime error:
C:\Users\Administrator\PycharmProjects\untitled\venv\Scripts\python.exe C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py
一共有:16590个
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.808 seconds.
Prefix dict has been built succesfully.
Traceback (most recent call last):
  File "C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py", line 252, in <module>
    jiebaclearText(content)
  File "C:/Users/Administrator/PycharmProjects/untitled/dianying/src/analysis_data.py", line 97, in jiebaclearText
    f_stop_text = f_stop.read()
  File "D:\python111\lib\codecs.py", line 321, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa1 in position 3: invalid start byte
Process finished with exit code 1
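Byte 0xa1 is not a valid UTF-8 start byte, but it is a routine lead byte in GBK/GB18030-encoded Chinese text, so the stopword file read at line 97 is almost certainly not UTF-8. A quick standard-library probe to confirm which encoding the file actually uses (the candidate list is an assumption; the path matches stopwords_path in the code below):

    raw = open(r'D:\ku\chineseStopWords.txt', 'rb').read()
    for enc in ('utf-8', 'utf-8-sig', 'gb18030', 'big5'):
        try:
            raw.decode(enc)
            print(enc, 'decodes cleanly')
        except UnicodeDecodeError as e:
            print(enc, 'fails:', e)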
The code is as follows:
'''
date : 2019.3.28
goal : visual analysis of the collected data
'''
import csv
time = []
nickName = []
gender = []
cityName = []
userLevel = []
score = []
content = ''
# Read the data
def read_csv():
    content = ''
    # Read the file contents
    with open(r'D:\maoyan.csv', 'r', encoding='utf_8_sig', newline='') as file_test:
        # Parse the file as CSV
        reader = csv.reader(file_test)
        i = 0
        for row in reader:
            if i != 0:  # skip the header row
                time.append(row[0])
                nickName.append(row[1])
                gender.append(row[2])
                cityName.append(row[3])
                userLevel.append(row[4])
                score.append(row[5])
                content = content + row[6]
            # print(row)
            i = i + 1
        print('一共有:' + str(i - 1) + '个')
    return content
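# Side note, unrelated to the crash: the index-counting loop above can be
# written more compactly. A hedged sketch with pandas (the helper name, and
# the assumption that D:\maoyan.csv keeps its header row and the column
# order time/nickName/gender/cityName/userLevel/score/content, are ours):
def read_csv_pandas():
    import pandas as pd
    df = pd.read_csv(r'D:\maoyan.csv', encoding='utf_8_sig', dtype=str).fillna('')
    print('一共有:' + str(len(df)) + '个')
    # Columns addressed by position, mirroring row[0]..row[6] in the loop above
    time, nickName, gender, cityName, userLevel, score = (
        df.iloc[:, k].tolist() for k in range(6))
    content = ''.join(df.iloc[:, 6])
    return time, nickName, gender, cityName, userLevel, score, content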
import re, jieba
# Word-cloud generation tool
from wordcloud import WordCloud, ImageColorGenerator
# Chinese text needs a Chinese-capable font
import matplotlib.font_manager as fm
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei']
from os import path

d = path.dirname(__file__)
stopwords_path = r'D:\ku\chineseStopWords.txt'  # raw string so the backslashes survive
# Word-cloud analysis of the comments (pyecharts version)
def word_cloud(content):
    import jieba, re, numpy
    from pyecharts import WordCloud
    import pandas as pd
    # Strip stray characters from the concatenated comments
    content = content.replace(" ", ",")
    content = content.replace(" ", "、")
    content = re.sub('[,,。. \r\n]', '', content)
    segment = jieba.lcut(content)
    words_df = pd.DataFrame({'segment': segment})
    # quoting=3 means nothing in stopwords.txt is treated as quoted
    stopwords = pd.read_csv(stopwords_path, index_col=False, quoting=3, sep="\t",
                            names=['stopword'], encoding='utf-8')
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
    # Dict aggregation on a grouped Series works on the pandas of this era;
    # newer pandas would want words_df['segment'].value_counts() instead
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    test = words_stat.head(500).values
    codes = [test[i][0] for i in range(0, len(test))]
    counts = [test[i][1] for i in range(0, len(test))]
    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("影评词云", codes, counts, word_size_range=[20, 100])
    wordcloud.render(d + r"\picture\c_wordcloud.html")
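# The pd.read_csv call above opens the same stopword file with
# encoding='utf-8', so it would raise the same UnicodeDecodeError as the
# traceback once jiebaclearText gets past the crash. A hedged sketch of the
# call with a fallback (helper name is ours; assumes the file is GBK-family
# encoded if it is not UTF-8):
def read_stopwords_df(path):
    import pandas as pd
    try:
        return pd.read_csv(path, index_col=False, quoting=3, sep="\t",
                           names=['stopword'], encoding='utf-8')
    except UnicodeDecodeError:
        # GB18030 is a superset of GBK/GB2312 and accepts lead byte 0xa1
        return pd.read_csv(path, index_col=False, quoting=3, sep="\t",
                           names=['stopword'], encoding='gb18030')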
# Helper that tokenizes the text and strips stopwords
def jiebaclearText(text):
    # Collect the tokens that survive stopword removal
    mywordList = []
    text = re.sub('[,,。. \r\n]', '', text)
    # Tokenize
    seg_list = jieba.cut(text, cut_all=False)
    # Join the generator's output with /
    listStr = '/'.join(seg_list)
    listStr = listStr.replace("class", "")
    listStr = listStr.replace("span", "")
    listStr = listStr.replace("悲伤逆流成河", "")
    # Open the stopword list -- this is the read the traceback points at;
    # the file is evidently not UTF-8 (see the loader sketched below)
    f_stop = open(stopwords_path, encoding="utf8")
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()  # release the file handle
    # Split the stopword text on \n, one stopword per list entry
    f_stop_seg_list = f_stop_text.split("\n")
    # Walk the tokens and drop the stopwords
    for myword in listStr.split('/'):
        # Compare the stripped token itself; a list from split(), as in the
        # original, would never match a string in the stopword list
        if myword.strip() not in f_stop_seg_list and len(myword.strip()) > 1:
            mywordList.append(myword)
    return ' '.join(mywordList)
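# The UnicodeDecodeError comes from the open(..., encoding="utf8") call
# above: byte 0xa1 is invalid as a UTF-8 start byte but routine in
# GBK/GB18030 files, and Chinese stopword lists often circulate in those
# encodings. Re-saving chineseStopWords.txt as UTF-8 also fixes it; failing
# that, a minimal sketch of a more forgiving loader (the function is ours,
# and the encoding candidates are an assumption about the file):
def load_stopwords(path):
    # Try UTF-8 (with or without BOM) first, then GB18030, a superset of
    # GBK/GB2312 that accepts lead byte 0xa1
    for enc in ('utf-8-sig', 'gb18030'):
        try:
            with open(path, encoding=enc) as f:
                return [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            continue
    # Last resort: keep whatever decodes and silently drop the rest
    with open(path, encoding='utf-8', errors='ignore') as f:
        return [line.strip() for line in f if line.strip()]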
# Render the word-cloud image
def make_wordcloud(text1):
    text1 = text1.replace("悲伤逆流成河", "")
    bg = plt.imread(d + "/static/znn1.jpg")
    # Build the cloud
    wc = WordCloud(  # FFFAE3
        background_color="white",  # white background (default is black)
        width=890,  # image width
        height=600,  # image height
        mask=bg,
        # margin=10,  # image margin
        max_font_size=150,  # largest font size drawn
        random_state=50,  # seed, so layout and colors are reproducible
        font_path=d + '/static/simkai.ttf'  # Chinese support via a bundled font
    ).generate_from_text(text1)
    # Font for the figure
    my_font = fm.FontProperties(fname=d + '/static/simkai.ttf')
    # Color scheme sampled from the background image
    bg_color = ImageColorGenerator(bg)
    # Draw the cloud
    plt.imshow(wc.recolor(color_func=bg_color))
    # Hide the axes
    plt.axis("off")
    # Save the cloud image
    wc.to_file(d + r"/picture/word_cloud.png")
# Visualize the commenters' gender distribution
def sex_distribution(gender):
    # print(gender)
    from pyecharts import Pie
    list_num = []
    list_num.append(gender.count('0'))  # unknown
    list_num.append(gender.count('1'))  # male
    list_num.append(gender.count('2'))  # female
    attr = ["其他", "男", "女"]
    pie = Pie("性别饼图")
    pie.add("", attr, list_num, is_label_show=True)
    pie.render(d + r"\picture\sex_pie.html")
# Visualize the commenters' city distribution
def city_distribution(cityName):
    city_list = list(set(cityName))
    city_dict = {city_list[i]: 0 for i in range(len(city_list))}
    for i in range(len(city_list)):
        city_dict[city_list[i]] = cityName.count(city_list[i])
    # Sort by count (the dict values)
    sort_dict = sorted(city_dict.items(), key=lambda item: item[1], reverse=True)
    city_name = []
    city_num = []
    for i in range(len(sort_dict)):
        city_name.append(sort_dict[i][0])
        city_num.append(sort_dict[i][1])
    from pyecharts import Bar
    bar = Bar("评论者城市分布")
    bar.add("", city_name, city_num, is_label_show=True, is_datazoom_show=True)
    bar.render(d + r"\picture\city_bar.html")
# Visualize the number of comments per day
def time_num_visualization(time):
    from pyecharts import Line
    time_list = list(set(time))
    time_dict = {time_list[i]: 0 for i in range(len(time_list))}
    for i in range(len(time_list)):
        time_dict[time_list[i]] = time.count(time_list[i])
    # Sort by date (the dict keys)
    sort_dict = sorted(time_dict.items(), key=lambda item: item[0], reverse=False)
    time_name = []
    time_num = []
    print(sort_dict)
    for i in range(len(sort_dict)):
        time_name.append(sort_dict[i][0])
        time_num.append(sort_dict[i][1])
    line = Line("评论数量日期折线图")
    line.add(
        "日期-评论数",
        time_name,
        time_num,
        is_fill=True,
        area_color="#000",
        area_opacity=0.3,
        is_smooth=True,
    )
    line.render(d + r"\picture\c_num_line.html")
# Visualize the commenters' Maoyan user level and their ratings
def level_score_visualization(userLevel, score):
    from pyecharts import Pie
    userLevel_list = list(set(userLevel))
    userLevel_num = []
    for i in range(len(userLevel_list)):
        userLevel_num.append(userLevel.count(userLevel_list[i]))
    score_list = list(set(score))
    score_num = []
    for i in range(len(score_list)):
        score_num.append(score.count(score_list[i]))
    pie01 = Pie("等级环状饼图", title_pos='center', width=900)
    pie01.add(
        "等级",
        userLevel_list,
        userLevel_num,
        radius=[40, 75],
        label_text_color=None,
        is_label_show=True,
        legend_orient="vertical",
        legend_pos="left",
    )
    pie01.render(d + r"\picture\level_pie.html")
    pie02 = Pie("评分玫瑰饼图", title_pos='center', width=900)
    pie02.add(
        "评分",
        score_list,
        score_num,
        center=[50, 50],
        is_random=True,
        radius=[30, 75],
        rosetype="area",
        is_legend_show=False,
        is_label_show=True,
    )
    pie02.render(d + r"\picture\score_pie.html")
# (Re)initialize the module-level containers, then run the pipeline
time = []
nickName = []
gender = []
cityName = []
userLevel = []
score = []
content = ''
content = read_csv()
# 1. Word clouds
content = jiebaclearText(content)  # keep the cleaned text for the clouds
make_wordcloud(content)
# pyecharts word cloud
word_cloud(content)
# 2. Gender distribution
sex_distribution(gender)
# 3. City distribution
city_distribution(cityName)
# 4. Comments per day
time_num_visualization(time)
# 5. Level and rating
level_score_visualization(userLevel, score)