其实我是先用第一段程序生成日期辅助表,
再用第二段程序利用这个表对弹幕文件逐日进行筛选。
“打开CSV文件并清洗数据
思路如下:
打开csv文件,获取保存的弹幕文本
清除无意义的标点符号,这里可以自己进行相应的设置
利用字典记录弹幕出现的次数
根据弹幕次数排序字典
每次写入次数最多的十条弹幕信息到另外的CSV文件中”
import datetime
from scipy.io import savemat
def create_assist_date(datestart=None, dateend=None):
    """Build the auxiliary table of consecutive calendar dates.

    Args:
        datestart: First date as a ``'%Y-%m-%d'`` string. Defaults to
            ``'2016-01-01'`` when ``None``.
        dateend: Last date (inclusive) as a ``'%Y-%m-%d'`` string. Defaults
            to the current local date when ``None``.

    Returns:
        A list of ``'%Y-%m-%d'`` strings, one per day from ``datestart``
        through ``dateend`` inclusive. The list is empty when ``datestart``
        is later than ``dateend``.

    Raises:
        ValueError: If either argument does not match ``'%Y-%m-%d'``.
    """
    if datestart is None:
        datestart = '2016-01-01'
    if dateend is None:
        dateend = datetime.datetime.now().strftime('%Y-%m-%d')

    # Parse both bounds so they can be compared and stepped as dates.
    current = datetime.datetime.strptime(datestart, '%Y-%m-%d')
    last = datetime.datetime.strptime(dateend, '%Y-%m-%d')
    one_day = datetime.timedelta(days=1)

    date_list = []
    # Test before appending so an inverted range yields no dates at all
    # instead of a start date that lies outside the requested range.
    while current <= last:
        date_list.append(current.strftime('%Y-%m-%d'))
        current += one_day
    return date_list
date_list 就是我说的日期辅助表,
但它只是 create_assist_date 函数内部的返回值,没有赋给任何模块级变量,
所以第二段程序中的 for date in date_list 从这里获取不到它。
2.
import jieba
import re,string
from zhon.hanzi import punctuation
import os
import csv
# Running totals shared across every processed day: occurrences of each
# cleaned danmaku text, and the number of CSV rows read so far.
danmuCount = dict()
danmuNum = 0
punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。?》《{} oh1O○〇●哈'

# Escape the characters before placing them in a character class: the raw
# "+-=" would otherwise be read as the range U+002B..U+003D and silently
# strip every ASCII digit as well. Compiled once, outside the loops.
punc_pattern = re.compile("[%s]+" % re.escape(punc))

# The date table has to be bound at module level; the function only returns
# it, so without this call the loop below has no `date_list` to iterate.
date_list = create_assist_date()

# newline='' is what the csv module requires for files handed to
# csv.reader / csv.writer.
with open('danmuku3.csv', 'a', encoding='utf-8', newline='') as savefile:
    writer = csv.writer(savefile)
    writer.writerow(['name', 'type', 'value', 'date'])
    for date in date_list:
        with open('csv3/danmutext_' + date + '.csv', 'r',
                  encoding='utf-8', newline='') as csvfile:
            print('---分析日期', date, '弹幕...\n')
            reader = csv.reader(csvfile)
            for line in reader:
                danmuNum = danmuNum + 1
                # Merge the row's fields into one string, then drop the
                # characters listed in `punc`.
                line = punc_pattern.sub("", "".join(line))
                # Only texts of 2 to 15 characters are counted.
                if 2 <= len(line) <= 15:
                    danmuCount[line] = danmuCount.get(line, 0) + 1

        # Rank the cumulative counts after each day and record the top ten.
        # Nothing is written until more than ten distinct texts exist.
        sortList = sorted(danmuCount.items(), key=lambda item: item[1], reverse=True)
        if len(sortList) > 10:
            for text, count in sortList[:10]:
                writer.writerow([text, 'Chinese', count, date])
原链接代码:https://blog.csdn.net/qq_36178962/article/details/108125794