
数据都已经处理好了,只差对 DataFrame 的循环:先用 d2=data.loc[data['类别']=='娱乐'] 取出“娱乐”类别的数据,然后循环其中的“内容”列,逐条进行切词,应该怎么做?
# Toy corpus: one news snippet per category ('体育' and '娱乐').
data = DataFrame(
    {
        '类别': ['体育', '娱乐'],
        '内容': [
            '鲍勃库西奖归谁属? NCAA最强控卫是坎巴还是弗神新浪体育讯如今,本赛季的NCAA进入到',
            '《翻滚吧阿信》台北首映 彭于晏获封准影帝(图)新浪娱乐讯 昨日(8月11日),电影《翻滚吧',
        ],
    }
)
data  # notebook-style echo of the full table

# Keep only the rows whose category equals '娱乐'.
d2 = data[data['类别'].eq('娱乐')]
# Tokenize each selected article on its own and pool the tokens.
# Looping over the column (rather than calling str() on the whole ``.values``
# array) keeps the array repr's brackets, quotes and line breaks out of the
# token stream, and works for any number of rows.
cut_d2 = []
for text in d2['内容'].dropna():  # skip missing articles instead of cutting "nan"
    cut_d2.extend(jieba.lcut(str(text), cut_all=False))

# Read the stop-word list; the context manager guarantees the file is closed.
with open(r'stop_words.txt', encoding='utf-8') as file_path:
    stop_words = file_path.read()

# Compare against whole entries, not substrings of the raw file text.
# NOTE(review): assumes entries are whitespace-separated (typically one per
# line) -- confirm against the actual stop_words.txt.
stop_word_set = set(stop_words.split())

# Drop stop words and single-character tokens in a single pass.
new_cut2 = [
    word
    for word in cut_d2
    if len(word) > 1 and word not in stop_word_set
]
from collections import Counter

# Count word frequencies.
TOP_N_WORDS = 300  # size of the ranked vocabulary kept for the table

# Counter is a dict subclass, so wordsDict still maps word -> occurrences.
# Single-character tokens are not counted as words.
wordsDict = Counter(word for word in new_cut2 if len(word) != 1)

# Most frequent first; ties keep first-seen order, like the stable sort.
wordsDict_seq = wordsDict.most_common(TOP_N_WORDS)

# Passing ``columns`` to the constructor also yields a well-formed (empty)
# table when there are no words; assigning ``.columns`` afterwards would raise.
dd2 = pd.DataFrame(wordsDict_seq, columns=['词', '频次'])
import numpy as np

# Take the ranked words out of the frequency table: first as a NumPy array
# (c2), then as a plain Python list (list2).
c2 = np.array(dd2.loc[:, "词"])
list2 = c2.tolist()
list2  # notebook-style echo of the result