jieba分词后,想要对分词进行词频统计和排序,使用print函数输出的是分词遍历的结果。
def chinese_word_cut(mytext):
jieba.load_userdict(dic_file)
jieba.initialize()
try:
stopword_list = open(stop_file, encoding='utf-8')
except:
stopword_list = []
print("error in stop_file")
stop_list = []
flag_list = ['n', 'nz', 'vn']
for line in stopword_list:
line = re.sub(u'\n|\\r', '', line)
stop_list.append(line)
# jieba分词
seg_list = psg.cut(mytext)
word_list = []
word_freq = {}
count_dict = dict()
for seg_word in seg_list:
# word = re.sub(u'[^\u4e00-\u9fa5]','',seg_word.word)
word = seg_word.word
find = 0
for stop_word in stop_list:
if stop_word == word or len(word) < 2: # this word is stopword
find = 1
break
if find == 0 and seg_word.flag in flag_list:
for word in word_list:
if word in word_freq:
count_dict[word] += 1
else:
count_dict[word] = 1
word_list.append(word)
print(sorted(count_dict.items(),key=lambda x:x[1],reverse=True))
运行显示的是分词遍历的结果:[('文化', 1)] [('标准', 1)] [('印发', 1)]
我尝试过把for word in word_freq 等循环语句放在不同的位置,但是显示结果都是分词的遍历
想要最终显示出[(key,value)],value是key的总计值,而非是“1”。