import jieba

# Load the user dictionary so domain-specific terms are segmented as whole words
file_userDict = 'add_word_list.txt'
jieba.load_userdict(file_userDict)

# Read the stopword list, one word per line
stop_file = "stopwordlist.txt"
stop_list = []
try:
    with open(stop_file, encoding='utf-8') as stopword_file:
        for line in stopword_file:
            stop_list.append(line.strip())  # drop the trailing newline
except OSError:
    print("error in stop_file")
f=open("法律(新)\服务贸易\中华人民共和国海南自由贸易港法(FBM-CLI.1.5015177).txt","r",encoding='utf-8')
txt=f.read()
words = jieba.lcut(txt)
counts = {}
for word in words:
if len(word) ==1:
continue
else:
counts[word] = counts.get(word,0) + 1
items = list(counts.items())
items.sort(key=lambda x:x[1], reverse=True)
items
I want this code to remove stopwords first, then segment the text and count word frequencies. However, the frequency counts it produces still include the stopwords. How should I modify it?
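A minimal sketch of a fix, based on the code above: stop_list is built but never consulted, so the counting loop keeps every multi-character token regardless of the stopword list. Adding a membership test inside the loop should be enough. The name stop_set is introduced here, not from the original code; it just converts the list to a set so lookups are fast.

# Filter stopwords while counting. stop_set is a new name introduced
# for this sketch; a set makes "word in ..." checks O(1) instead of
# scanning the whole list for every token.
stop_set = set(stop_list)

counts = {}
for word in words:
    # Skip single-character tokens and any word on the stopword list
    if len(word) == 1 or word in stop_set:
        continue
    counts[word] = counts.get(word, 0) + 1

items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
items

An alternative is to filter right after segmentation, e.g. words = [w for w in words if w not in stop_set]; either way, the key point is that the stopword list has to be checked somewhere between jieba.lcut and the counting loop.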