# -*- coding: utf-8 -*-
import codecs
import xlrd2
def read_dic(dic_path):
words_dic=xlrd2.open_workbook(dic_path)
#打开工作簿,也就是词表
sheet=words_dic.sheets()[0]
dic_list=list(sheet.col_values(1))[1:]
#通过遍历找出每一列的数据,括号里面的代表只取第几列的值
return dic_list
#完成字典词表的输入
def read_file(file_path):
with codecs.open(file_path,"r",encoding="utf8")as f:
file=f.readlines()
return file
#完成被处理的文件的输入
def cut_word(raw_sentences,word_dic):
#设一个切词的函数
max_length=max(len(word) for word in word_dic)
new_cut=[]
#建一个新的空白集合,留着装新切的词
for sentence in raw_sentences:
sentence=sentence.strip()
#把句子两边的空格都删掉
sent_length=len(sentence)
cut_word_list=[]
while sent_length>0:
cut_length=(min(sent_length,max_length))
for i in range(cut_length,0,-1):
#从0到cut_length的范围,步数为-1,即每次都减少1
new_word = [sent_length]
if new_word in word_dic:
new_cut.append(new_word)#如果切的新词在词典里面有,就把它放进新集合里
cut_length=cut_length-i
break
elif i==1:
new_cut.append(new_word)
sent_length=sent_length-1
words='/'.join(str(cut_word_list))
new_cut.append(words.lstrip('/'))
return new_cut
def out_file(out_path,sentences):
with codecs.open(out_path,'a','utf8')as f:
for sentence in sentences:
f.write(sentence)
def main():
file_path=r"C:\Users\Apple\Desktop\7.txt"
file=read_file(file_path)
dic_path=r"C:\Users\Apple\Desktop\词表.xlsx"
words_dic=read_dic(dic_path)
content_cut=cut_word(file,words_dic)
out_path=r"C:\Users\Apple\Desktop\分词结果.txt"
out_file(out_path,content_cut)
if __name__ == '__main__':
main()
File "C:/Users/Apple/AppData/Roaming/JetBrains/PyCharmCE2020.3/scratches/正向最大匹配运算.py", line 33, in cut_word
if new_word in word_dic:
TypeError: 'in <string>' requires string as left operand, not list
请教大家,出现这种报错怎么办呀