题目:用jiaba给一个文件里的文本进行词性切分,然后把词性标记都输入到一个txt文件中。
出现了很多错误(在最后)
以下是源代码和我的编写思路
#coding:utf8
import re,os,jieba.posseg
def para2sentences_ccl(para): # 这个函数负责文本里面的句子按照 句末标点 分好段。
marks = re.compile(r'。|?|……|?”|。”|!”')
if marks.search(para):
sentences_with_marks = marks.split(para)
sentences = [''.join(sentence).strip() for sentence in zip(sentences_with_marks[0::2],sentences_with_marks[1::2])]
else:
sentences = [para]
new_sentences = [s for s in sentences if s is not None]
return new_sentences
def file2sentences_CCl_pos(file_path):#用来用jieba对句子进行切分,并成立新的只有词性标记的文件
with open(file_path,'r',encoding='gb18030', errors='ignore') as fo:
raw_lines = fo.readlines() #是在整合材料成一行行。
cleaned_paras = [para2sentences_ccl(para) for para in raw_lines if para is not None] #把rawlines里面的句子按照para2sentence函数用句末标点分割。
seg_sentences = [jieba.posseg.lcut(sentence)for sentence in cleaned_paras]#把上一句里面的句子用jieba切分成一对对pair
flag_sentence = seg_sentences[1]#标出上一句切分出来的pair中的flag(词性标记)部分
names = os.path.split(file_path)#将输进来的文件路径分割成路径和文件名
txt_name = 'out'+names[1]#将上面的文件名前面加out
txt_path = file_path.join(names[0],txt_name)#即把路径和txtname文件名组合,形成一个新的文件路径以及txt文件
with open(txt_path,'r',-1,encoding='utf8')as txt:#打开这个新建文件
for flag in flag_sentence:#循环输出flag_sentences里面的flag
txt.writelines(flag + '\n')#每个flag用回车分成一段段
return flag_sentence
file2sentences_CCl_pos(r'D:\Users\DELL\Desktop\从普通女孩到银行家.txt')
以下是出现的错误:
Traceback (most recent call last):
File "C:/Users/DELL/PycharmProjects/111/homework/第五次作业.py", line 31, in <module>
file2sentences_CCl_pos(r'D:\Users\DELL\Desktop\从普通女孩到银行家.txt')
File "C:/Users/DELL/PycharmProjects/111/homework/第五次作业.py", line 20, in file2sentences_CCl_pos
seg_sentences = [jieba.posseg.lcut(sentence)for sentence in cleaned_paras]
File "C:/Users/DELL/PycharmProjects/111/homework/第五次作业.py", line 20, in <listcomp>
seg_sentences = [jieba.posseg.lcut(sentence)for sentence in cleaned_paras]
File "C:\ProgramData\Anaconda3\lib\site-packages\jieba\posseg\__init__.py", line 310, in lcut
return list(cut(sentence, HMM))
File "C:\ProgramData\Anaconda3\lib\site-packages\jieba\posseg\__init__.py", line 294, in cut
for w in dt.cut(sentence, HMM=HMM):
File "C:\ProgramData\Anaconda3\lib\site-packages\jieba\posseg\__init__.py", line 249, in cut
for w in self.__cut_internal(sentence, HMM=HMM):
File "C:\ProgramData\Anaconda3\lib\site-packages\jieba\posseg\__init__.py", line 217, in __cut_internal
sentence = strdecode(sentence)
File "C:\ProgramData\Anaconda3\lib\site-packages\jieba\_compat.py", line 79, in strdecode
sentence = sentence.decode('utf-8')
AttributeError: 'list' object has no attribute 'decode'
######请问 为什么会出现这些错误呢,要怎么更改