from gensim.models import word2vec
import gensim
import logging
import jieba,re,codecs
## jieba segmentation — load a custom user dictionary before cutting
jieba.load_userdict("E:/workplace/data/userdict.txt")

# BUG FIX: jieba.cut() expects a str (the sentence text), not a file object.
# Passing the open TextIOWrapper directly caused:
#   AttributeError: '_io.TextIOWrapper' object has no attribute 'decode'
# Read the file's contents first; a `with` block closes the handle
# deterministically even if reading fails.
with open("E:/workplace/data/test.txt", "r", encoding="utf-8") as test:
    text = test.read()

# cut_all=False -> precise (non-full) segmentation mode;
# HMM=True     -> enable the HMM for out-of-vocabulary word discovery.
words = list(jieba.cut(text, cut_all=False, HMM=True))
# Convert the token list back into a single string.
# NOTE(review): joining with "" reconstructs the original text and discards
# the segmentation boundaries — if segmented output is wanted, join with
# " " or "/" instead. Kept as-is to preserve the original behavior.
words = ''.join(words)
报错:
warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
Traceback (most recent call last):
File "<ipython-input-17-a64173a4fbe2>", line 1, in <module>
runfile('E:/workplace/code/untitled0.py', wdir='E:/workplace/code')
File "D:\Program Files (x86)\anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 866, in runfile
execfile(filename, namespace)
File "D:\Program Files (x86)\anaconda\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "E:/workplace/code/untitled0.py", line 15, in <module>
words=list(jieba.cut(test,cut_all=False,HMM=True))
File "D:\Program Files (x86)\anaconda\lib\site-packages\jieba\__init__.py", line 282, in cut
sentence = strdecode(sentence)
File "D:\Program Files (x86)\anaconda\lib\site-packages\jieba\_compat.py", line 37, in strdecode
sentence = sentence.decode('utf-8')
AttributeError: '_io.TextIOWrapper' object has no attribute 'decode'
test.txt和userdict.txt均使用utf-8编码。
test.txt内容如下: