UnicodeDecodeError: 'gbk' codec can't decode byte 0x93 in position 596: illegal multibyte sequence
if __name__ == '__main__':
    # Build a vocabulary from the IMDB training reviews (pos + neg folders)
    # and pickle the fitted Word2Sequence object to disk.
    ws = Word2Sequence()
    path = r"D:\data\Desktop\aclImdb_v1.tar\aclImdb_v1\aclImdb\train"
    temp_data_path = [os.path.join(path, "pos"), os.path.join(path, "neg")]
    for data_path in temp_data_path:
        file_paths = [
            os.path.join(data_path, file_name)
            for file_name in os.listdir(data_path)
            if file_name.endswith("txt")
        ]
        for file_path in tqdm(file_paths):
            # Decode explicitly as UTF-8. Without `encoding`, open() falls back
            # to the locale codec (e.g. GBK on Chinese Windows), which raises
            # UnicodeDecodeError on these review files.
            with open(file_path, encoding="utf-8") as review_file:
                sentence = tokenlize(review_file.read())
            ws.fit(sentence)
    # NOTE(review): presumably min is a frequency cutoff and max_feature a
    # vocabulary size cap -- confirm against Word2Sequence.build_vocab.
    ws.build_vocab(min=10, max_feature=5000)
    # pickle.dump writes bytes, so the target must be opened in 'wb'; the
    # previous 'rb' mode opened it read-only (and required it to exist already).
    with open("../pythonProject/ws.pkl", "wb") as pickle_file:
        pickle.dump(ws, pickle_file)
    print(len(ws))
按照网上的解答,在open()里加入encoding
# NOTE(review): this call opens the pickle output file, not the review text files
# read earlier via open(file_path).read(), so adding `encoding` here cannot affect
# how those files are decoded. Python's open() also rejects an `encoding` argument
# in binary mode, and 'rb' is read-only while pickle.dump needs a writable file.
pickle.dump(ws, open("../pythonProject/ws.pkl",'rb',encoding='utf-8'))
但不论是加入encoding='utf-8'还是再加一个errors='ignore'
依然报相同的错误
是不是环境的问题啊?