yingxiong0523
2022-03-29 11:39
采纳率: 50%
浏览 33
已结题

python读取csv文件遇到编码问题(自己批量生成多条摘要形成的csv)

问题遇到的现象和发生背景:已经使用 textrank 从多个 txt 文本中选取摘要,并汇总生成了一个 csv 文件;但再读取这个 csv 文件时一直报错。
从text批量生成摘要csv的代码:
import sys
from imp import reload
import os
 
# Python 2 only: force the interpreter-wide default encoding to UTF-8.
# sys.setdefaultencoding is hidden after start-up and only reappears after
# reload(sys). Python 3 has no such function, so the hack is skipped there
# explicitly instead of raising an error and swallowing it with a bare except.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')
 
import codecs
from textrank4zh import TextRank4Keyword, TextRank4Sentence
 
def work(file):
    """Print keywords and key phrases of one text file and return its summary.

    Args:
        file: Path of the text file to analyse; it is decoded as UTF-8.

    Returns:
        A string with one line per key sentence (three are requested), each
        formatted as "index , weight , sentence" and joined with newlines.
    """
    # Read the whole document; the context manager closes the handle even
    # when decoding fails part-way through.
    with codecs.open(file, 'r', 'utf-8') as source:
        text = source.read()

    # Keyword analysis on the lower-cased text with a window size of 2.
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=text, lower=True, window=2)

    print('关键词为:')
    # Request 20 keywords with a minimum word length of 1.
    for item in tr4w.get_keywords(num=20, word_min_len=1):
        print(item.word, item.weight)
    # The separator and the next section header are printed once, after the
    # loop, rather than once per keyword.
    print('\n')

    print('关键短语为:')
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        print(phrase)
    print('\n')

    # Sentence analysis on the lower-cased text.
    # NOTE(review): source='no_filter' presumably selects the unfiltered word
    # list; the original comment claimed part-of-speech filtering and
    # stop-word removal, which does not match this argument. Confirm against
    # the textrank4zh documentation.
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='no_filter')

    print('摘要为:')
    # Three key sentences are requested as the summary.
    zy = []
    for item in tr4s.get_key_sentences(num=3):
        # Echo index, weight and text of the sentence, then keep one line.
        print(item.index, item.weight, item.sentence)
        zy.append("{} , {} , {}".format(item.index, item.weight, item.sentence))
    return "\n".join(zy)
 
def check_all_files(check_path):
    """Collect the ``.txt`` files located directly inside ``check_path``.

    Sub-directories are not searched, and the extension test is
    case-insensitive.

    Args:
        check_path: Directory to scan.

    Returns:
        A list of ``[file_name, full_path]`` pairs sorted by file name.

    Raises:
        OSError: If ``check_path`` cannot be listed.
    """
    list_files = []
    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of the generated rows reproducible between runs.
    for name in sorted(os.listdir(check_path)):
        file_path = os.path.join(check_path, name)
        if os.path.isfile(file_path) and name.upper().endswith('.TXT'):
            list_files.append([name, file_path])
    return list_files
 
 
# Summarise every .txt file in the source folder and append the summaries to
# 1983ceshi.csv in the current working directory.
f_lst = check_all_files(r"/Users/xiongying/Desktop/测试3")
# Write explicitly as UTF-8: without an encoding argument open() uses the
# platform default, which may not match the encoding='utf-8' used when the
# file is read back with pandas.
# NOTE(review): the lines are written verbatim, without a header row and
# without CSV quoting, so commas inside a sentence add extra columns.
with open("1983ceshi.csv", 'a', encoding='utf-8') as csv_file:
    for f in f_lst:
        print("处理",f[0])
        # Append this file's summary lines.
        csv_file.write("{}\n".format(work(f[1])))

读取csv的代码:
# Load the generated summaries.
# NOTE(review): the script that writes this file emits no header row, so
# read_csv will use the first summary line as column names; consider
# header=None if that line must be kept as data.
data = pd.read_csv('/Users/xiongying/Desktop/测试3/1983ceshi.csv', encoding='utf-8')
# Cast every column to text for the later tokenising and modelling steps.
# The dtype is passed by name instead of as the builtin ``str``: the reported
# "data type '...txt' not understood" error shows that ``str`` had been
# rebound to a file-name string in the session, so astype(str) received that
# string rather than the type.
data = data.astype('str')

data

这一步就报错了:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-95-7d137b2fe881> in <module>
      1 # 测试-读取数据
      2 data = pd.read_csv('/Users/xiongying/Desktop/测试3/1983ceshi.csv',encoding ='utf-8')  # 读取训练数据
----> 3 data=data.astype(str)#转化为字符型用于后续分词和建模
      4 
      5 data

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5875         else:
   5876             # else, only a single dtype is given
-> 5877             new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
   5878             return self._constructor(new_data).__finalize__(self, method="astype")
   5879 

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    629         self, dtype, copy: bool = False, errors: str = "raise"
    630     ) -> "BlockManager":
--> 631         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    632 
    633     def convert(

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
    425                     applied = b.apply(f, **kwargs)
    426                 else:
--> 427                     applied = getattr(b, f)(**kwargs)
    428             except (TypeError, NotImplementedError):
    429                 if not ignore_failures:

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    624 
    625         if dtype is not None:
--> 626             dtype = pandas_dtype(dtype)
    627 
    628         # may need to convert to categorical

/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/common.py in pandas_dtype(dtype)
   1797     # raise a consistent TypeError if failed
   1798     try:
-> 1799         npdtype = np.dtype(dtype)
   1800     except SyntaxError as err:
   1801         # np.dtype uses `eval` which can raise SyntaxError

TypeError: data type '正确认识社会产品、国民收入的生产、分配和使用_钱伯海.txt' not understood

2条回答 默认 最新

相关推荐 更多相似问题