After running preprocess.py, these two files come out garbled (乱码).
import os
import json
import logging
from transformers import BertTokenizer

try:
    from utils import cutSentences, commonUtils
    import config
except ImportError:
    from .utils import cutSentences, commonUtils
    from . import config

logger = logging.getLogger(__name__)
class InputExample:
    def __init__(self, set_type, text, labels=None, subject_labels=None, object_labels=None):
        self.set_type = set_type
        self.text = text
        self.labels = labels  # used by the sentence-cutting branch of get_examples
        self.subject_labels = subject_labels
        self.object_labels = object_labels
class BaseFeature:
    def __init__(self, token_ids, attention_masks, token_type_ids):
        # BERT inputs
        self.token_ids = token_ids
        self.attention_masks = attention_masks
        self.token_type_ids = token_type_ids
class BertFeature(BaseFeature):
    def __init__(self, token_ids, attention_masks, token_type_ids, labels=None):
        super(BertFeature, self).__init__(
            token_ids=token_ids,
            attention_masks=attention_masks,
            token_type_ids=token_type_ids)
        # labels
        self.labels = labels
class NerProcessor:
    def __init__(self, cut_sent=True, cut_sent_len=256):
        self.cut_sent = cut_sent
        self.cut_sent_len = cut_sent_len

    @staticmethod
    def read_json(file_path):
        with open(file_path, encoding='utf-8') as f:
            raw_examples = json.load(f)
        return raw_examples
    def get_examples(self, raw_examples, set_type):
        examples = []
        # iterate over the dicts parsed from the JSON data
        for i, item in enumerate(raw_examples):
            # print(i, item)
            text = item['text']
            if self.cut_sent:
                sentences = cutSentences.cut_sent_for_bert(text, self.cut_sent_len)
                start_index = 0
                for sent in sentences:
                    labels = cutSentences.refactor_labels(sent, item['labels'], start_index)
                    start_index += len(sent)
                    examples.append(InputExample(set_type=set_type,
                                                 text=sent,
                                                 labels=labels))
            else:
                subject_labels = item['subject_labels']
                object_labels = item['object_labels']
                if len(subject_labels) != 0:
                    subject_labels = [('subject', label[1], label[2]) for label in subject_labels]
                if len(object_labels) != 0:
                    object_labels = [('object', label[1], label[2]) for label in object_labels]
                examples.append(InputExample(set_type=set_type,
                                             text=text,
                                             subject_labels=subject_labels,
                                             object_labels=object_labels))
        return examples
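# NOTE (assumption, inferred from the indexing above): one raw example in
# mid_data/{train,dev}.json is expected to look roughly like this; the first
# field of each label (an id such as "T1") is ignored by get_examples:
# {
#   "text": "...",
#   "subject_labels": [["T1", "<entity string>", <char start>], ...],
#   "object_labels":  [["T2", "<entity string>", <char start>], ...]
# }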
def convert_bert_example(ex_idx, example: InputExample, tokenizer: BertTokenizer,
                         max_seq_len, nerlabel2id, ent_labels):
    set_type = example.set_type
    raw_text = example.text
    subject_entities = example.subject_labels
    object_entities = example.object_labels
    entities = subject_entities + object_entities
    # text tuple
    callback_info = (raw_text,)
    # label dict
    callback_labels = {x: [] for x in ent_labels}
    # _label: (entity type, entity string, entity start position)
    for _label in entities:
        # print(_label)
        callback_labels[_label[0]].append((_label[0], _label[1]))
    callback_info += (callback_labels,)
    # For sequence labeling, BERT's subword tokenizer may shift the label
    # offsets, so split the text character by character instead.
    # tokens = commonUtils.fine_grade_tokenize(raw_text, tokenizer)
    tokens = [i for i in raw_text]
    assert len(tokens) == len(raw_text)
    # information for dev callback
    # ========================
    label_ids = [0] * len(tokens)
    # tag labels, ent e.g. (T1, DRUG_DOSAGE, 447, 450, 小蜜丸)
    for ent in entities:
        # ent: ('PER', '陈元', 0)
        ent_type = ent[0]  # entity type
        ent_start = ent[-1]  # start position
        ent_end = ent_start + len(ent[1]) - 1
        if ent_start == ent_end:
            label_ids[ent_start] = nerlabel2id['B-' + ent_type]
        else:
            try:
                label_ids[ent_start] = nerlabel2id['B-' + ent_type]
                label_ids[ent_end] = nerlabel2id['I-' + ent_type]
                for i in range(ent_start + 1, ent_end):
                    label_ids[i] = nerlabel2id['I-' + ent_type]
            except Exception:
                print(ent)
                print(tokens)
                import sys
                sys.exit(0)
    # truncate to leave room for [CLS] and [SEP]
    if len(label_ids) > max_seq_len - 2:
        label_ids = label_ids[:max_seq_len - 2]
    label_ids = [0] + label_ids + [0]
    # pad
    if len(label_ids) < max_seq_len:
        pad_length = max_seq_len - len(label_ids)
        label_ids = label_ids + [0] * pad_length  # CLS/SEP/PAD labels are all O
    assert len(label_ids) == max_seq_len, f'{len(label_ids)}'
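    # Worked example (hypothetical): raw_text = '陈元在北京' with entity
    # ('subject', '陈元', 0) gives char-level tags B-subject I-subject O O O,
    # which become [O, B, I, O, O, O, O, ...] after the [CLS]/[SEP] wrap
    # and padding above.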
    # ========================
    encode_dict = tokenizer.encode_plus(text=tokens,
                                        max_length=max_seq_len,
                                        padding='max_length',
                                        truncation='longest_first',
                                        return_token_type_ids=True,
                                        return_attention_mask=True)
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    token_ids = encode_dict['input_ids']
    attention_masks = encode_dict['attention_mask']
    token_type_ids = encode_dict['token_type_ids']
    if ex_idx < 3:
        logger.info(f"*** {set_type}_example-{ex_idx} ***")
        print(tokenizer.decode(token_ids[:len(raw_text)]))
        logger.info(f'text: {" ".join(tokens)}')
        logger.info(f"token_ids: {token_ids}")
        logger.info(f"attention_masks: {attention_masks}")
        logger.info(f"token_type_ids: {token_type_ids}")
        logger.info(f"labels: {label_ids}")
        logger.info('length: ' + str(len(token_ids)))
        # for word, token, attn, label in zip(tokens, token_ids, attention_masks, label_ids):
        #     print(word + ' ' + str(token) + ' ' + str(attn) + ' ' + str(label))
    feature = BertFeature(
        # bert inputs
        token_ids=token_ids,
        attention_masks=attention_masks,
        token_type_ids=token_type_ids,
        labels=label_ids,
    )
    return feature, callback_info
def convert_examples_to_features(examples, max_seq_len, bert_dir, nerlabel2id, ent_labels):
    tokenizer = BertTokenizer(os.path.join(bert_dir, 'vocab.txt'))
    features = []
    callback_info = []
    logger.info(f'Convert {len(examples)} examples to features')
    for i, example in enumerate(examples):
        """
        subject_entities = example.subject_labels
        object_entities = example.object_labels
        entities = subject_entities + object_entities
        flag = False
        for ent in entities:
            start_id = ent[1]
            end_id = len(ent[0]) + ent[1]
            if start_id >= max_seq_len - 2 or end_id >= max_seq_len - 2:
                flag = True
                break
        if flag:
            continue
        """
        feature, tmp_callback = convert_bert_example(
            ex_idx=i,
            example=example,
            max_seq_len=max_seq_len,
            nerlabel2id=nerlabel2id,
            tokenizer=tokenizer,
            ent_labels=ent_labels,
        )
        if feature is None:
            continue
        features.append(feature)
        callback_info.append(tmp_callback)
    logger.info(f'Build {len(features)} features')
    out = (features,)
    if not len(callback_info):
        return out
    out += (callback_info,)
    return out
def get_data(processor, raw_data_path, json_file, mode, nerlabel2id, ent_labels, args):
    raw_examples = processor.read_json(os.path.join(raw_data_path, json_file))
    examples = processor.get_examples(raw_examples, mode)
    data = convert_examples_to_features(examples, args.max_seq_len, args.bert_dir, nerlabel2id, ent_labels)
    save_path = os.path.join(args.data_dir, 'ner_final_data')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    commonUtils.save_pkl(save_path, data, mode)
    return data
def save_file(filename, data, id2nerlabel):
    features, callback_info = data
    with open(filename, 'w', encoding='utf-8') as file:
        for feature, tmp_callback in zip(features, callback_info):
            text, gt_entities = tmp_callback
            # feature.labels[0] is the [CLS] slot, so realign to the raw text
            for word, label in zip(text, feature.labels[1:len(text) + 1]):
                file.write(word + ' ' + id2nerlabel[label] + '\n')
            file.write('\n')
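# save_file emits CoNLL-style output: one character and its BIO tag per line,
# with a blank line between examples, e.g. (hypothetical):
#   车 B-subject
#   门 I-subject
#   卡 B-object
#   滞 I-object
# The file is written with encoding='utf-8'; if it shows up as 乱码 in an
# editor, check that the editor reopens it as UTF-8 rather than GBK.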
if __name__ == '__main__':
    dataset = "dgre"
    args = config.Args().get_parser()
    args.bert_dir = '../model_hub/chinese-roberta-wwm-ext/'
    commonUtils.set_logger(os.path.join(args.log_dir, 'preprocess.log'))

    if dataset == "dgre":
        args.data_dir = '../data/dgre/'
        args.max_seq_len = 512
    elif dataset == "duie":
        args.data_dir = '../data/'
        args.max_seq_len = 300

    mid_data_path = os.path.join(args.data_dir, 'mid_data')
    # true entity labels
    ent_labels_path = mid_data_path + '/ent_labels.txt'
    # sequence-labeling labels (B/I/O)
    ner_labels_path = mid_data_path + '/ner_labels.txt'
    with open(ent_labels_path, 'r', encoding='utf-8') as fp:
        ent_labels = fp.read().strip().split('\n')
    entlabel2id = {}
    id2entlabel = {}
    for i, j in enumerate(ent_labels):
        entlabel2id[j] = i
        id2entlabel[i] = j
    nerlabel2id = {}
    id2nerlabel = {}
    with open(ner_labels_path, 'r', encoding='utf-8') as fp:
        ner_labels = fp.read().strip().split('\n')
    for i, j in enumerate(ner_labels):
        nerlabel2id[j] = i
        id2nerlabel[i] = j

    processor = NerProcessor(cut_sent=False, cut_sent_len=args.max_seq_len)

    train_data = get_data(processor, mid_data_path, "train.json", "train", nerlabel2id, ent_labels, args)
    save_file(os.path.join(args.data_dir, "{}_{}_cut.txt".format(dataset, args.max_seq_len)), train_data, id2nerlabel)
    dev_data = get_data(processor, mid_data_path, "dev.json", "dev", nerlabel2id, ent_labels, args)
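For reference, a minimal, self-contained sketch of the char-level BIO tagging that convert_bert_example performs; the sample text, the entities, and nerlabel2id are all hypothetical stand-ins for what mid_data/ner_labels.txt actually contains:

# Hypothetical label map standing in for mid_data/ner_labels.txt
nerlabel2id = {'O': 0, 'B-subject': 1, 'I-subject': 2, 'B-object': 3, 'I-object': 4}
id2nerlabel = {v: k for k, v in nerlabel2id.items()}

demo_text = '故障现象:车门卡滞'
# (type, entity string, char start), the tuple shape produced by get_examples
demo_entities = [('subject', '车门', 5), ('object', '卡滞', 7)]

label_ids = [0] * len(demo_text)
for ent_type, ent_text, ent_start in demo_entities:
    ent_end = ent_start + len(ent_text) - 1
    label_ids[ent_start] = nerlabel2id['B-' + ent_type]
    for i in range(ent_start + 1, ent_end + 1):
        label_ids[i] = nerlabel2id['I-' + ent_type]

print([(ch, id2nerlabel[i]) for ch, i in zip(demo_text, label_ids)])
# [('故', 'O'), ('障', 'O'), ('现', 'O'), ('象', 'O'), (':', 'O'),
#  ('车', 'B-subject'), ('门', 'I-subject'), ('卡', 'B-object'), ('滞', 'I-object')]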