使用 XLNet 进行文本分类时，出现了下面的问题：
def get_inputs(content, tokenizer, max_len=120):
    """Encode every text in ``content`` and collect the encodings as arrays.

    Args:
        content: Iterable of texts; each item is passed to
            ``tokenizer.encode_plus``.
        tokenizer: Object exposing ``encode_plus``. It must not be ``None``:
            a ``None`` tokenizer fails on the first text with
            ``AttributeError: 'NoneType' object has no attribute
            'encode_plus'``.
        max_len: Forwarded to ``encode_plus`` as ``max_length``.

    Returns:
        Tuple ``(inps, inp_tok, ids, segments)``: the list of raw per-text
        encodings, followed by arrays built from their ``'input_ids'``,
        ``'attention_mask'`` and ``'token_type_ids'`` entries.
    """
    inps = [
        tokenizer.encode_plus(
            t,
            # Must be the ``max_len`` parameter; the previous spelling
            # ``maxlen`` was an undefined name and raised NameError.
            max_length=max_len,
            pad_to_max_length=False,
            add_special_tokens=True,
        )
        for t in content
    ]
    # NOTE(review): padding is disabled, so encodings may differ in length and
    # the arrays below may not be rectangular -- confirm this is intended.
    inp_tok = np.array([a['input_ids'] for a in inps])
    ids = np.array([a['attention_mask'] for a in inps])
    segments = np.array([a['token_type_ids'] for a in inps])
    return inps, inp_tok, ids, segments
def warmup(epoch, lr):
    """Return the next learning rate: ``lr`` raised by 1e-6, never below 2e-5.

    Used for increasing the learning rate slowly, which tends to achieve
    better convergence. As fine-tuning runs for only a few epochs, it is not
    crucial.

    Args:
        epoch: Index of the current epoch. Accepted but not used by the
            computation (presumably required by the scheduler callback that
            calls this function -- confirm against the caller).
        lr: Current learning rate.

    Returns:
        ``max(lr + 1e-6, 2e-5)``.
    """
    # NOTE(review): ``max`` makes 2e-5 a floor, not a ceiling, so the rate
    # keeps growing by 1e-6 per call once it exceeds 2e-5. If 2e-5 was meant
    # to be the warmup target, this should be ``min`` -- confirm intent.
    return max(lr + 1e-6, 2e-5)
def plot_metrics(pred, true_labels):
    """Plot a ROC curve titled with the AUC and the accuracy.

    Args:
        pred: Array of predicted scores; it must provide ``flatten()``.
            Scores ``>= .5`` count as class 1 when computing the accuracy.
        true_labels: Ground-truth labels passed to the metric functions.

    Returns:
        The figure holding the ROC curve (red) and the diagonal reference
        line (black, dashed).
    """
    # Accuracy is computed on hard labels obtained by thresholding at 0.5.
    hard_labels = np.array(pred.flatten() >= .5, dtype='int')
    acc = accuracy_score(true_labels, hard_labels)

    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(true_labels, pred)
    auc = roc_auc_score(true_labels, pred)

    fig, ax = plt.subplots(1, figsize=(8,8))
    ax.plot(fpr, tpr, color='red')
    ax.plot([0,1], [0,1], color='black', linestyle='--')
    ax.set_title(f"AUC: {auc}\nACC: {acc}")
    return fig
# Encode the training texts. The traceback that follows shows this call failing
# because `xlnet_tokenizer` was None when it was passed in.
inps, inp_tok, ids, segments = get_inputs(x_train_text, xlnet_tokenizer)
AttributeError Traceback (most recent call last)
/tmp/ipykernel_18279/2457827602.py in <module>
----> 1 inps, inp_tok, ids, segments = get_inputs(x_train_text, xlnet_tokenizer)
/tmp/ipykernel_18279/3374919276.py in get_inputs(content, tokenizer, max_len)
1 def get_inputs(content, tokenizer, max_len=120):
2 """ Gets tensors from text using the tokenizer provided"""
----> 3 inps = [tokenizer.encode_plus(t, max_length=max_len, pad_to_max_length=False, add_special_tokens=True) for t in content]
4 inp_tok = np.array([a['input_ids'] for a in inps])
5 ids = np.array([a['attention_mask'] for a in inps])
/tmp/ipykernel_18279/3374919276.py in <listcomp>(.0)
1 def get_inputs(content, tokenizer, max_len=120):
2 """ Gets tensors from text using the tokenizer provided"""
----> 3 inps = [tokenizer.encode_plus(t, max_length=max_len, pad_to_max_length=False, add_special_tokens=True) for t in content]
4 inp_tok = np.array([a['input_ids'] for a in inps])
5 ids = np.array([a['attention_mask'] for a in inps])
AttributeError: 'NoneType' object has no attribute 'encode_plus'