在 Windows 7 的 Anaconda 环境下,
用 text2vec 做文本相似度分析,已经安装了 text2vec。
运行的代码:
from torch.utils.data import Dataset
from typing import List
from ..readers.InputExample import InputExample
import numpy as np
import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
class DenoisingAutoEncoderDataset(Dataset):
    """
    Dataset that pairs every sentence with a noisy version of itself.

    Indexing returns an InputExample in the format
    texts=[noise_fn(sentence), sentence]. It is used in combination with the
    DenoisingAutoEncoderLoss: there, a decoder tries to re-construct the
    sentence without noise.

    :param sentences: A list of sentences
    :param noise_fn: A noise function: given a string, it returns a string
        with noise, e.g. deleted words. Defaults to
        DenoisingAutoEncoderDataset.delete.
    """

    def __init__(self, sentences: List[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)):
        self.noise_fn = noise_fn
        self.sentences = sentences

    def __len__(self):
        """Return the number of sentences held by the dataset."""
        return len(self.sentences)

    def __getitem__(self, item):
        """Return the InputExample [noisy sentence, original sentence] for ``item``."""
        clean = self.sentences[item]
        noisy = self.noise_fn(clean)
        return InputExample(texts=[noisy, clean])

    # Deletion noise.
    @staticmethod
    def delete(text, del_ratio=0.6):
        """
        Randomly drop words from ``text`` and return the detokenized remainder.

        :param text: The sentence to corrupt
        :param del_ratio: A word is kept only when its uniform random draw
            exceeds this value
        :return: The sentence with words deleted; ``text`` itself when
            tokenization yields no words
        """
        tokens = nltk.word_tokenize(text)
        count = len(tokens)
        if count == 0:
            return text

        keep_mask = np.random.rand(count) > del_ratio
        if not keep_mask.any():
            # guarantee that at least one word remains
            keep_mask[np.random.choice(count)] = True

        survivors = np.array(tokens)[keep_mask]
        return TreebankWordDetokenizer().detokenize(survivors)
提示错误:
runfile('C:/Anaconda3/lib/site-packages/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py', wdir='C:/Anaconda3/lib/site-packages/sentence_transformers/datasets')
Traceback (most recent call last):
File "C:\Anaconda3\lib\site-packages\sentence_transformers\datasets\DenoisingAutoEncoderDataset.py", line 1, in <module>
from torch.utils.data import Dataset
File "C:\Anaconda3\lib\site-packages\torch\__init__.py", line 98, in <module>
os.add_dll_directory(dll_path)
File "C:\Anaconda3\lib\os.py", line 1109, in add_dll_directory
cookie = nt._add_dll_directory(path)
OSError: [WinError 127] 找不到指定的程序。: 'C:\Anaconda3\lib\site-packages\torch\lib'