main_rcv1.py:
import numpy as np
from scipy.io import savemat
import os
import data
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#%%
class_0, class_1 = ['C22'], ['C23']
categories = sorted(class_0 + class_1)
# sparsity_b1, sparsity_b2 = 0.1, 0.002  # earlier setting, overridden by the line below
sparsity_b1, sparsity_b2 = 0.01, 0.001
num_words = 30
save_to = '../matlab/data/rcv1/data.mat'
print(save_to)
#%%
dataset = data.TextRCV1(data_dir='./data/RCV1', subset='all', categories=categories)
dataset.clean_text(num='substitute')
dataset.vectorize(stop_words='english')
dataset.remove_short_documents(nwords=50, vocab='full')
dataset.remove_encoded_images()
dataset.remove_frequent_words(sparsity_b1=sparsity_b1, sparsity_b2=sparsity_b2)
dataset.keep_top_words(num_words, 0)
dataset.remove_short_documents(nwords=5, vocab='selected')
dataset.compute_tfidf()
dataset.data_info(show_classes=True)
tfidf = dataset.tfidf.astype(float).T.toarray() # size: (num of words) x (num of documents)
print('max/min tfidf', np.max(tfidf), np.min(tfidf[tfidf > 0]))
card = np.sum(tfidf > 0, 1)  # number of documents containing each word (edge cardinality)
print('max/min edge cardinality', max(card), min(card))
#%%
index2class = {i: dataset.class_names[i] for i in range(len(dataset.class_names))}
true_classes = np.array([int(index2class[i] in class_1) for i in dataset.labels])
#%%
data_mat = {'R': tfidf, 'y': true_classes}
savemat(save_to, data_mat)
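#%%
# Sanity check (a minimal sketch): reload the file just written with scipy's
# loadmat and confirm the arrays round-trip. Note that savemat stores the
# 1-D label vector y as a 1 x N matrix.
from scipy.io import loadmat
check = loadmat(save_to)
print('R:', check['R'].shape, 'y:', check['y'].shape)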
data.py:
import re
import sklearn
from sklearn import datasets, preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import os
from tqdm.auto import tqdm, trange
from scipy.sparse import csr_matrix
import random
def balance_class_idx(labels, n_per_class=None):
lset = np.unique(labels)
num_class = [np.sum(labels==ll) for ll in lset]
small_num = np.min(num_class)
n_per_class = small_num if n_per_class is None else n_per_class
lim_lo = min(small_num, n_per_class)
idx = np.arange(len(labels))
idx_sample = []
for li,ll in enumerate(lset):
bi = labels==ll
ni = num_class[li]
idx_s = idx[bi][np.random.RandomState(li).permutation(ni)[:lim_lo]]
idx_sample.append(idx_s)
return np.hstack(idx_sample)
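# Usage sketch (hypothetical labels, not part of the original pipeline):
# balance_class_idx draws the same number of indices from every class, using a
# fixed per-class seed, so repeated calls return the same balanced subsample.
#   labels = np.array([0, 0, 0, 1, 1])
#   idx = balance_class_idx(labels)  # two indices from each class
#   balanced = labels[idx]           # array([0, 0, 1, 1])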
class TextDataset(object):
def clean_text(self, num='substitute'):
""" Transform text to a-z (lowercase) and (single) whitespace. """
for i, doc in enumerate(self.documents):
if num == 'spell':
doc = doc.replace('0', ' zero ')
doc = doc.replace('1', ' one ')
doc = doc.replace('2', ' two ')
doc = doc.replace('3', ' three ')
doc = doc.replace('4', ' four ')
doc = doc.replace('5', ' five ')
doc = doc.replace('6', ' six ')
doc = doc.replace('7', ' seven ')
doc = doc.replace('8', ' eight ')
doc = doc.replace('9', ' nine ')
elif num == 'substitute':
doc = re.sub('(\\d+)', ' NUM ', doc)
elif num == 'remove':
doc = re.sub('[0-9]', ' ', doc)
doc = doc.replace('$', ' dollar ')
doc = doc.lower()
doc = re.sub('[^a-z]', ' ', doc)
doc = ' '.join(doc.split())
self.documents[i] = doc
def vectorize(self, **params):
""" params: stop_words=None ('english') """
vectorizer = CountVectorizer(**params)
self.data = vectorizer.fit_transform(self.documents)
        try:
            self.vocab = list(vectorizer.get_feature_names_out())  # scikit-learn >= 1.0
        except AttributeError:
            self.vocab = vectorizer.get_feature_names()  # removed in scikit-learn 1.2
assert len(self.vocab) == self.data.shape[1]
def compute_tfidf(self):
tf_transformer = TfidfTransformer().fit(self.data) # norm='l2' ('l1', 'l2')
self.tfidf = tf_transformer.transform(self.data)
def class_conditional_word_dist(self, Mprint=20):
""" class conditional word distribution """
        self.class_word_dist = np.array(np.vstack([
            self.data[self.labels == ci, :].sum(0) / self.data[self.labels == ci, :].sum()
            for ci in np.unique(self.labels)
        ]))  # (num of classes) x (num of words)
self.labels_word = self.class_word_dist.argmax(0)
for i in range(self.class_word_dist.shape[0]):
print('top {} frequent words in class {}'.format(Mprint, i))
idx = np.argsort(self.class_word_dist[i, :])[::-1][:Mprint]
for j in range(Mprint):
print(' {:3d}: {:10s} {:.4f}'.format(j, self.vocab[idx[j]], self.class_word_dist[i, idx[j]]))
def data_info(self, show_classes=False):
n, m = self.data.shape
sparsity = self.data.nnz / n / m * 100
print('N = {} documents, M = {} words, sparsity = {:.4f}%'.format(n, m, sparsity))
if show_classes:
for i in range(len(self.class_names)):
num = sum(self.labels == i)
print(' {:5d} documents in class {:2d} ({})'.format(num, i, self.class_names[i]))
def show_document(self, i):
label = self.labels[i]
name = self.class_names[label]
try:
text = self.documents[i]
wc = len(text.split())
except AttributeError:
text = None
wc = 'N/A'
print('document {}: label {} --> {}, {} words'.format(i, label, name, wc))
try:
vector = self.data[i, :]
for j in range(vector.shape[1]):
if vector[0, j] != 0:
print(' {:.2f} "{}" ({})'.format(vector[0, j], self.vocab[j], j))
except AttributeError:
pass
return text
def keep_documents(self, idx):
""" Keep the documents given by the index, discard the others. """
print('{} documents have been removed'.format(self.data.shape[0] - len(idx)))
self.documents = [self.documents[i] for i in idx]
self.labels = self.labels[idx]
self.data = self.data[idx, :]
def keep_words(self, idx):
""" Keep the words given by the index, discard the others. """
print('{} words have been removed'.format(self.data.shape[1] - len(idx)))
self.data = self.data[:, idx]
self.vocab = [self.vocab[i] for i in idx]
def remove_short_documents(self, nwords, vocab='selected'):
""" Remove a document if it contains less than nwords. """
if vocab == 'selected':
# Word count with selected vocabulary.
wc = self.data.sum(axis=1)
wc = np.squeeze(np.asarray(wc))
        else:  # vocab == 'full'
            # Word count with full vocabulary.
            wc = np.empty(self.data.shape[0], dtype=int)  # np.int was removed in NumPy 1.24
for i, doc in enumerate(self.documents):
wc[i] = len(doc.split())
idx = np.argwhere(wc >= nwords).squeeze()
self.keep_documents(idx)
return wc
def keep_top_words(self, M, Mprint=20):
""" Keep in the vocaluary the M words who appear most often. """
freq = self.data.sum(axis=0)
freq = np.squeeze(np.asarray(freq))
idx = np.argsort(freq)[::-1]
idx = idx[:M]
self.keep_words(idx)
print('most frequent words')
for i in range(Mprint):
print(' {:3d}: {:10s} {:6d} counts'.format(i, self.vocab[i], freq[idx][i]))
return freq[idx]
def sample_words(self, M):
m = self.data.shape[1]
        idx = random.sample(range(m), M)  # sampling from a set stopped working in Python 3.11
self.keep_words(idx)
def remove_frequent_words(self, sparsity_b1=0.2, sparsity_b2=0.002):
""" words that appear in over a certain fraction of the data-sets are removed. """
freq = self.data.astype(bool).sum(axis=0)
freq_ratio = np.squeeze(np.asarray(freq))/self.data.shape[0]
idx = [i for i, fr in enumerate(freq_ratio) if sparsity_b1 >= fr >= sparsity_b2]
self.keep_words(idx)
return freq_ratio[idx]
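    # Example (reading of the bounds above): with sparsity_b1=0.01 and
    # sparsity_b2=0.001, a word is kept only if it appears in between 0.1% and
    # 1% of the documents; the thresholds act on document frequency, not on
    # raw counts.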
def normalize(self, norm='l2'):
data = self.data.astype(np.float64)
self.data = sklearn.preprocessing.normalize(data, axis=1, norm=norm)
    def remove_encoded_images(self, freq=1e3):
        """ Remove documents dominated by the 'ax' token (encoded images in 20news). """
        if 'ax' not in self.vocab:  # token absent, e.g. for RCV1
            return None
        widx = self.vocab.index('ax')
        wc = self.data[:, widx].toarray().squeeze()
        idx = np.argwhere(wc < freq).squeeze()
        self.keep_documents(idx)
        return wc
class Text20News(TextDataset):
def __init__(self, **params):
"""
params:
subset='train' ('train', 'test', 'all')
categories=None (list of category names to load)
shuffle=True (bool)
random_state=42 (int)
remove=() (tuple, subset of ('headers', 'footers', 'quotes'))
categories:
'alt.atheism',
'comp.graphics',
'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware',
'comp.sys.mac.hardware',
'comp.windows.x',
'misc.forsale',
'rec.autos',
'rec.motorcycles',
'rec.sport.baseball',
'rec.sport.hockey',
'sci.crypt',
'sci.electronics',
'sci.med',
'sci.space',
'soc.religion.christian',
'talk.politics.guns',
'talk.politics.mideast',
'talk.politics.misc',
'talk.religion.misc'
"""
dataset = datasets.fetch_20newsgroups(**params)
self.documents = dataset.data
self.labels = dataset.target
self.class_names = dataset.target_names
assert max(self.labels) + 1 == len(self.class_names)
n, c = len(self.documents), len(self.class_names)
print('N = {} documents, C = {} classes'.format(n, c))
"""
ref:
https://github.com/mdeff/cnn_graph/blob/master/rcv1.ipynb
https://github.com/XifengGuo/DEC-keras/blob/2438070110b17b4fb9bc408c11d776fc1bd1bd56/datasets.py
http://www.ai.mit.edu/projects/jmlr/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm
"""
class TextRCV1(TextDataset):
def __init__(self, data_dir='./data/RCV1', subset='all', categories=None):
"""
params:
data_dir='./data/RCV1' download dataset ref: https://github.com/XifengGuo/DEC-keras/blob/2438070110b17b4fb9bc408c11d776fc1bd1bd56/data/reuters/get_data.sh
subset='all' ('train', 'test', 'all')
categories=None (list of category names to load)
level categories:
            1st_level:{'CCAT', 'GCAT', 'MCAT', 'ECAT'}
2nd_level:{'C11','C12','C13','C14','C15','C16','C17','C18','C21','C22','C23','C24',
'C31','C32','C33','C34','C41','C42','E11','E12','E13','E14','E21','E31',
'E41','E51','E61','E71','G15','GCRIM','GDEF','GDIP','GDIS','GENT','GENV',
'GFAS','GHEA','GJOB','GMIL','GOBIT','GODD','GPOL','GPRO','GREL','GSCI',
'GSPO','GTOUR','GVIO','GVOTE','GWEA','GWELF','M11','M12','M13','M14'}
"""
did_to_cat = {}
with open(os.path.join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
for line in fin.readlines():
line = line.strip().split(' ')
cat = line[0]
did = int(line[1])
if cat in categories:
did_to_cat[did] = did_to_cat.get(did, []) + [cat]
# keep only the single-label ones
did_to_cat = {k: v for k, v in did_to_cat.items() if len(v) == 1}
dat_list = ['lyrl2004_tokens_test_pt0.dat',
'lyrl2004_tokens_test_pt1.dat',
'lyrl2004_tokens_test_pt2.dat',
'lyrl2004_tokens_test_pt3.dat',
'lyrl2004_tokens_train.dat']
if subset == 'train':
dat_list = dat_list[-1:]
elif subset == 'test':
dat_list = dat_list[:-1]
data = []
target = []
cat_to_cid = dict(zip(categories, list(range(len(categories)))))
for dat in dat_list:
with open(os.path.join(data_dir, dat)) as fin:
fin_lines = fin.readlines()
for li, line in enumerate(fin_lines):
# print(li, line)
if line.startswith('.I'):
did = int(line.strip().split(' ')[1])
elif line.startswith('.W'):
doc = ''
elif line != '\n':
doc += line
else: # blank line
assert doc != '' # stacked doc
if did in did_to_cat: # append to data and target lists
data.append(doc)
target.append([cat_to_cid[d] for d in did_to_cat[did]])
# print((len(data), 'and', len(did_to_cat)))
# assert len(data) == len(did_to_cat)
self.documents = data
self.labels = np.array(target).flatten() # single label
self.class_names = categories
# self.cvt_labels_onehot() # labels as array
if isinstance(self.labels, list):
assert max(sum(self.labels, [])) + 1 == len(self.class_names)
else:
assert max(self.labels) + 1 == len(self.class_names)
n, c = len(self.documents), len(self.class_names)
print('N = {} documents, C = {} classes'.format(n, c))
def cvt_labels_onehot(self, idx):
n, c = len(self.documents), len(self.class_names)
labels = np.zeros((len(idx), c), dtype=bool)
print('converting label names to mat...')
        for i, ni in enumerate(tqdm(idx)):  # trange(idx) raised: trange expects an int, not an index array
labels[i, self.labels[ni]] = True
self.labels_onehot = csr_matrix(labels)
This is a piece of code I found on GitHub, but it raises the error below whether I pass a relative path or an absolute path, and I cannot find the cause.
Traceback (most recent call last):
File "d:\program\hg_general_submodular_weights-main\hg_general_submodular_weights-main\python\main_rcv1.py", line 21, in <module>
dataset = data.TextRCV1(data_dir='./data/RCV1', subset='all', categories=categories)
File "d:\program\hg_general_submodular_weights-main\hg_general_submodular_weights-main\python\data.py", line 233, in __init__
with open(os.path.join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
FileNotFoundError: [Errno 2] No such file or directory: './data/RCV1\\rcv1-v2.topics.qrels'
This is my file path:
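A minimal diagnostic sketch, assuming the RCV1 files are meant to sit next to the scripts: a relative path like './data/RCV1' is resolved against the current working directory, not against the location of main_rcv1.py, so running the script from another folder produces exactly this FileNotFoundError. Anchoring data_dir to the script's own directory and printing what Python actually sees usually pinpoints the mismatch:

import os

here = os.path.dirname(os.path.abspath(__file__))  # folder containing main_rcv1.py
data_dir = os.path.join(here, 'data', 'RCV1')      # assumed layout: python/data/RCV1/
print('cwd:', os.getcwd())
print('looking in:', data_dir)
print('exists:', os.path.isdir(data_dir))
if os.path.isdir(data_dir):
    print('contents:', os.listdir(data_dir))       # rcv1-v2.topics.qrels should appear here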