我使用的环境如下:
torch 2.0.0,torchvision等是用conda安装的,应该没问题
CUDA 11.7
cuDNN 8.6
然后跟着CSDN的一些博客,我更改了训练的目标类别等配置,其他没动。demo.py可以跑通,预测成功,但运行tools/train.py训练时报错。
然后我看这个报错停在了torch.nn的Conv2d那里,更具体地说,貌似是前向传播(forward)时出的错,不过我也不确定。
好折磨啊,这玩意真是头疼,有没有人曾经遇到过这个错误啊……真的诚心求个解答。
完整报错如下:
An error has been caught in function 'launch', process 'MainProcess' (78960), thread 'MainThread' (140630018753152):
Traceback (most recent call last):
File "tools/train.py", line 137, in <module>
launch(
└ <function launch at 0x7fe65c276550>
> File "/home/lulu/YOLOX/yolox/core/launch.py", line 98, in launch
main_func(*args)
│ └ (╒═══════════════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════════...
└ <function main at 0x7fe64fb89ee0>
File "tools/train.py", line 118, in main
trainer.train()
│ └ <function Trainer.train at 0x7fe64e9df670>
└ <yolox.core.trainer.Trainer object at 0x7fe64e9debe0>
File "/home/lulu/YOLOX/yolox/core/trainer.py", line 76, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7fe64e9dfe50>
└ <yolox.core.trainer.Trainer object at 0x7fe64e9debe0>
File "/home/lulu/YOLOX/yolox/core/trainer.py", line 85, in train_in_epoch
self.train_in_iter()
│ └ <function Trainer.train_in_iter at 0x7fe64e9dfee0>
└ <yolox.core.trainer.Trainer object at 0x7fe64e9debe0>
File "/home/lulu/YOLOX/yolox/core/trainer.py", line 91, in train_in_iter
self.train_one_iter()
│ └ <function Trainer.train_one_iter at 0x7fe64e9dff70>
└ <yolox.core.trainer.Trainer object at 0x7fe64e9debe0>
(此处省略中间一大串调用栈)
File "/home/lulu/YOLOX/yolox/models/network_blocks.py", line 51, in forward
return self.act(self.bn(self.conv(x)))
│ │ │ └ tensor([[[[110., 110., 110., ..., 177., 179., 178.],
│ │ │ [110., 110., 110., ..., 180., 181., 174.],
│ │ │ [110., ...
│ │ └ BaseConv(
│ │ (conv): Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
│ │ (bn): BatchNorm2d(16, eps...
│ └ BaseConv(
│ (conv): Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
│ (bn): BatchNorm2d(16, eps...
└ BaseConv(
(conv): Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn): BatchNorm2d(16, eps...
File "/home/lulu/anaconda3/envs/yolox/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
│ │ └ {}
│ └ (tensor([[[[110., 110., 110., ..., 177., 179., 178.],
│ [110., 110., 110., ..., 180., 181., 174.],
│ [110.,...
└ <bound method Conv2d.forward of Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)>
File "/home/lulu/anaconda3/envs/yolox/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 463, in forward
return self._conv_forward(input, self.weight, self.bias)
│ │ │ │ └ Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
│ │ │ └ Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
│ │ └ tensor([[[[110., 110., 110., ..., 177., 179., 178.],
│ │ [110., 110., 110., ..., 180., 181., 174.],
│ │ [110., ...
│ └ <function Conv2d._conv_forward at 0x7fe65ed23940>
└ Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
File "/home/lulu/anaconda3/envs/yolox/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 459, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
│ │ │ │ │ │ └ (1, 1)
│ │ │ │ │ └ Conv2d(12, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
│ │ │ │ └ None
│ │ │ └ Parameter containing:
│ │ │ tensor([[[[ 9.0119e-04, 1.1377e-02, 1.3645e-02],
│ │ │ [ 8.7322e-03, -8.7065e-04, -8.0473e-03],
│ │ │ ...
│ │ └ tensor([[[[110., 110., 110., ..., 177., 179., 178.],
│ │ [110., 110., 110., ..., 180., 181., 174.],
│ │ [110., ...
│ └ <built-in method conv2d of type object at 0x7fe6c419c500>
└ <module 'torch.nn.functional' from '/home/lulu/anaconda3/envs/yolox/lib/python3.8/site-packages/torch/nn/functional.py'>
RuntimeError: cuDNN error: CUDNN_STATUS_NOT_INITIALIZED
以上,请教了!