自动控制自己 2022-06-24 20:46
18 views
Closed

How can this #Pytorch# problem be solved?

Problem symptoms and background

Training a PyTorch model keeps failing with CUDA errors.

Code related to the problem (please do not paste screenshots)
from __future__ import division
from models import *
from utils.logger import *
from utils.utils import *
from utils.datasets import *
from utils.parse_config import *
from test import evaluate
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # run CUDA kernels synchronously so errors point at the real call site

import warnings
warnings.filterwarnings("ignore")

from terminaltables import AsciiTable

import sys
import time
import datetime
import argparse
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
from torch.autograd import Variable
import torch.optim as optim

"""
--data_config config/coco.data  
--pretrained_weights weights/darknet53.conv.74
"""

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=100, help="number of epochs")
    parser.add_argument("--batch_size", type=int, default=1, help="size of each image batch")
    parser.add_argument("--gradient_accumulations", type=int, default=2, help="number of gradient accums before step")
    parser.add_argument("--model_def", type=str, default="config/yolov3.cfg", help="path to model definition file")
    parser.add_argument("--data_config", type=str, default="config/coco.data", help="path to data config file")
    parser.add_argument("--pretrained_weights", type=str, help="if specified starts from checkpoint model")
    parser.add_argument("--n_cpu", type=int, default=0, help="number of cpu threads to use during batch generation")
    parser.add_argument("--img_size", type=int, default=416, help="size of each image dimension")
    parser.add_argument("--checkpoint_interval", type=int, default=100, help="interval between saving model weights")
    parser.add_argument("--evaluation_interval", type=int, default=300, help="interval evaluations on validation set")
    parser.add_argument("--compute_map", default=False, help="if True computes mAP every tenth batch")
    parser.add_argument("--multiscale_training", default=True, help="allow for multi-scale training")
    opt = parser.parse_args()
    print(opt)

    logger = Logger("logs")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    os.makedirs("output", exist_ok=True)
    os.makedirs("checkpoints", exist_ok=True)

    # Get data configuration
    data_config = parse_data_config(opt.data_config)
    train_path = data_config["train"]
    valid_path = data_config["valid"]
    class_names = load_classes(data_config["names"])

    # Initiate model
    model = Darknet(opt.model_def).to(device)
    model.apply(weights_init_normal)

    # If specified we start from checkpoint
    if opt.pretrained_weights:
        if opt.pretrained_weights.endswith(".pth"):
            model.load_state_dict(torch.load(opt.pretrained_weights))
        else:
            model.load_darknet_weights(opt.pretrained_weights)

    # Get dataloader
    dataset = ListDataset(train_path, augment=True, multiscale=opt.multiscale_training)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.n_cpu,
        pin_memory=True,
        collate_fn=dataset.collate_fn,
    )

    optimizer = torch.optim.Adam(model.parameters())

    metrics = [
        "grid_size",
        "loss",
        "x",
        "y",
        "w",
        "h",
        "conf",
        "cls",
        "cls_acc",
        "recall50",
        "recall75",
        "precision",
        "conf_obj",
        "conf_noobj",
    ]

    for epoch in range(opt.epochs):
        model.train()
        start_time = time.time()
        for batch_i, (_, imgs, targets) in enumerate(dataloader):
            batches_done = len(dataloader) * epoch + batch_i

            imgs = Variable(imgs.to(device))
            targets = Variable(targets.to(device), requires_grad=False)
            print("imgs", imgs.shape)
            print("targets", targets.shape)
            loss, outputs = model(imgs, targets)
            loss.backward()

            if batches_done % opt.gradient_accumulations:
                # Step only every `gradient_accumulations` batches, so gradients
                # accumulate across them (effective batch size =
                # batch_size * gradient_accumulations)
                optimizer.step()
                optimizer.zero_grad()

            # ----------------
            #   Log progress
            # ----------------

            log_str = "\n---- [Epoch %d/%d, Batch %d/%d] ----\n" % (epoch, opt.epochs, batch_i, len(dataloader))

            metric_table = [["Metrics", *[f"YOLO Layer {i}" for i in range(len(model.yolo_layers))]]]

            # Log metrics at each YOLO layer
            for i, metric in enumerate(metrics):
                formats = {m: "%.6f" for m in metrics}
                formats["grid_size"] = "%2d"
                formats["cls_acc"] = "%.2f%%"
                row_metrics = [formats[metric] % yolo.metrics.get(metric, 0) for yolo in model.yolo_layers]
                metric_table += [[metric, *row_metrics]]

                # Tensorboard logging
                tensorboard_log = []
                for j, yolo in enumerate(model.yolo_layers):
                    for name, metric in yolo.metrics.items():
                        if name != "grid_size":
                            tensorboard_log += [(f"{name}_{j+1}", metric)]
                tensorboard_log += [("loss", loss.item())]
                logger.list_of_scalars_summary(tensorboard_log, batches_done)

            log_str += AsciiTable(metric_table).table
            log_str += f"\nTotal loss {loss.item()}"

            # Determine approximate time left for epoch
            epoch_batches_left = len(dataloader) - (batch_i + 1)
            time_left = datetime.timedelta(seconds=epoch_batches_left * (time.time() - start_time) / (batch_i + 1))
            log_str += f"\n---- ETA {time_left}"

            print(log_str)

            model.seen += imgs.size(0)

        if epoch % opt.evaluation_interval == 0:
            print("\n---- Evaluating Model ----")
            # Evaluate the model on the validation set
            precision, recall, AP, f1, ap_class = evaluate(
                model,
                path=valid_path,
                iou_thres=0.5,
                conf_thres=0.5,
                nms_thres=0.5,
                img_size=opt.img_size,
                batch_size=1,
            )
            evaluation_metrics = [
                ("val_precision", precision.mean()),
                ("val_recall", recall.mean()),
                ("val_mAP", AP.mean()),self.outlayer
                ("val_f1", f1.mean()),
            ]
            logger.list_of_scalars_summary(evaluation_metrics, epoch)

            # Print class APs and mAP
            ap_table = [["Index", "Class name", "AP"]]
            for i, c in enumerate(ap_class):
                ap_table += [[c, class_names[c], "%.5f" % AP[i]]]
            print(AsciiTable(ap_table).table)
            print(f"---- mAP {AP.mean()}")

        if epoch % opt.checkpoint_interval == 0:
            torch.save(model.state_dict(), f"checkpoints/yolov3_ckpt_{epoch}.pth")
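
For reference, here is a quick check of how much of the card PyTorch itself can see and has claimed (a hypothetical diagnostic, not part of the original script; memory grabbed by other libraries in the same process does not appear in these numbers but still reduces what is free):

import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    total_mib = props.total_memory / 1024 ** 2                   # total device memory
    allocated_mib = torch.cuda.memory_allocated(0) / 1024 ** 2   # live tensors
    reserved_mib = torch.cuda.memory_reserved(0) / 1024 ** 2     # allocator cache
    print(f"{props.name}: total {total_mib:.0f} MiB, "
          f"allocated {allocated_mib:.0f} MiB, reserved {reserved_mib:.0f} MiB")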


Run output and error messages
D:\pytorch\envs\tensorflow\python.exe D:/BaiduNetdiskDownload/yolo/main/PyTorch-YOLOv3/train.py --model_def config/yolov3-custom.cfg --data_config config/custom.data --pretrained_weights weights/darknet53.conv.74
2022-06-24 20:43:11.900568: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cudart64_110.dll
Namespace(epochs=100, batch_size=1, gradient_accumulations=2, model_def='config/yolov3-custom.cfg', data_config='config/custom.data', pretrained_weights='weights/darknet53.conv.74', n_cpu=0, img_size=416, checkpoint_interval=100, evaluation_interval=300, compute_map=False, multiscale_training=True)
2022-06-24 20:43:14.321195: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library nvcuda.dll
2022-06-24 20:43:14.343865: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3050 Ti Laptop GPU computeCapability: 8.6
coreClock: 1.485GHz coreCount: 20 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2022-06-24 20:43:14.344091: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cudart64_110.dll
2022-06-24 20:43:14.344196: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cublas64_11.dll
2022-06-24 20:43:14.344303: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cublasLt64_11.dll
2022-06-24 20:43:14.344415: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cufft64_10.dll
2022-06-24 20:43:14.345990: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library curand64_10.dll
2022-06-24 20:43:14.346434: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cusolver64_11.dll
2022-06-24 20:43:14.346570: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cusparse64_11.dll
2022-06-24 20:43:14.346696: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library cudnn64_8.dll
2022-06-24 20:43:14.346843: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
2022-06-24 20:43:14.347438: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-24 20:43:14.348414: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 3050 Ti Laptop GPU computeCapability: 8.6
coreClock: 1.485GHz coreCount: 20 deviceMemorySize: 4.00GiB deviceMemoryBandwidth: 178.84GiB/s
2022-06-24 20:43:14.348671: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
2022-06-24 20:43:14.718978: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-06-24 20:43:14.719117: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0 
2022-06-24 20:43:14.719192: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N 
2022-06-24 20:43:14.719421: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 1653 MB memory) -> physical GPU (device: 0, name: NVIDIA GeForce RTX 3050 Ti Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6)
imgs torch.Size([1, 3, 320, 320])
targets torch.Size([4, 6])
torch.Size([1, 66, 10, 10])
torch.Size([1, 3, 10, 10, 22])
torch.Size([3, 4])
torch.Size([1, 66, 20, 20])
torch.Size([1, 3, 20, 20, 22])
torch.Size([3, 4])
torch.Size([1, 66, 40, 40])
torch.Size([1, 3, 40, 40, 22])
torch.Size([3, 4])
2022-06-24 20:43:18.081589: I tensorflow/stream_executor/cuda/cuda_driver.cc:789] failed to allocate 1.61G (1734213632 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2022-06-24 20:43:18.162417: I tensorflow/stream_executor/cuda/cuda_driver.cc:789] failed to allocate 1.45G (1560792320 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2022-06-24 20:43:18.232255: I tensorflow/stream_executor/cuda/cuda_driver.cc:789] failed to allocate 1.31G (1404713216 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory

---- [Epoch 0/100, Batch 0/724] ----
+------------+--------------+--------------+--------------+
| Metrics    | YOLO Layer 0 | YOLO Layer 1 | YOLO Layer 2 |
+------------+--------------+--------------+--------------+
| grid_size  | 10           | 20           | 40           |
| loss       | 81.651138    | 68.110428    | 73.541206    |
| x          | 0.116567     | 0.093047     | 0.134591     |
| y          | 0.229398     | 0.063502     | 0.076567     |
| w          | 0.794509     | 0.160264     | 1.223999     |
| h          | 0.408137     | 0.092530     | 2.021268     |
| conf       | 79.419899    | 66.988274    | 69.371574    |
| cls        | 0.682625     | 0.712811     | 0.713205     |
| cls_acc    | 25.00%       | 0.00%        | 0.00%        |
| recall50   | 0.000000     | 0.000000     | 0.000000     |
| recall75   | 0.000000     | 0.000000     | 0.000000     |
| precision  | 0.000000     | 0.000000     | 0.000000     |
| conf_obj   | 0.615010     | 0.567150     | 0.492626     |
| conf_noobj | 0.535675     | 0.477527     | 0.493937     |
+------------+--------------+--------------+--------------+
Total loss 223.30276489257812
---- ETA 0:31:38.819796
imgs torch.Size([1, 3, 320, 320])
targets torch.Size([1, 6])
torch.Size([1, 66, 10, 10])
torch.Size([1, 3, 10, 10, 22])
torch.Size([3, 1])
torch.Size([1, 66, 20, 20])
torch.Size([1, 3, 20, 20, 22])
torch.Size([3, 1])
Traceback (most recent call last):
  File "D:\BaiduNetdiskDownload\yolo\main\PyTorch-YOLOv3\train.py", line 115, in <module>
    loss, outputs = model(imgs, targets)
  File "D:\pytorch\envs\tensorflow\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\BaiduNetdiskDownload\yolo\main\PyTorch-YOLOv3\models.py", line 252, in forward
    x = module(x)
  File "D:\pytorch\envs\tensorflow\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\pytorch\envs\tensorflow\lib\site-packages\torch\nn\modules\container.py", line 141, in forward
    input = module(input)
  File "D:\pytorch\envs\tensorflow\lib\site-packages\torch\nn\modules\module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "D:\pytorch\envs\tensorflow\lib\site-packages\torch\nn\modules\activation.py", line 758, in forward
    return F.leaky_relu(input, self.negative_slope, self.inplace)
  File "D:\pytorch\envs\tensorflow\lib\site-packages\torch\nn\functional.py", line 1618, in leaky_relu
    result = torch._C._nn.leaky_relu(input, negative_slope)
RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 4.00 GiB total capacity; 772.11 MiB already allocated; 0 bytes free; 840.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Process finished with exit code 1
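
From the run output, two things stand out: TensorFlow (apparently pulled in by the Logger) creates its own GPU device with about 1.65 GB of the 4 GiB card and then logs several CUDA_ERROR_OUT_OF_MEMORY allocation failures, and the PyTorch error itself recommends setting max_split_size_mb via PYTORCH_CUDA_ALLOC_CONF. Below is a minimal sketch of both mitigations, assuming the installed TensorFlow is 2.x and is only needed for summary logging; none of this comes from the original train.py.

# Hypothetical mitigation sketch, not taken from the question's code.
import os

# The PyTorch error message itself suggests this allocator option (the value
# 128 here is just an example); it must be set before torch initializes CUDA,
# e.g. at the very top of train.py.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128")

# Assumption: TensorFlow 2.x is in this process only for the summary Logger,
# so it does not need the GPU. Hiding the GPU from it (or enabling memory
# growth) stops it from pre-reserving a large slice of the 4 GiB card.
import tensorflow as tf

gpus = tf.config.list_physical_devices("GPU")
if gpus:
    # Option A: keep TensorFlow on the CPU entirely.
    tf.config.set_visible_devices([], "GPU")
    # Option B (alternative): let TensorFlow allocate on demand instead.
    # for gpu in gpus:
    #     tf.config.experimental.set_memory_growth(gpu, True)

import torch  # import torch only after the allocator env var is in place

Either TensorFlow option has to run before the first summary writer touches the GPU, i.e. before Logger("logs") is constructed in the script above.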


My approach and what I have tried
The result I want to achieve