# Error under investigation: torch.distributed.DistNetworkError: Connection reset by peer
# Resolve the script's own directory and set up a dated log file.
SCRIPT_PATH="$(cd "$(dirname "$0")"; pwd -P)"
LOG_DIR="${SCRIPT_PATH}/logs"
mkdir -p "$LOG_DIR"
LOG_FILE="${LOG_DIR}/train.log-$(date +'%Y%m%d')"
# Disable IPv6 system-wide — presumably to force the torch.distributed
# rendezvous onto IPv4 (a known workaround for reset errors on dual-stack
# hosts); NOTE(review): requires sudo and affects the whole machine.
sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
# Fixed master address and port — every node in the job must use the same pair.
MASTER_ADDR="(隐私,不方便透露)"
MASTER_PORT="39500"
# GPU id list is required; fail fast with a usage hint instead of silently
# exporting an empty CUDA_VISIBLE_DEVICES (which would expose all GPUs).
GPU_IDS="${1:?Usage: $0 <gpu_ids> [node_rank]}"
# Per-node rank for torchrun: 0 on the master node, 1..N-1 on workers.
# Defaults to 0 so existing single-argument invocations keep working.
NODE_RANK="${2:-0}"
export CUDA_VISIBLE_DEVICES="$GPU_IDS"
# Log a startup banner with the key connection settings (stdout + log file).
{
  echo "-----------------------------------------------"
  echo "Starting distributed training"
  echo "Master Address: $MASTER_ADDR"
  echo "Master Port: $MASTER_PORT"
  echo "CUDA_VISIBLE_DEVICES: $CUDA_VISIBLE_DEVICES"
  echo "-----------------------------------------------"
} | tee -a "$LOG_FILE"
# Run one Python training script under torchrun, appending all output to the log.
# Arguments:
#   $1 - script basename (without the .py extension)
# Reads:  MASTER_ADDR, MASTER_PORT, GPU_IDS, LOG_FILE (set at top of file);
#         NODE_RANK (optional env var, this node's rank — defaults to 0).
# Exits the whole script with status 1 if torchrun fails.
run_script() {
  local script_name=$1
  # BUG FIX: --node_rank was hard-coded to 0. With --nnodes=2 both machines
  # claimed rank 0, so the rendezvous failed ("Connection reset by peer").
  # Each node must pass its own rank (0 on the master, 1 on the worker).
  local node_rank="${NODE_RANK:-0}"
  echo "Running $script_name..." | tee -a "$LOG_FILE"
  torchrun --nproc_per_node=1 --nnodes=2 \
    --node_rank="$node_rank" --master_addr="$MASTER_ADDR" --master_port="$MASTER_PORT" \
    "$script_name.py" --gpu_id "$GPU_IDS" 2>&1 | tee -a "$LOG_FILE"
  # BUG FIX: $? after the pipeline is tee's exit status (almost always 0),
  # not torchrun's, so failures were never caught. Use PIPESTATUS[0].
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "Error in $script_name. Check logs." | tee -a "$LOG_FILE"
    exit 1
  fi
}
# Run the pipeline stages in order; run_script aborts the script on failure.
for stage in MyTrain_Val_ddp MyTesting_ddp eval_ddp; do
  run_script "$stage"
done
echo "All scripts executed successfully." | tee -a "$LOG_FILE"
# NOTE: The firewall was disabled and network connectivity verified, yet the error above persisted.
# This is a multi-node, multi-GPU distributed training job that could not start because of it.
# Likely root cause: --node_rank was hard-coded to 0, so both nodes claimed rank 0 and the
# master reset the duplicate connection; pass each node its own rank (e.g. NODE_RANK=1 on the worker).