Multi-machine, multi-GPU distributed launch problem
The bash script won't start.
/home/server/anaconda3/envs/cod/lib/python3.9/site-packages/torch/distributed/launch.py:180: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please
change it to read from `os.environ['LOCAL_RANK']` instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
warnings.warn(
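The FutureWarning itself is only a deprecation notice and does not cause a hang. For reference, a minimal torchrun sketch of the same single-process-per-node launch (an illustration, not the original command; it assumes the training script reads LOCAL_RANK from os.environ instead of taking a --local_rank argument):

torchrun \
    --nproc_per_node=1 \
    --nnodes=2 \
    --node_rank=0 \
    --master_addr="IP" \
    --master_port=$MASTER_PORT \
    My_Train_dis.py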
It hangs right after the warning is printed.
The network is fine, the firewall is turned off, torch.distributed.launch itself is fine, and other people can run the same code.
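With --nnodes=2 and --node_rank=0, torch.distributed.launch blocks at the TCP rendezvous on master_addr:master_port until the second node (node_rank=1) joins, so a silent hang right after the warning usually means the other node never connected, is pointed at a different port, or the traffic is going out the wrong network interface. A quick check to run from the second machine (a sketch; nc and the eth0 interface name are assumptions, substitute the real master IP and port):

# From the second node: confirm the master's rendezvous port is reachable
nc -zv IP 39500
# If the machines have several NICs, pin NCCL/Gloo to the one that routes between them
export NCCL_SOCKET_IFNAME=eth0
export GLOO_SOCKET_IFNAME=eth0

The launch script used on node 0: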
#!/bin/bash
export TORCH_USE_CUDA_DSA=1
export CUDA_LAUNCH_BLOCKING=1
SCRIPT_PATH="$(cd "$(dirname "$0")"; pwd -P)"
SCRIPT_NAME=$(basename "$0")
export NCCL_DEBUG=INFO
LOG_DATE="$(date +'%Y%m%d')"
LOG_DIR="${SCRIPT_PATH}/logs"
LOG_FILE="${LOG_DIR}/${SCRIPT_NAME}.log-${LOG_DATE}"
sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1
sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1
# Pick a random port and record it in a file; note that the run_script calls at the
# bottom pass a hardcoded 39500, so this random value is not used for the launches.
MASTER_PORT=$(shuf -i 10000-65000 -n 1)
echo $MASTER_PORT > /home/server/project/gtc/ddp_master_port.txt
# Create the log directory
mkdir -p "${LOG_DIR}"
# ------------------ Parse GPU arguments ------------------
GPU_IDS=""
if [[ $# -gt 0 ]]; then
    GPU_IDS="$1"
fi
# If no GPU_IDS argument was given, fall back to all GPUs
if [[ -z "$GPU_IDS" ]]; then
    GPU_IDS="0,1,2,3" # adjust this default to match the machine
fi
export CUDA_VISIBLE_DEVICES=$GPU_IDS
echo "Using GPUs: ${GPU_IDS:-All available}"
run_script() {
    local script_name=$1
    local MASTER_PORT=$2
    echo "-----------------------------------------------"
    echo "Running $script_name with GPUs ${GPU_IDS:-All}..."
    echo "-----------------------------------------------"
    export MASTER_PORT=$MASTER_PORT
    echo "Starting training"
    # --node_rank=0 is hardcoded, so this script only launches the master node;
    # with --nnodes=2 the launcher waits until a second node with --node_rank=1
    # joins the rendezvous at --master_addr:--master_port.
    # --nproc_per_node=1 starts one process per node even if several GPUs are visible.
    /home/server/anaconda3/envs/cod/bin/python -m torch.distributed.launch \
        --nproc_per_node=1 \
        --nnodes=2 \
        --node_rank=0 \
        --master_addr="IP" \
        --master_port=$MASTER_PORT \
        "$script_name".py \
        2>&1 | tee -a "$LOG_FILE"
    echo "Run finished"
    echo "$script_name completed. Logs appended to $LOG_FILE"
}
run_script My_Train_dis 39500
run_script My_Testing_dis 39500
run_script eval_dis 39500
echo "All scripts executed. Check logs at $LOG_FILE"