bev-project/archive/scripts_old/START_MULTINODE_TRAINING.sh

193 lines
6.0 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# BEVFusion Phase 4A Stage 1 - multi-node, multi-GPU training launcher.
# Target topology: 2 nodes x 8 GPUs = 16 GPUs.
set -e

# ============================================================================
# Configuration - adjust to match the actual cluster
# ============================================================================

# Cluster shape: number of nodes and GPUs available on each of them.
NUM_NODES=2
GPUS_PER_NODE=8
TOTAL_GPUS=$(( NUM_NODES * GPUS_PER_NODE ))

# Node addresses and communication port.
# NOTE(review): within this script MASTER_PORT is only printed in the
# configuration summary; confirm whether the launcher is expected to use it.
MASTER_PORT="29500"            # communication port
MASTER_ADDR="192.168.1.101"    # master node IP
WORKER1_ADDR="192.168.1.102"   # worker node IP

# Host list in torchpack format: comma-separated hostname:gpu_count entries.
printf -v NODES '%s:%s,%s:%s' \
  "${MASTER_ADDR}" "${GPUS_PER_NODE}" \
  "${WORKER1_ADDR}" "${GPUS_PER_NODE}"
# ============================================================================
# Runtime environment
# ============================================================================
export PATH="/opt/conda/bin:${PATH}"

# Append the previous value only when it is non-empty. A plain ":$VAR" suffix
# leaves a trailing empty element when the variable was unset, and an empty
# element in these search paths is interpreted as the current directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:${PYTHONPATH}}"

# NCCL settings for multi-node communication.
export NCCL_DEBUG=INFO            # debug output; can be lowered to WARN in production
export NCCL_SOCKET_TIMEOUT=3600   # timeout value
export NCCL_IB_DISABLE=1          # set to 1 when there is no InfiniBand
export NCCL_SOCKET_IFNAME=eth0    # network interface name (adjust to the host)

# Relative paths used later in the script (training config, tools/train.py,
# log file) are resolved against this directory.
cd /workspace/bevfusion
# ============================================================================
# Environment check
# ============================================================================
# Start with a banner and a summary of the cluster configuration so the
# settings in effect are visible before any check runs.
printf '%s\n' \
  "========================================================================" \
  "BEVFusion Phase 4A Stage 1 - 多机多卡训练" \
  "========================================================================" \
  "" \
  "【配置信息】" \
  " 总GPU数量: ${TOTAL_GPUS}" \
  " 节点数量: ${NUM_NODES}" \
  " 每节点GPU: ${GPUS_PER_NODE}" \
  " Master节点: ${MASTER_ADDR}" \
  " Worker节点: ${WORKER1_ADDR}" \
  " 通信端口: ${MASTER_PORT}" \
  ""
# ----------------------------------------------------------------------------
# Pre-flight checks: SSH reachability, GPU count and dataset presence on the
# master and worker nodes. The first failed check prints a diagnostic and
# terminates the script with exit status 1.
# ----------------------------------------------------------------------------

# BatchMode makes ssh fail instead of prompting for a password, so a node
# without passwordless login is reported rather than blocking the script.
# ConnectTimeout bounds the wait for an unreachable host on every check.
PREFLIGHT_SSH_OPTS=(-o BatchMode=yes -o ConnectTimeout=5)

# Run a command on a node using the non-interactive pre-flight ssh options.
# Arguments: $1 - node address; remaining arguments - remote command.
preflight_ssh() {
  local addr=$1
  shift
  ssh "${PREFLIGHT_SSH_OPTS[@]}" "$addr" "$@"
}

# Verify that a trivial remote command can be run on a node over SSH.
# Arguments: $1 - role label used in the message; $2 - node address.
check_node_ssh() {
  local role=$1 addr=$2
  echo -n " 检查${role}节点SSH... "
  if preflight_ssh "$addr" "hostname" > /dev/null 2>&1; then
    echo "✓"
  else
    echo "✗ 失败"
    echo "请配置SSH免密登录: ssh-copy-id root@${addr}"
    exit 1
  fi
}

# Print the GPU count reported by nvidia-smi on a node. Prints 0 when the
# remote query fails or its output is not a non-negative integer, so callers
# can always compare the result numerically.
# Arguments: $1 - node address.
remote_gpu_count() {
  local addr=$1 raw
  raw=$(preflight_ssh "$addr" \
    "nvidia-smi --query-gpu=count --format=csv,noheader | head -1" \
    2>/dev/null) || raw=""
  raw=${raw//[[:space:]]/}
  if [[ "$raw" =~ ^[0-9]+$ ]]; then
    echo "$raw"
  else
    echo "0"
  fi
}

# Report a node's GPU count and abort when it is below GPUS_PER_NODE.
# Arguments: $1 - GPU count found on the node.
require_gpu_count() {
  local found=$1
  if [ "$found" -ge "$GPUS_PER_NODE" ]; then
    echo "✓ (${found}张)"
  else
    echo "✗ 仅发现${found}张GPU"
    exit 1
  fi
}

# Verify that the dataset directory /data/nuscenes exists on a node.
# Arguments: $1 - role label used in the message; $2 - node address.
check_node_dataset() {
  local role=$1 addr=$2
  echo -n " 检查${role}数据集... "
  if preflight_ssh "$addr" "[ -d /data/nuscenes ]"; then
    echo "✓"
  else
    echo "✗ /data/nuscenes不存在"
    exit 1
  fi
}

echo "【环境检查】"

# SSH connectivity
check_node_ssh "master" "${MASTER_ADDR}"
check_node_ssh "worker" "${WORKER1_ADDR}"

# GPU availability
echo -n " 检查master节点GPU... "
MASTER_GPU_COUNT=$(remote_gpu_count "${MASTER_ADDR}")
require_gpu_count "${MASTER_GPU_COUNT}"

echo -n " 检查worker节点GPU... "
WORKER_GPU_COUNT=$(remote_gpu_count "${WORKER1_ADDR}")
require_gpu_count "${WORKER_GPU_COUNT}"

# Dataset path
check_node_dataset "master" "${MASTER_ADDR}"
check_node_dataset "worker" "${WORKER1_ADDR}"

echo ""
echo "✅ 环境检查通过!"
echo ""
# ============================================================================
# Training configuration
# ============================================================================
# CONFIG is a relative path and is therefore resolved against the current
# working directory; the two checkpoint paths are absolute.
CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1.yaml"
PRETRAINED_SWINT="/data/pretrained/swint-nuimages-pretrained.pth"
LOAD_FROM="/data/runs/phase4a_stage1/epoch_1.pth"

# Guard clauses: stop with exit status 1 at the first required input that is
# not a regular file on the machine running this script.
[ -f "$CONFIG" ] || { echo "❌ 配置文件不存在: $CONFIG"; exit 1; }
[ -f "$PRETRAINED_SWINT" ] || { echo "❌ 预训练模型不存在: $PRETRAINED_SWINT"; exit 1; }
[ -f "$LOAD_FROM" ] || { echo "❌ 初始权重不存在: $LOAD_FROM"; exit 1; }

# Log file name carries a launch timestamp (YYYYmmdd_HHMMSS).
printf -v LOG_FILE 'phase4a_stage1_multinode_%s.log' "$(date +%Y%m%d_%H%M%S)"

# Summary of the parameters the run is about to use.
printf '%s\n' \
  "【训练参数】" \
  " 配置文件: $CONFIG" \
  " 预训练模型: $PRETRAINED_SWINT" \
  " 初始权重: $LOAD_FROM" \
  " Batch/GPU: 1" \
  " Workers/GPU: 0" \
  " 日志文件: $LOG_FILE" \
  ""
# ============================================================================
# Launch training
# ============================================================================
echo "========================================================================"
echo "开始多机训练..."
echo "========================================================================"
echo ""

# Start the multi-node run through `torchpack dist-run`.
# The command is built as an array so every argument is passed as exactly one
# word. PATH, LD_LIBRARY_PATH and PYTHONPATH are already exported earlier in
# this script, so they are inherited and not repeated on the command line.
train_cmd=(
  /opt/conda/bin/torchpack dist-run
  -np "${TOTAL_GPUS}"
  -H "${NODES}"
  /opt/conda/bin/python tools/train.py
  "${CONFIG}"
  --model.encoders.camera.backbone.init_cfg.checkpoint "${PRETRAINED_SWINT}"
  --load_from "${LOAD_FROM}"
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)

# `$?` after a pipeline is the status of its last stage (tee), which would
# report a crashed training run as success. PIPESTATUS[0] is the status of
# the training command itself. errexit is suspended around the pipeline so
# the status can be captured and reported no matter which stage fails.
set +e
"${train_cmd[@]}" 2>&1 | tee "$LOG_FILE"
TRAIN_EXIT_CODE=${PIPESTATUS[0]}
set -e

echo ""
echo "========================================================================"
if [ "$TRAIN_EXIT_CODE" -eq 0 ]; then
  echo "✅ 训练完成!"
else
  echo "❌ 训练异常退出 (exit code: $TRAIN_EXIT_CODE)"
fi
echo "========================================================================"
echo ""
echo "日志文件: $LOG_FILE"
echo "Checkpoints: /data/runs/phase4a_stage1/"
echo ""

# Propagate the training command's status to the caller.
exit "$TRAIN_EXIT_CODE"