#!/bin/bash
# BEVFusion Phase 4A Stage 1 - multi-node, multi-GPU training launcher.
# Supports: 2 nodes x 8 GPUs = 16 GPUs.

# Abort on the first unhandled command failure.
set -e

# ============================================================================
# Configuration - adjust to match the actual environment
# ============================================================================

# Node configuration
MASTER_ADDR="192.168.1.101"   # master node IP
WORKER1_ADDR="192.168.1.102"  # worker node IP
MASTER_PORT="29500"           # communication port

# Number of GPUs on each node, and number of nodes
GPUS_PER_NODE=8
NUM_NODES=2
TOTAL_GPUS=$((GPUS_PER_NODE * NUM_NODES))

# Node list (torchpack format: hostname:gpu_count)
NODES="${MASTER_ADDR}:${GPUS_PER_NODE},${WORKER1_ADDR}:${GPUS_PER_NODE}"

# ============================================================================
# Environment setup
# ============================================================================

export PATH="/opt/conda/bin:${PATH}"

# Append the inherited value only when it is non-empty. A plain
# "...:$LD_LIBRARY_PATH" leaves a trailing ':' when the variable is unset, and
# the dynamic loader treats that empty entry as the current directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Same guard for PYTHONPATH, so no dangling empty entry is added when unset.
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:${PYTHONPATH}}"

# NCCL settings (multi-node communication tuning)
export NCCL_DEBUG=INFO           # debug output (may be lowered to WARN in production)
export NCCL_SOCKET_TIMEOUT=3600  # timeout value
export NCCL_IB_DISABLE=1         # set to 1 when there is no InfiniBand
export NCCL_SOCKET_IFNAME=eth0   # network interface name (adjust to the actual one)

# Work from the repository root: the config path, tools/train.py and the log
# file used later in this script are all relative paths.
cd /workspace/bevfusion || exit 1

# ============================================================================
# Environment checks
# ============================================================================

# Print the launch banner and the cluster layout that will be used.
echo "========================================================================"
echo "BEVFusion Phase 4A Stage 1 - 多机多卡训练"
echo "========================================================================"
echo ""
echo "【配置信息】"
echo " 总GPU数量: ${TOTAL_GPUS}"
echo " 节点数量: ${NUM_NODES}"
echo " 每节点GPU: ${GPUS_PER_NODE}"
echo " Master节点: ${MASTER_ADDR}"
echo " Worker节点: ${WORKER1_ADDR}"
echo " 通信端口: ${MASTER_PORT}"
echo ""

# Pre-flight checks: SSH reachability, GPU count and dataset presence on both
# nodes. Any failed check prints a diagnostic and exits with status 1.

# Options shared by every SSH probe:
#   ConnectTimeout=5 - give up quickly on an unreachable host
#   BatchMode=yes    - never prompt for a password, so the probe only succeeds
#                      when non-interactive (key-based) login really works
SSH_OPTS=(-o ConnectTimeout=5 -o BatchMode=yes)

#######################################
# Verify that a node accepts non-interactive SSH logins.
# Arguments: $1 - node label used in messages ("master" / "worker")
#            $2 - host address
# Outputs:   progress and result on stdout
# Returns:   exits the script with status 1 on failure
#######################################
check_ssh() {
  local label=$1 host=$2
  echo -n " 检查${label}节点SSH... "
  if ssh "${SSH_OPTS[@]}" "$host" "hostname" > /dev/null 2>&1; then
    echo "✓"
  else
    echo "✗ 失败"
    echo "请配置SSH免密登录: ssh-copy-id root@${host}"
    exit 1
  fi
}

#######################################
# Verify that a node reports at least GPUS_PER_NODE GPUs via nvidia-smi.
# Globals:   GPUS_PER_NODE (read)
# Arguments: $1 - node label used in messages
#            $2 - host address
# Outputs:   progress and result on stdout
# Returns:   exits the script with status 1 when too few GPUs are reported
#######################################
check_gpus() {
  local label=$1 host=$2 count
  echo -n " 检查${label}节点GPU... "
  count=$(ssh "${SSH_OPTS[@]}" "$host" \
    "nvidia-smi --query-gpu=count --format=csv,noheader | head -1" 2>/dev/null || echo "0")
  # Anything that is not a single non-negative integer (empty output, error
  # text, several lines) counts as zero GPUs instead of breaking the numeric
  # comparison below.
  if [[ "$count" =~ ^[[:space:]]*([0-9]+)[[:space:]]*$ ]]; then
    count=${BASH_REMATCH[1]}
  else
    count=0
  fi
  if [ "$count" -ge "$GPUS_PER_NODE" ]; then
    echo "✓ (${count}张)"
  else
    echo "✗ 仅发现${count}张GPU"
    exit 1
  fi
}

#######################################
# Verify that the dataset directory /data/nuscenes exists on a node.
# Arguments: $1 - node label used in messages
#            $2 - host address
# Outputs:   progress and result on stdout
# Returns:   exits the script with status 1 when the directory is missing
#######################################
check_dataset() {
  local label=$1 host=$2
  echo -n " 检查${label}数据集... "
  if ssh "${SSH_OPTS[@]}" "$host" "[ -d /data/nuscenes ]"; then
    echo "✓"
  else
    echo "✗ /data/nuscenes不存在"
    exit 1
  fi
}

echo "【环境检查】"

# Check SSH connectivity
check_ssh "master" "${MASTER_ADDR}"
check_ssh "worker" "${WORKER1_ADDR}"

# Check GPU availability
check_gpus "master" "${MASTER_ADDR}"
check_gpus "worker" "${WORKER1_ADDR}"

# Check dataset paths
check_dataset "master" "${MASTER_ADDR}"
check_dataset "worker" "${WORKER1_ADDR}"

echo ""
echo "✅ 环境检查通过!"
echo ""

# ============================================================================
# Training configuration
# ============================================================================

CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1.yaml"
PRETRAINED_SWINT="/data/pretrained/swint-nuimages-pretrained.pth"
LOAD_FROM="/data/runs/phase4a_stage1/epoch_1.pth"

#######################################
# Abort the script unless a required regular file exists.
# Arguments: $1 - human-readable label used in the error message
#            $2 - path to check (relative paths resolve against the current
#                 working directory)
# Outputs:   error message on stdout when the file is missing
# Returns:   exits the script with status 1 when the file is missing
#######################################
require_file() {
  local label=$1 path=$2
  if [ ! -f "$path" ]; then
    echo "❌ ${label}不存在: ${path}"
    exit 1
  fi
}

# Check that the config file and both checkpoints exist before launching
require_file "配置文件" "$CONFIG"
require_file "预训练模型" "$PRETRAINED_SWINT"
require_file "初始权重" "$LOAD_FROM"

# Log file (timestamped, created in the current working directory)
LOG_FILE="phase4a_stage1_multinode_$(date +%Y%m%d_%H%M%S).log"

echo "【训练参数】"
echo " 配置文件: $CONFIG"
echo " 预训练模型: $PRETRAINED_SWINT"
echo " 初始权重: $LOAD_FROM"
echo " Batch/GPU: 1"
echo " Workers/GPU: 0"
echo " 日志文件: $LOG_FILE"
echo ""

# ============================================================================
# Launch training
# ============================================================================

echo "========================================================================"
echo "开始多机训练..."
echo "========================================================================"
echo ""

# Launch multi-node training with torchpack dist-run.
# The command is built as an array so every argument stays a single word even
# if a path contains whitespace. PATH, LD_LIBRARY_PATH and PYTHONPATH are
# exported earlier in this script and are inherited by the child process, so
# they are not prepended a second time here.
train_cmd=(
  /opt/conda/bin/torchpack dist-run
  -np "${TOTAL_GPUS}"
  -H "${NODES}"
  /opt/conda/bin/python tools/train.py
  "${CONFIG}"
  --model.encoders.camera.backbone.init_cfg.checkpoint "${PRETRAINED_SWINT}"
  --load_from "${LOAD_FROM}"
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)

# $? after a pipeline is the status of its last stage (tee), which would report
# success even when training crashed; PIPESTATUS[0] is the trainer's own
# status. errexit is lifted around the pipeline so a failed run still reaches
# the summary below regardless of whether pipefail is in effect.
set +e
"${train_cmd[@]}" 2>&1 | tee "$LOG_FILE"
TRAIN_EXIT_CODE=${PIPESTATUS[0]}
set -e

echo ""
echo "========================================================================"
if [ "$TRAIN_EXIT_CODE" -eq 0 ]; then
  echo "✅ 训练完成!"
else
  echo "❌ 训练异常退出 (exit code: $TRAIN_EXIT_CODE)"
fi
echo "========================================================================"
echo ""
echo "日志文件: $LOG_FILE"
echo "Checkpoints: /data/runs/phase4a_stage1/"
echo ""

# Propagate the trainer's status to the caller.
exit "$TRAIN_EXIT_CODE"