#!/bin/bash

# BEVFusion Phase 4B: RMT-PPAD segmentation training launcher.
#
# The original header describes this as the "enhanced numerical stability"
# variant with NaN/inf detection and automatic recovery. What this script
# itself does is launch training in the background and watch the training
# log for a cost-matrix warning.

# Start-up banner.
echo "🔧🔧🔧 数值稳定性增强训练启动脚本 🔧🔧🔧"
echo "Phase 4B: RMT-PPAD Transformer分割集成"
echo "时间: $(date)"
echo "==========================================="

# Training configuration.
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="/data/runs/phase4b_rmtppad_segmentation"
GPUS=8
# NOTE(review): PORT is only echoed in the visible part of this script; it is
# not passed to the launcher. Confirm whether it is still needed.
PORT=29500

# Checkpoint / resume configuration.
RESUME_FROM="/workspace/bevfusion/runs/run-326653dc-41917c58/latest.pth"
AUTO_RESUME=false  # whether to look for the most recent checkpoint automatically

# Print the effective training configuration, including whether the
# configured checkpoint file is present on disk.
echo "📋 训练配置:"
echo " - 配置文件: $CONFIG_FILE"
echo " - 工作目录: $WORK_DIR"
echo " - GPU数量: $GPUS"
echo " - 主端口: $PORT"
if [[ -f "$RESUME_FROM" ]]; then
    echo " - 恢复模式: ✅ 从checkpoint恢复"
    echo " - Checkpoint: $RESUME_FROM"
else
    echo " - 恢复模式: ❌ 从头开始训练"
fi
echo ""

# Create the working directory. The training log is redirected into it later,
# so stop here if it cannot be created instead of failing at launch time.
mkdir -p -- "$WORK_DIR" || {
    echo "❌ 无法创建工作目录: $WORK_DIR" >&2
    exit 1
}

#######################################
# Numerical-stability monitor.
#
# Polls "$WORK_DIR/train.log" once a minute for the cost-matrix warning and
# prints a notice only when the number of matching lines has grown since the
# previous poll. Returns once the training process is no longer running.
#
# Globals:   WORK_DIR (read)
# Arguments: $1 - optional PID of the training launcher. When given, liveness
#                 is checked with `kill -0` on that PID; otherwise any process
#                 whose command line matches "python.*train.py" counts.
# Outputs:   status messages on stdout
#######################################
monitor_numerical_stability() {
    local train_pid="${1:-}"
    local log_file="$WORK_DIR/train.log"
    local needle="Warning: Invalid values detected in cost matrix"
    local reported=0
    local hits
    local alive

    echo "📊 启动数值稳定性监控..."

    while true; do
        # grep -c exits non-zero for "no match" and for "no such file";
        # both simply mean zero hits here.
        hits=$(grep -cF -- "$needle" "$log_file" 2>/dev/null) || true
        hits=${hits:-0}
        if (( hits > reported )); then
            echo "⚠️ 检测到数值问题,记录已保存到日志"
        fi
        # Unconditional so the baseline also drops if the log was truncated.
        reported=$hits

        # Is training still running?
        alive=1
        if [[ -n "$train_pid" ]]; then
            kill -0 "$train_pid" 2>/dev/null || alive=0
        else
            pgrep -f "python.*train.py" > /dev/null || alive=0
        fi
        if (( ! alive )); then
            echo "ℹ️ 训练进程已结束"
            break
        fi

        sleep 60  # poll once per minute
    done
}

# Start the monitor in the background.
# Training is launched further down in this script, and the monitor stops as
# soon as it sees no matching training process. Starting it immediately would
# therefore make it exit on its first check, so wait (for at most ~5 minutes)
# until a training process shows up before handing over to the monitor.
(
    for ((waited = 0; waited < 300; waited++)); do
        pgrep -f "python.*train.py" > /dev/null && break
        sleep 1
    done
    monitor_numerical_stability
) &

# Launch distributed training.
echo "🚀 启动8卡分布式训练..."

# Work out the checkpoint arguments.
#   1. If RESUME_FROM exists, use it.
#   2. Otherwise, if AUTO_RESUME is "true", use the first "latest.pth" symlink
#      that `ls -t` lists under /workspace/bevfusion/runs/.
# NOTE(review): RESUME_ARGS is assigned here but no other line in the visible
# script reads it - confirm whether the launch command is meant to use it.
RESUME_ARGS=""
if [[ -f "$RESUME_FROM" ]]; then
    echo "✅ 找到checkpoint文件: $RESUME_FROM"
    RESUME_ARGS="--resume-from $RESUME_FROM"
    echo "🔄 将从checkpoint恢复训练"
elif [[ "$AUTO_RESUME" == true ]]; then
    echo "🔍 启用自动恢复模式,查找最新checkpoint..."
    LATEST_CHECKPOINT=$(find /workspace/bevfusion/runs/ -name "latest.pth" -type l -exec ls -t {} + 2>/dev/null | head -1)
    if [[ -n "$LATEST_CHECKPOINT" ]]; then
        RESUME_ARGS="--resume-from $LATEST_CHECKPOINT"
        echo "🔄 自动恢复从: $LATEST_CHECKPOINT"
    fi
fi

# Environment variables (per the original note, taken from a script that was
# known to work).
export PATH="/opt/conda/bin:$PATH"
# Append the previous value only when it is non-empty. A bare trailing ':'
# is an empty search-path entry, which stands for the current directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"

# GPUs made visible to the training processes.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Verify the environment before launching: PyTorch and mmcv must import with
# the conda interpreter, and torchpack must be on PATH. Abort on the first
# failure.
echo ""
echo "=== 环境验证 ==="

/opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)" || {
    echo "❌ PyTorch导入失败"
    exit 1
}

/opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" || {
    echo "❌ mmcv导入失败"
    exit 1
}

# `command -v` is the shell built-in lookup; unlike `which` it does not
# depend on an external tool being installed.
command -v torchpack > /dev/null || {
    echo "❌ torchpack未找到"
    exit 1
}
echo "✅ torchpack: $(command -v torchpack)"

# Assemble the training command as an array so every argument stays a single
# word, and so the command that is printed is exactly the one that is run.
TRAIN_CMD=(
    torchpack dist-run
    -np "$GPUS"
    /opt/conda/bin/python tools/train.py
    "$CONFIG_FILE"
)
# Only pass --load_from when the checkpoint file is actually there; the
# configuration banner announces training from scratch otherwise.
if [[ -f "$RESUME_FROM" ]]; then
    TRAIN_CMD+=(--load_from "$RESUME_FROM")
fi
TRAIN_CMD+=(--data.samples_per_gpu 1 --data.workers_per_gpu 0)

echo "命令: ${TRAIN_CMD[*]}"

# NOTE(review): no run directory is passed to train.py here; WORK_DIR only
# receives train.log. Confirm where checkpoints are expected to be written.
"${TRAIN_CMD[@]}" > "$WORK_DIR/train.log" 2>&1 &

# PID of the background launcher process.
TRAIN_PID=$!

# Final summary: where the results and log are, the launcher PID, and a list
# of commands for following or stopping the run.
cat <<EOF

✅ Phase 4B training started!
📊 Check results in: $WORK_DIR
📈 Monitor progress: tail -f $WORK_DIR/train.log

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $WORK_DIR/train.log

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $WORK_DIR/train.log

2. 查看关键指标:
 tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps aux | grep $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF