# bev-project/START_PHASE4B_RMTPPAD_SEGME...
#
# 148 lines
# 5.1 KiB
# Bash
# Raw Permalink Normal View History
#
# 2025-11-21 10:50:51 +08:00
#!/bin/bash
# 🚀🚀🚀 BEVFusion Phase 4B: RMT-PPAD Segmentation Training 🚀🚀🚀
# Launcher with enhanced numerical stability: pairs the training run with a
# background monitor that watches the log for NaN/inf symptoms (see
# monitor_numerical_stability below).
echo "🔧🔧🔧 数值稳定性增强训练启动脚本 🔧🔧🔧"
echo "Phase 4B: RMT-PPAD Transformer分割集成"
echo "时间: $(date)"
echo "==========================================="

# --- Training configuration ---
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="/data/runs/phase4b_rmtppad_segmentation"
GPUS=8
# NOTE(review): PORT is never referenced below — torchpack picks its own
# rendezvous port. Kept for documentation; wire it up or drop it.
PORT=29500

# --- 🔄 Resume configuration 🔄 ---
RESUME_FROM="/workspace/bevfusion/runs/run-326653dc-41917c58/latest.pth"
AUTO_RESUME=false # whether to auto-resume from the newest checkpoint found

echo "📋 训练配置:"
echo " - 配置文件: $CONFIG_FILE"
echo " - 工作目录: $WORK_DIR"
echo " - GPU数量: $GPUS"
echo " - 主端口: $PORT"
if [ -f "$RESUME_FROM" ]; then
echo " - 恢复模式: ✅ 从checkpoint恢复"
echo " - Checkpoint: $RESUME_FROM"
else
echo " - 恢复模式: ❌ 从头开始训练"
fi
echo ""

# Create the work directory (quoted — SC2086-safe even if the path changes).
mkdir -p "$WORK_DIR"
# 🔧🔧🔧 Numerical-stability watchdog 🔧🔧🔧
# Polls the training log once a minute for the cost-matrix NaN/inf warning
# emitted by the (patched) matcher, and exits once the training process is
# gone. Reads global WORK_DIR.
monitor_numerical_stability() {
echo "📊 启动数值稳定性监控..."
# Fix(review): the original re-printed the warning every minute once it first
# appeared in the log; report it only once per monitor lifetime.
local warned=0
while true; do
# Check for new numerical problems in the training log (path quoted: SC2086).
if [ "$warned" -eq 0 ] && grep -q "Warning: Invalid values detected in cost matrix" "$WORK_DIR/train.log" 2>/dev/null; then
echo "⚠️ 检测到数值问题,记录已保存到日志"
warned=1
fi
# Stop monitoring when the training process has exited.
if ! pgrep -f "python.*train.py" > /dev/null; then
echo " 训练进程已结束"
break
fi
sleep 60 # poll once per minute
done
}
# Run the stability watchdog alongside training (background job).
monitor_numerical_stability &

# 🚀 Kick off the distributed run.
echo "🚀 启动8卡分布式训练..."

# Build checkpoint-resume flags. RESUME_ARGS stays empty when there is
# nothing to resume from, so it can always be expanded on the command line.
RESUME_ARGS=""
if [ -f "$RESUME_FROM" ]; then
echo "✅ 找到checkpoint文件: $RESUME_FROM"
echo "🔄 将从checkpoint恢复训练"
RESUME_ARGS="--resume-from $RESUME_FROM"
elif [ "$AUTO_RESUME" = true ]; then
echo "🔍 启用自动恢复模式查找最新checkpoint..."
# Most recently modified latest.pth symlink under runs/, if any.
LATEST_CHECKPOINT=$(find /workspace/bevfusion/runs/ -name "latest.pth" -type l -exec ls -t {} + 2>/dev/null | head -1)
if [ -n "$LATEST_CHECKPOINT" ]; then
echo "🔄 自动恢复从: $LATEST_CHECKPOINT"
RESUME_ARGS="--resume-from $LATEST_CHECKPOINT"
fi
fi
# ✅ Environment setup (mirrors the previously working launch script).
export PATH=/opt/conda/bin:$PATH
export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export PYTHONPATH=/workspace/bevfusion:$PYTHONPATH
# Expose all 8 GPUs to the job.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Sanity-check the runtime before launching; fail fast with diagnostics on
# stderr if any prerequisite is missing.
echo ""
echo "=== 环境验证 ==="
/opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)" || {
echo "❌ PyTorch导入失败" >&2
exit 1
}
/opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" || {
echo "❌ mmcv导入失败" >&2
exit 1
}
# `command -v` is the portable/builtin replacement for `which`; redirecting
# stdout also avoids the original's duplicate path print.
command -v torchpack >/dev/null || {
echo "❌ torchpack未找到" >&2
exit 1
}
echo "✅ torchpack: $(command -v torchpack)"
# Launch 8-GPU distributed training in the background, logging to train.log.
# Fix(review): the original passed `--load_from "$RESUME_FROM"` unconditionally,
# ignoring the RESUME_ARGS built above and handing train.py a nonexistent
# checkpoint path when starting fresh. Expanding $RESUME_ARGS instead omits the
# flag entirely when there is no checkpoint.
# NOTE(review): RESUME_ARGS uses `--resume-from` (full resume: optimizer state,
# epoch) rather than `--load_from` (weights only) — confirm tools/train.py
# accepts this flag and that a full resume is the intended behavior.
echo "命令: torchpack dist-run -np $GPUS /opt/conda/bin/python tools/train.py $CONFIG_FILE $RESUME_ARGS --data.samples_per_gpu 1 --data.workers_per_gpu 0"
# shellcheck disable=SC2086 # RESUME_ARGS must word-split into flag + path
torchpack dist-run \
-np "$GPUS" \
/opt/conda/bin/python tools/train.py \
"$CONFIG_FILE" \
$RESUME_ARGS \
--data.samples_per_gpu 1 \
--data.workers_per_gpu 0 \
> "$WORK_DIR/train.log" 2>&1 &
# Remember the PID of the backgrounded torchpack job so the operator can
# monitor or stop it, then print a usage cheat-sheet in one here-doc
# (unquoted delimiter: $WORK_DIR / $TRAIN_PID expand as in the echoes).
TRAIN_PID=$!
cat <<EOF

✅ Phase 4B training started!
📊 Check results in: $WORK_DIR
📈 Monitor progress: tail -f $WORK_DIR/train.log

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $WORK_DIR/train.log

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $WORK_DIR/train.log

2. 查看关键指标:
 tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps aux | grep $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF