bev-project/START_PHASE4B_RMTPPAD_SEGME...

148 lines
5.1 KiB
Bash
Executable File
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# BEVFusion Phase 4B: RMT-PPAD segmentation training launcher.
# The original header describes this as the numerical-stability-enhanced
# variant (NaN/inf detection and automatic recovery).
printf '%s\n' \
  "🔧🔧🔧 数值稳定性增强训练启动脚本 🔧🔧🔧" \
  "Phase 4B: RMT-PPAD Transformer分割集成" \
  "时间: $(date)" \
  "==========================================="

# --- Training configuration ---
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="/data/runs/phase4b_rmtppad_segmentation"
GPUS=8
PORT=29500

# --- Resume configuration ---
RESUME_FROM="/workspace/bevfusion/runs/run-326653dc-41917c58/latest.pth"
AUTO_RESUME=false # whether to resume automatically from the newest checkpoint

# Print a summary of the configuration chosen above.
printf '%s\n' \
  "📋 训练配置:" \
  " - 配置文件: $CONFIG_FILE" \
  " - 工作目录: $WORK_DIR" \
  " - GPU数量: $GPUS" \
  " - 主端口: $PORT"

# Report whether the configured checkpoint file is present on disk.
if [ ! -f "$RESUME_FROM" ]; then
  printf '%s\n' " - 恢复模式: ❌ 从头开始训练"
else
  printf '%s\n' \
    " - 恢复模式: ✅ 从checkpoint恢复" \
    " - Checkpoint: $RESUME_FROM"
fi
printf '\n'
# Create the working directory. The training log is later redirected into
# "$WORK_DIR/train.log", so stop here if the directory cannot be created
# rather than failing at launch time. The path is quoted so that a value
# containing spaces stays a single argument.
if ! mkdir -p -- "$WORK_DIR"; then
  echo "❌ 无法创建工作目录: $WORK_DIR" >&2
  exit 1
fi
#######################################
# Numerical-stability monitor: periodically scans the training log for the
# cost-matrix warning line and stops once no training process is found.
# Globals:   WORK_DIR (read) - directory that contains train.log
# Arguments: $1 - seconds between checks (optional, default 60)
#            $2 - seconds to wait for the training process to appear before
#                 the first liveness check (optional, default 300)
# Outputs:   status messages on stdout
#######################################
monitor_numerical_stability() {
  local poll_interval=${1:-60}
  local startup_grace=${2:-300}
  local log_file="$WORK_DIR/train.log"
  local warning="Warning: Invalid values detected in cost matrix"
  local proc_pattern="python.*train.py"
  local waited=0
  local reported=0
  local hits

  echo "📊 启动数值稳定性监控..."

  # This function is put in the background before the trainer is launched.
  # Give the training process time to show up; without this the very first
  # liveness check below would end the monitor immediately.
  until pgrep -f "$proc_pattern" > /dev/null; do
    if (( waited >= startup_grace )); then
      break
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done

  while true; do
    # Count matching lines so that only warnings added since the previous
    # check are reported. grep -c exits non-zero for "no match" and for a
    # missing log file; both are treated as zero hits.
    hits=$(grep -c -F -- "$warning" "$log_file" 2>/dev/null) || hits=0
    if (( hits > reported )); then
      echo "⚠️ 检测到数值问题,记录已保存到日志"
    fi
    # Also follows the count downwards if the log was truncated or replaced.
    reported=$hits

    # Stop monitoring once no training process is left.
    if ! pgrep -f "$proc_pattern" > /dev/null; then
      echo " 训练进程已结束"
      break
    fi
    sleep "$poll_interval"
  done
}
# Run the log monitor as a background job.
monitor_numerical_stability &

# Announce the distributed training launch.
echo "🚀 启动8卡分布式训练..."

# Work out the resume arguments. An existing RESUME_FROM file takes
# precedence; otherwise, when AUTO_RESUME is true, search the runs directory
# for "latest.pth" symlinks and take the first entry listed by `ls -t`.
RESUME_ARGS=""
if [[ ! -f "$RESUME_FROM" ]]; then
  if [[ "$AUTO_RESUME" = true ]]; then
    echo "🔍 启用自动恢复模式查找最新checkpoint..."
    LATEST_CHECKPOINT=$(
      find /workspace/bevfusion/runs/ -name "latest.pth" -type l -exec ls -t {} + 2>/dev/null \
        | head -1
    )
    if [[ -n "$LATEST_CHECKPOINT" ]]; then
      RESUME_ARGS="--resume-from $LATEST_CHECKPOINT"
      echo "🔄 自动恢复从: $LATEST_CHECKPOINT"
    fi
  fi
else
  echo "✅ 找到checkpoint文件: $RESUME_FROM"
  RESUME_ARGS="--resume-from $RESUME_FROM"
  echo "🔄 将从checkpoint恢复训练"
fi
# Environment setup (the original note says this mirrors a script that worked).
export PATH="/opt/conda/bin:$PATH"
# Append the inherited value only when it is non-empty. A bare trailing ':'
# would leave an empty entry, which the dynamic linker interprets as the
# current working directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Same guard for PYTHONPATH, to avoid an empty trailing search-path entry.
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
# GPUs made visible to the training processes.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Environment verification: PyTorch and mmcv must import under the conda
# interpreter and torchpack must be resolvable on PATH. Any failure prints a
# message to stderr and exits with status 1 before training is launched.
echo ""
echo "=== 环境验证 ==="
if ! /opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)"; then
  echo "❌ PyTorch导入失败" >&2
  exit 1
fi
if ! /opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)"; then
  echo "❌ mmcv导入失败" >&2
  exit 1
fi
# `command -v` is the shell built-in way to locate a command. Resolve the
# path once and reuse it so the location is printed a single time.
if ! torchpack_path=$(command -v torchpack); then
  echo "❌ torchpack未找到" >&2
  exit 1
fi
echo "✅ torchpack: $torchpack_path"
# Assemble the training command as an array so that every argument stays a
# single word regardless of its content.
TRAIN_CMD=(
  torchpack dist-run
  -np "$GPUS"
  /opt/conda/bin/python tools/train.py
  "$CONFIG_FILE"
)
# Pass --load_from only when the checkpoint file actually exists. Earlier the
# script announces training from scratch when RESUME_FROM is missing, so a
# non-existent path must not be handed to the trainer.
# NOTE(review): RESUME_ARGS computed earlier (including the AUTO_RESUME
# result) is not consumed here; confirm which flag tools/train.py expects
# before wiring it in.
if [ -f "$RESUME_FROM" ]; then
  TRAIN_CMD+=(--load_from "$RESUME_FROM")
fi
TRAIN_CMD+=(--data.samples_per_gpu 1 --data.workers_per_gpu 0)

# Show the command exactly as it is about to be run.
echo "命令: ${TRAIN_CMD[*]}"

# Launch in the background; stdout and stderr both go to the training log.
"${TRAIN_CMD[@]}" > "$WORK_DIR/train.log" 2>&1 &
# PID of the background launcher process.
TRAIN_PID=$!
# Post-launch summary: where the results and log live, the launcher PID, and
# a list of ready-to-copy monitoring commands. Written as one here-document;
# $WORK_DIR and $TRAIN_PID are expanded, everything else is literal text.
cat <<EOF

✅ Phase 4B training started!
📊 Check results in: $WORK_DIR
📈 Monitor progress: tail -f $WORK_DIR/train.log

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $WORK_DIR/train.log

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $WORK_DIR/train.log

2. 查看关键指标:
 tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps aux | grep $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF