bev-project/START_PHASE4B_RMTPPAD_SEGME...

127 lines
4.4 KiB
Bash
Raw Permalink Normal View History

2025-11-21 10:50:51 +08:00
#!/bin/bash
# BEVFusion Phase 4B: RMT-PPAD segmentation training (resume).
#
# Integrates the RMT-PPAD Transformer segmentation decoder and restarts
# training from the latest checkpoint (the run includes the numerical
# stability fixes).

# --- Script configuration ---
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="runs/phase4b_rmtppad_segmentation"
GPUS=8
# NOTE(review): PORT is echoed below but is not passed to any command in the
# visible part of this script -- confirm whether the launcher should use it.
PORT=29501

# Startup banner showing the effective settings.
echo "🚀 Starting BEVFusion Phase 4B: RMT-PPAD Segmentation Integration"
echo "📁 Config: $CONFIG_FILE"
echo "💾 Work Dir: $WORK_DIR"
echo "🎮 GPUs: $GPUS"
echo "🔌 Port: $PORT"
echo ""

# Create the working directory. The training log is redirected into it later,
# so stop right away if it cannot be created.
mkdir -p -- "$WORK_DIR" || {
  echo "❌ Failed to create work dir: $WORK_DIR" >&2
  exit 1
}
# Key step: set up environment variables (modelled on a known-working script).
export PATH="/opt/conda/bin:$PATH"
# Append the previous value only when it is non-empty. A bare trailing ':'
# would add an empty entry, which the dynamic loader interprets as the current
# working directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Same guard for PYTHONPATH so no empty search-path entry is created.
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64
# Select the GPUs visible to the training processes.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Verify the environment: abort before launching anything if a required
# Python package cannot be imported or the torchpack launcher is not on PATH.
echo ""
echo "=== 环境验证 ==="
if ! /opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)"; then
  echo "❌ PyTorch导入失败"
  exit 1
fi
if ! /opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)"; then
  echo "❌ mmcv导入失败"
  exit 1
fi
# `command -v` is a shell builtin, so the lookup does not depend on an
# external `which` binary. Resolve the path once and reuse it in the report.
torchpack_bin="$(command -v torchpack)" || {
  echo "❌ torchpack未找到"
  exit 1
}
echo "✅ torchpack: $torchpack_bin"
# Resume training: load the latest full checkpoint.

#######################################
# Inspect a checkpoint file and record how training should load it.
# Globals:
#   LOAD_CKPT (written) - checkpoint path, or "" when the file is missing
#   LOAD_FROM (written) - "--load_from <path>", or "" when the file is missing
#   EPOCH_NUM (written) - epoch number parsed from the file name, if present
# Arguments:
#   $1 - path of the checkpoint file to check
# Outputs:
#   Human-readable status lines on stdout.
# Returns:
#   0 always; a missing checkpoint is reported but is not fatal.
#######################################
check_resume_checkpoint() {
  local ckpt_path=$1

  if [[ ! -f "$ckpt_path" ]]; then
    # Report the path BEFORE clearing the globals, so the message actually
    # names the file that was not found.
    echo "❌ 未找到checkpoint文件: $ckpt_path"
    echo "⚠️ 将从头开始训练"
    LOAD_FROM=""
    LOAD_CKPT=""
    return 0
  fi

  # The full checkpoint (all modules) is loaded.
  LOAD_CKPT="$ckpt_path"
  # Same flag spelling as the launch command (--load_from).
  LOAD_FROM="--load_from $LOAD_CKPT"
  echo "🔄 Resume训练加载完整checkpoint"
  echo "📄 加载: 所有模块 (骨干网络 + BEV特征 + 检测头 + 分割头)"
  # NOTE(review): the epoch in this message is hard-coded and may not match
  # the epoch encoded in the checkpoint file name -- confirm.
  echo "📄 目的: 从Epoch 3继续训练"
  echo "📄 Checkpoint: $LOAD_CKPT"
  echo "📊 Checkpoint详情:"
  ls -lh -- "$LOAD_CKPT"
  # Report the epoch number embedded in the file name (epoch_<N>), if any.
  if [[ "$LOAD_CKPT" =~ epoch_([0-9]+) ]]; then
    EPOCH_NUM="${BASH_REMATCH[1]}"
    echo "📈 Checkpoint来自Epoch: $EPOCH_NUM"
  fi
  # Modification time, truncated to whole seconds.
  echo "🕒 修改时间: $(stat -c '%y' "$LOAD_CKPT" | cut -d'.' -f1)"
  return 0
}

echo ""
echo "=== Checkpoint检查 (Resume训练) ==="
check_resume_checkpoint "/workspace/bevfusion/runs/run-4c8ec7e5-f3215f6d/epoch_1.pth"
# Launch training in the background, detached from the terminal via nohup,
# with stdout and stderr captured in $WORK_DIR/train.log.
echo ""
echo "🎯 Starting training..."

# Assemble the trainer arguments as an array so each value stays one word.
# The config path comes from CONFIG_FILE rather than a second hard-coded copy.
train_args=("$CONFIG_FILE")
# Pass --load_from only when a checkpoint was found; otherwise an empty-string
# argument would be handed to the trainer.
if [[ -n "${LOAD_CKPT:-}" ]]; then
  train_args+=(--load_from "$LOAD_CKPT")
fi
train_args+=(
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)

nohup torchpack dist-run \
  -np "$GPUS" \
  /opt/conda/bin/python tools/train.py \
  "${train_args[@]}" \
  > "$WORK_DIR/train.log" 2>&1 &
# Capture the PID of the background job that was just started.
TRAIN_PID=$!
#######################################
# Print the post-launch summary: log location, PID and monitoring commands.
# Training runs in the background, so this reports a launch, not a completion.
# Globals:
#   WORK_DIR  (read) - directory holding train.log
#   TRAIN_PID (read) - PID of the background training job
# Outputs:
#   Summary text on stdout.
#######################################
print_launch_summary() {
  cat <<EOF

🚀 Phase 4B training launched in background (not finished yet)
📊 Check results in: $WORK_DIR
📈 Monitor progress: tail -f $WORK_DIR/train.log

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $WORK_DIR/train.log

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $WORK_DIR/train.log

2. 查看关键指标:
 tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps aux | grep $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF
}

print_launch_summary