bev-project/START_PHASE4B_RMTPPAD_SEGME...

127 lines
4.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# BEVFusion Phase 4B: RMT-PPAD segmentation training (resume).
# Integrates the RMT-PPAD Transformer segmentation decoder and restarts
# training from the latest checkpoint (includes numerical-stability fixes).

# --- Script configuration ---------------------------------------------------
# NOTE(review): PORT is only echoed in the visible part of this script; it is
# not passed to the launcher — confirm whether it is still needed.
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="runs/phase4b_rmtppad_segmentation"
GPUS=8
PORT=29501

# --- Startup banner ---------------------------------------------------------
# One printf argument per output line; the trailing empty argument produces
# the blank separator line.
printf '%s\n' \
  "🚀 Starting BEVFusion Phase 4B: RMT-PPAD Segmentation Integration" \
  "📁 Config: $CONFIG_FILE" \
  "💾 Work Dir: $WORK_DIR" \
  "🎮 GPUs: $GPUS" \
  "🔌 Port: $PORT" \
  ""

# --- Working directory ------------------------------------------------------
# The training log is redirected into this directory, so it must exist first.
mkdir -p "$WORK_DIR"
# Key step: environment setup (modelled on a previously working launch script).
# Prepends the conda toolchain, the torch/conda/CUDA shared-library
# directories and the project root to the respective search paths.
export PATH="/opt/conda/bin:$PATH"
# ${VAR:+:$VAR} appends the inherited value only when it is non-empty. A plain
# ":$VAR" would leave a trailing ':' when the variable is unset, i.e. an empty
# search-path entry, which the dynamic loader treats as the current directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64
# Select the GPUs visible to the training processes.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Verify the runtime environment before launching anything.
echo ""
echo "=== 环境验证 ==="

#######################################
# Import a Python module with the conda interpreter and print its version.
# Exits the script with status 1 if the import fails.
# Arguments: $1 - module name to import
#            $2 - label used in the success / failure messages
# Outputs:   version line (from Python) or failure message on stdout
#######################################
require_python_module() {
  local module=$1 label=$2
  /opt/conda/bin/python -c "import ${module}; print('✅ ${label}:', ${module}.__version__)" || {
    echo "❌ ${label}导入失败"
    exit 1
  }
}

require_python_module torch PyTorch
require_python_module mmcv mmcv

# `command -v` is a shell builtin, so the lookup does not depend on the
# external `which` utility being installed. Like the original check, it
# prints the resolved path on success.
command -v torchpack || {
  echo "❌ torchpack未找到"
  exit 1
}
echo "✅ torchpack: $(command -v torchpack)"
# Resume training: locate the latest full checkpoint.
# Sets LOAD_CKPT (checkpoint path, or empty when the file is missing) and
# LOAD_FROM (a ready-made "--load-from <path>" string, or empty).
echo ""
echo "=== Checkpoint检查 (Resume训练) ==="
LOAD_CKPT="/workspace/bevfusion/runs/run-4c8ec7e5-f3215f6d/epoch_1.pth"
if [[ -f "$LOAD_CKPT" ]]; then
  # Resume path: the checkpoint is loaded in full, including all modules.
  # NOTE(review): LOAD_FROM is not referenced anywhere else in the visible
  # script (the launch command passes LOAD_CKPT via --load_from); it is kept
  # unchanged for compatibility — confirm whether it can be removed.
  LOAD_FROM="--load-from $LOAD_CKPT"
  echo "🔄 Resume训练加载完整checkpoint"
  echo "📄 加载: 所有模块 (骨干网络 + BEV特征 + 检测头 + 分割头)"
  # NOTE(review): the message below says "Epoch 3" while the checkpoint file
  # is epoch_1.pth — confirm which one is intended.
  echo "📄 目的: 从Epoch 3继续训练"
  echo "📄 Checkpoint: $LOAD_CKPT"
  echo "📊 Checkpoint详情:"
  ls -lh "$LOAD_CKPT"
  # Derive the epoch number from the file name (epoch_<N>...).
  if [[ "$LOAD_CKPT" =~ epoch_([0-9]+) ]]; then
    EPOCH_NUM="${BASH_REMATCH[1]}"
    echo "📈 Checkpoint来自Epoch: $EPOCH_NUM"
  fi
  # Modification time without the fractional-seconds suffix.
  echo "🕒 修改时间: $(stat -c '%y' "$LOAD_CKPT" | cut -d'.' -f1)"
else
  # Report the missing path BEFORE clearing LOAD_CKPT; clearing it first
  # made this message always print an empty path.
  echo "❌ 未找到checkpoint文件: $LOAD_CKPT"
  echo "⚠️ 将从头开始训练"
  LOAD_FROM=""
  LOAD_CKPT=""
fi
# Launch training in the background.
# - The process count and config path come from GPUS / CONFIG_FILE, so the
#   command runs what the startup banner reports.
# - stdout and stderr are both captured in $WORK_DIR/train.log; nohup keeps
#   the job alive after the launching shell exits.
# NOTE(review): when no checkpoint was found LOAD_CKPT is empty and
# `--load_from ""` is still passed, exactly as before — confirm that
# tools/train.py treats an empty value as "no checkpoint".
echo ""
echo "🎯 Starting training..."
nohup torchpack dist-run \
  -np "$GPUS" \
  /opt/conda/bin/python tools/train.py \
  "$CONFIG_FILE" \
  --load_from "$LOAD_CKPT" \
  --data.samples_per_gpu 1 \
  --data.workers_per_gpu 0 \
  > "$WORK_DIR/train.log" 2>&1 &
# Record the PID of the background job. (This line used to be a comment
# without its leading '#', so bash tried to execute it as a command.)
TRAIN_PID=$!
# Post-launch summary: PID, log location and a list of monitoring commands.
# Training was only *started* in the background at this point, so the first
# message reports a launch rather than completion.
# Unquoted here-doc delimiter: $WORK_DIR and $TRAIN_PID are expanded.
cat <<EOF

✅ Phase 4B training launched in background (not finished yet)
📊 Check results in: $WORK_DIR
📈 Monitor progress: tail -f $WORK_DIR/train.log

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $WORK_DIR/train.log

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $WORK_DIR/train.log

2. 查看关键指标:
 tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps aux | grep $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF