#!/bin/bash
# 🚀 BEVFusion Phase 4B: RMT-PPAD Segmentation Training (Resume)
#
# Integrates the RMT-PPAD Transformer segmentation decoder and restarts
# training from the latest checkpoint (includes numerical-stability fixes).
# Paths below are relative, so run this script from the repository root.

# Script configuration
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="runs/phase4b_rmtppad_segmentation"
GPUS=8
# NOTE(review): PORT is only echoed by this script; nothing visible here
# passes it to the launcher. Confirm whether it is still needed.
PORT=29501

echo "🚀 Starting BEVFusion Phase 4B: RMT-PPAD Segmentation Integration"
echo "📁 Config: $CONFIG_FILE"
echo "💾 Work Dir: $WORK_DIR"
echo "🎮 GPUs: $GPUS"
echo "🔌 Port: $PORT"
echo ""

# Create the work directory. Abort on failure: the training log is
# redirected into this directory later on.
mkdir -p -- "$WORK_DIR" || {
  echo "❌ 无法创建工作目录: $WORK_DIR" >&2
  exit 1
}

# ✅ Key step: set up environment variables (modeled on a known-good script)
export PATH="/opt/conda/bin:$PATH"
# Append the previous value only when it is non-empty. A bare trailing ':'
# would add an empty entry, which the dynamic linker treats as the current
# working directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Same guard for PYTHONPATH, to avoid a dangling ':' when it was unset.
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64

# Select the GPUs visible to the training processes
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Verify the environment before launching anything expensive.
# Each probe aborts the script with exit status 1 on failure.
echo ""
echo "=== 环境验证 ==="

# PyTorch must be importable by the conda interpreter used for training
if ! /opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)"; then
  echo "❌ PyTorch导入失败" >&2
  exit 1
fi

# mmcv must be importable as well
if ! /opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)"; then
  echo "❌ mmcv导入失败" >&2
  exit 1
fi

# Resolve the torchpack launcher once via the shell builtin, so the path is
# printed a single time.
if ! TORCHPACK_BIN=$(command -v torchpack); then
  echo "❌ torchpack未找到" >&2
  exit 1
fi
echo "✅ torchpack: $TORCHPACK_BIN"

# 🔄 Resume training: load the latest full checkpoint.
# Sets LOAD_CKPT (checkpoint path, or empty when the file is missing) and
# LOAD_FROM (the matching command-line option string, or empty).
echo ""
echo "=== Checkpoint检查 (Resume训练) ==="
LOAD_CKPT="/workspace/bevfusion/runs/run-4c8ec7e5-f3215f6d/epoch_1.pth"
if [[ -f "$LOAD_CKPT" ]]; then
  # Resume: load the complete checkpoint, including all modules.
  # The option is spelled `--load_from`, matching the launch command in this
  # script.
  LOAD_FROM="--load_from $LOAD_CKPT"
  echo "🔄 Resume训练:加载完整checkpoint"
  echo "📄 加载: 所有模块 (骨干网络 + BEV特征 + 检测头 + 分割头)"
  # NOTE(review): "Epoch 3" is hard-coded while the checkpoint path above
  # names epoch_1 — confirm which is intended.
  echo "📄 目的: 从Epoch 3继续训练"
  echo "📄 Checkpoint: $LOAD_CKPT"
  echo "📊 Checkpoint详情:"
  ls -lh -- "$LOAD_CKPT"

  # Report the epoch number encoded in the checkpoint file name, if any
  if [[ "$LOAD_CKPT" =~ epoch_([0-9]+) ]]; then
    EPOCH_NUM="${BASH_REMATCH[1]}"
    echo "📈 Checkpoint来自Epoch: $EPOCH_NUM"
  fi
  # Modification time, with the fractional seconds cut off
  echo "🕒 修改时间: $(stat -c '%y' "$LOAD_CKPT" | cut -d'.' -f1)"
else
  # Print the missing path BEFORE clearing the variable, so the message
  # shows which file was expected.
  echo "❌ 未找到checkpoint文件: $LOAD_CKPT"
  echo "⚠️ 将从头开始训练"
  LOAD_FROM=""
  LOAD_CKPT=""
fi

# Launch training in the background, detached from the terminal via nohup.
# Reads: CONFIG_FILE, GPUS, WORK_DIR, LOAD_CKPT. Writes: TRAIN_PID.
echo ""
echo "🎯 Starting training..."

# Build the trainer arguments as an array so the checkpoint option can be
# left out entirely when no checkpoint was found, instead of passing
# `--load_from ""`.
TRAIN_ARGS=("${CONFIG_FILE:?CONFIG_FILE is not set}")
if [[ -n "${LOAD_CKPT:-}" ]]; then
  TRAIN_ARGS+=(--load_from "$LOAD_CKPT")
fi
TRAIN_ARGS+=(
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)

# The process count follows GPUS, so the banner and the actual launch cannot
# drift apart; it falls back to 8 when GPUS is unset.
nohup torchpack dist-run \
  -np "${GPUS:-8}" \
  /opt/conda/bin/python tools/train.py \
  "${TRAIN_ARGS[@]}" \
  > "${WORK_DIR:?WORK_DIR is not set}/train.log" 2>&1 &

# Capture the PID of the background job that was just started
TRAIN_PID=$!
# Print a post-launch summary: where results and logs go, the background PID,
# and commands for monitoring or stopping the run. The job has only been
# started at this point, so the first line says "launched", not "completed".
# The here-document is unquoted on purpose so $WORK_DIR and $TRAIN_PID expand.
cat <<EOF

✅ Phase 4B training launched (running in background)
📊 Check results in: $WORK_DIR
📈 Monitor progress: tail -f $WORK_DIR/train.log

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $WORK_DIR/train.log

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $WORK_DIR/train.log

2. 查看关键指标:
 tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps aux | grep $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF