bev-project/scripts/training/START_PHASE4B_RMTPPAD_SEGME...

111 lines
4.0 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# BEVFusion Phase 4B: RMT-PPAD Segmentation Integration
# Integrates the RMT-PPAD Transformer segmentation decoder.
#
# This section defines the run configuration, prints it, creates the work
# directory and exports the environment used by the rest of the script.

# --- Script configuration ---
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="runs/phase4b_rmtppad_segmentation"
GPUS=8
# NOTE(review): PORT is only echoed by this script; nothing here passes it to
# the launcher. Confirm whether it is still needed.
PORT=29501

echo "🚀 Starting BEVFusion Phase 4B: RMT-PPAD Segmentation Integration"
echo "📁 Config: $CONFIG_FILE"
echo "💾 Work Dir: $WORK_DIR"
echo "🎮 GPUs: $GPUS"
echo "🔌 Port: $PORT"
echo ""

# Create the work directory. The training log is redirected into it later, so
# stop right away if it cannot be created.
mkdir -p -- "$WORK_DIR" || {
  echo "❌ cannot create work dir: $WORK_DIR" >&2
  exit 1
}

# Key step: set environment variables (modeled on a previously working script).
# ${VAR:+:$VAR} appends the old value only when it is non-empty, so an unset
# variable does not leave a trailing ':' (an empty LD_LIBRARY_PATH entry is
# treated as the current directory by the dynamic loader).
export PATH=/opt/conda/bin:$PATH
export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export PYTHONPATH=/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64

# Select the GPUs (eight devices, matching GPUS above)
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# Verify the environment: abort before launching anything if PyTorch, mmcv or
# torchpack is unavailable. Failure messages go to stderr.
echo ""
echo "=== 环境验证 ==="

if ! /opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)"; then
  echo "❌ PyTorch导入失败" >&2
  exit 1
fi

if ! /opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)"; then
  echo "❌ mmcv导入失败" >&2
  exit 1
fi

# `command -v` is a shell builtin, unlike the external `which`; capturing its
# output also means the resolved path is printed once instead of twice.
torchpack_bin=$(command -v torchpack) || {
  echo "❌ torchpack未找到" >&2
  exit 1
}
echo "✅ torchpack: $torchpack_bin"
# Checkpoint check (Phase 4B) followed by the background training launch.
#
# The messages below describe selective loading (backbone + BEV features +
# detection head, segmentation head skipped). This script only hands the
# checkpoint path to train.py; the selection itself presumably happens in the
# training code/config, which is not visible here.
echo ""
echo "=== Checkpoint检查 (Phase 4B) ==="
LOAD_CKPT="/workspace/bevfusion/runs/run-326653dc-74fea435/epoch_2.pth"

# LOAD_FROM holds the extra train.py arguments for the checkpoint. It stays
# empty when the file is missing, so the "train from scratch" message below is
# actually true instead of passing a path that does not exist.
# The flag is spelled --load_from, as in the original launch command.
# NOTE(review): confirm tools/train.py accepts this spelling.
LOAD_FROM=()
if [[ -f "$LOAD_CKPT" ]]; then
  LOAD_FROM=(--load_from "$LOAD_CKPT")
  echo "🔧 Phase 4B: 选择性加载checkpoint"
  echo "📄 加载: 骨干网络 + BEV特征 + 检测头"
  echo "📄 跳过: 分割头 (使用随机初始化)"
  echo "📄 Checkpoint: $LOAD_CKPT"
else
  echo "⚠️ 未找到checkpoint文件从头开始训练"
fi
ls -lh "$LOAD_CKPT" 2>/dev/null || echo "checkpoint文件不存在"

# Launch training
echo ""
echo "🎯 Starting training..."

# Build the command as an array so every argument stays one word. GPUS and
# CONFIG_FILE come from the configuration section at the top of the script
# rather than being repeated here as literals.
train_cmd=(
  torchpack dist-run
  -np "$GPUS"
  /opt/conda/bin/python tools/train.py
  "$CONFIG_FILE"
)
train_cmd+=("${LOAD_FROM[@]}")
train_cmd+=(
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)

# NOTE(review): only train.log is directed to $WORK_DIR; no run directory is
# passed to train.py, so checkpoints may be written elsewhere - confirm.
nohup "${train_cmd[@]}" > "$WORK_DIR/train.log" 2>&1 &

# Capture the PID of the background job
TRAIN_PID=$!
#######################################
# Print the post-launch summary: where the log lives, the background PID and
# a short list of monitoring commands.
# Globals:   WORK_DIR (read), TRAIN_PID (read)
# Arguments: none
# Outputs:   writes the summary to stdout
#######################################
print_launch_summary() {
  local bar_double="══════════════════════════════════════════════════════════"
  local bar_heavy="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"

  # One argument per output line. The job was only just put in the background,
  # so the first message says "launched" rather than "completed".
  printf '%s\n' \
    "" \
    "✅ Phase 4B training launched in the background!" \
    "📊 Check results in: $WORK_DIR" \
    "📈 Monitor progress: tail -f $WORK_DIR/train.log" \
    "" \
    "$bar_double" \
    "✅ 训练已在后台启动!" \
    "$bar_double" \
    "" \
    "进程ID: $TRAIN_PID" \
    "日志文件: $WORK_DIR/train.log" \
    "" \
    "$bar_heavy" \
    "监控命令:" \
    "$bar_heavy" \
    "" \
    "1. 查看实时日志:" \
    " tail -f $WORK_DIR/train.log" \
    "" \
    "2. 查看关键指标:" \
    " tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'" \
    "" \
    "3. 检查进程状态:" \
    " ps -p $TRAIN_PID" \
    "" \
    "4. GPU监控:" \
    " nvidia-smi -l 5" \
    "" \
    "5. 停止训练:" \
    " kill $TRAIN_PID" \
    "" \
    "$bar_heavy"
}

print_launch_summary