111 lines
4.0 KiB
Bash
Executable File
111 lines
4.0 KiB
Bash
Executable File
#!/bin/bash
#
# BEVFusion Phase 4B: RMT-PPAD Segmentation Integration
# Integrates the RMT-PPAD Transformer segmentation decoder.
#
# What this script does, in order:
#   1. prints the run configuration and creates the work directory,
#   2. exports the conda / CUDA / project paths,
#   3. verifies that torch, mmcv and torchpack are usable,
#   4. looks for a pretrained checkpoint and, if present, passes it on,
#   5. launches `torchpack dist-run` in the background under nohup,
#   6. prints the PID, the log location and a few monitoring commands.
#
# tools/train.py and the config file are relative paths, so the script has
# to be started from the directory that contains them.

set -euo pipefail

# Print a message to stderr and abort.
die() {
  echo "$*" >&2
  exit 1
}

# ---------------------------------------------------------------------------
# Script configuration
# ---------------------------------------------------------------------------
CONFIG_FILE="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
WORK_DIR="runs/phase4b_rmtppad_segmentation"
GPUS=8
# NOTE(review): PORT is only echoed below; nothing in this script hands it
# to the launcher. Confirm whether it is still needed.
PORT=29501
PYTHON_BIN="/opt/conda/bin/python"
TRAIN_LOG="$WORK_DIR/train.log"

echo "🚀 Starting BEVFusion Phase 4B: RMT-PPAD Segmentation Integration"
echo "📁 Config: $CONFIG_FILE"
echo "💾 Work Dir: $WORK_DIR"
echo "🎮 GPUs: $GPUS"
echo "🔌 Port: $PORT"
echo ""

# Create the work directory (it holds the training log).
mkdir -p -- "$WORK_DIR"

# ---------------------------------------------------------------------------
# Environment variables (mirrors a previously working launch script)
# ---------------------------------------------------------------------------
export PATH="/opt/conda/bin:$PATH"
# ${VAR:+:$VAR} appends the previous value only when it is non-empty, so an
# unset variable neither trips `set -u` nor leaves a trailing empty entry
# (an empty entry means "current directory" to the loader / interpreter).
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64

# GPUs made visible to the job.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# ---------------------------------------------------------------------------
# Environment verification
# ---------------------------------------------------------------------------
echo ""
echo "=== 环境验证 ==="
"$PYTHON_BIN" -c "import torch; print('✅ PyTorch:', torch.__version__)" \
  || die "❌ PyTorch导入失败"
"$PYTHON_BIN" -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" \
  || die "❌ mmcv导入失败"
torchpack_bin=$(command -v torchpack) || die "❌ torchpack未找到"
echo "✅ torchpack: $torchpack_bin"

# Fail fast here instead of letting the background job die silently in the
# log when the script is started from the wrong directory.
[[ -f tools/train.py ]] \
  || die "❌ tools/train.py not found in $(pwd); run this script from the repository root"
[[ -f "$CONFIG_FILE" ]] \
  || die "❌ Config file not found: $CONFIG_FILE"

# ---------------------------------------------------------------------------
# Assemble the training command
# ---------------------------------------------------------------------------
# Built as an array so every argument stays a single word and the checkpoint
# flag can be added conditionally.
train_cmd=(
  torchpack dist-run
  -np "$GPUS"
  "$PYTHON_BIN" tools/train.py
  "$CONFIG_FILE"
)

# Checkpoint check (Phase 4B). The messages below describe a selective load
# (backbone + BEV features + detection head, segmentation head skipped).
# NOTE(review): this script only passes --load_from; the selective behaviour
# itself is presumably implemented by the config / training code — confirm.
echo ""
echo "=== Checkpoint检查 (Phase 4B) ==="
LOAD_CKPT="/workspace/bevfusion/runs/run-326653dc-74fea435/epoch_2.pth"
if [[ -f "$LOAD_CKPT" ]]; then
  train_cmd+=(--load_from "$LOAD_CKPT")
  echo "🔧 Phase 4B: 选择性加载checkpoint"
  echo "📄 加载: 骨干网络 + BEV特征 + 检测头"
  echo "📄 跳过: 分割头 (使用随机初始化)"
  echo "📄 Checkpoint: $LOAD_CKPT"
else
  # No checkpoint: --load_from is left out so training really starts from
  # scratch, as the message says.
  echo "⚠️ 未找到checkpoint文件,从头开始训练"
fi
# Informational listing only; a missing file is not an error at this point.
ls -lh "$LOAD_CKPT" 2>/dev/null || echo "checkpoint文件不存在"

train_cmd+=(
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)

# ---------------------------------------------------------------------------
# Launch training
# ---------------------------------------------------------------------------
echo ""
echo "🎯 Starting training..."

# nohup + & detaches the job from this shell; stdout and stderr both go to
# the log file.
nohup "${train_cmd[@]}" > "$TRAIN_LOG" 2>&1 &

# Capture the PID of the background job.
TRAIN_PID=$!

echo ""
# The job has only been started at this point, not finished.
echo "✅ Phase 4B training launched in background"
# NOTE(review): only train.log is known to land in WORK_DIR; the command
# above does not tell the trainer where to write its own outputs.
echo "📊 Check results in: $WORK_DIR"
echo "📈 Monitor progress: tail -f $WORK_DIR/train.log"

echo ""
echo "══════════════════════════════════════════════════════════"
echo "✅ 训练已在后台启动!"
echo "══════════════════════════════════════════════════════════"
echo ""
echo "进程ID: $TRAIN_PID"
echo "日志文件: $WORK_DIR/train.log"
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "监控命令:"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "1. 查看实时日志:"
echo " tail -f $WORK_DIR/train.log"
echo ""
echo "2. 查看关键指标:"
echo " tail -f $WORK_DIR/train.log | grep -E 'Epoch|loss/map/divider|loss/object'"
echo ""
echo "3. 检查进程状态:"
echo " ps aux | grep $TRAIN_PID"
echo ""
echo "4. GPU监控:"
echo " nvidia-smi -l 5"
echo ""
echo "5. 停止训练:"
echo " kill $TRAIN_PID"
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"