bev-project/archive_scripts/start_phase4a_bev2x.sh

95 lines
2.8 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4A (BEV 2x) training launcher.
# This first section only prints the start-up banner. Every value shown is
# fixed display text; nothing printed here is read from the training config.
banner_rule="========================================================================"
printf '%s\n' \
  "$banner_rule" \
  "Phase 4A: BEV 2x训练启动" \
  "$banner_rule" \
  "配置: BEV分辨率0.15m (2倍提升)" \
  "Decoder: 4层完整版 [256, 256, 128, 128]" \
  "基础模型: epoch_23.pth" \
  "目标: 20 epochs" \
  "$banner_rule"
# Pre-flight checks. Each failure is reported on stderr (so it is not lost
# when stdout is redirected or piped) and aborts the script with status 1.

# 1. The base checkpoint must exist.
if [[ ! -f "runs/enhanced_from_epoch19/epoch_23.pth" ]]; then
  echo "❌ 错误: epoch_23.pth不存在!" >&2
  exit 1
fi
echo "✅ Checkpoint存在: epoch_23.pth"

# 2. The training config file must exist.
CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml"
if [[ ! -f "$CONFIG" ]]; then
  echo "❌ 错误: 配置文件不存在!" >&2
  exit 1
fi
echo "✅ 配置文件存在"

# 3. Create the run directory. Success is only reported when mkdir actually
#    succeeded (e.g. not on a read-only or permission-denied path).
if ! mkdir -p runs/phase4a_bev2x; then
  echo "❌ 错误: 无法创建运行目录 runs/phase4a_bev2x!" >&2
  exit 1
fi
echo "✅ 运行目录创建"
# 4. GPU overview: one CSV row per GPU (index, name, total and free memory),
#    without a header line. Informational only; the result is not checked.
printf '\n%s\n' "GPU状态:"
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv,noheader
printf '\n'

# 5. Print the run summary and ask for confirmation.
#    The figures below are fixed display text, not values read from the config.
summary_rule="========================================================================"
printf '%s\n' \
  "$summary_rule" \
  "即将启动训练,配置信息:" \
  " - GPU数量: 6" \
  " - Batch size: 1 per GPU" \
  " - Workers: 0" \
  " - 学习率: 2e-5" \
  " - Epochs: 20" \
  " - 预计时间: ~12.5天" \
  " - 预计显存: ~28-29GB/GPU" \
  "$summary_rule" \
  ""

# Read a single keypress (-n 1) without backslash processing (-r); the bare
# echo afterwards ends the prompt line.
read -p "确认启动? (y/n) " -n 1 -r
echo
# Only a lone "y" or "Y" continues; any other reply (including none) cancels
# with exit status 0.
case "$REPLY" in
  [Yy]) ;;
  *)
    echo "取消启动"
    exit 0
    ;;
esac
# 6. Launch training in the background under nohup, with stdout and stderr
#    both redirected to phase4a_bev2x.log (the log is truncated on each run).
echo ""
echo "🚀 启动训练..."
echo ""
# CONFIG is the path already validated by the config-file check earlier in
# this script; reusing it keeps the checked file and the launched file the same.
nohup torchpack dist-run -np 6 python tools/train.py \
  "$CONFIG" \
  --model.encoders.camera.backbone.init_cfg.checkpoint=pretrained/swint-nuimages-pretrained.pth \
  --load_from runs/enhanced_from_epoch19/epoch_23.pth \
  --run-dir runs/phase4a_bev2x \
  > phase4a_bev2x.log 2>&1 &
TRAIN_PID=$!

# Do not claim success if the background command died immediately (for
# example, torchpack is not on PATH). Signal 0 only tests whether the PID
# still exists; wait then collects the exit status of the dead job.
sleep 2
if ! kill -0 "$TRAIN_PID" 2>/dev/null; then
  wait "$TRAIN_PID"
  launch_status=$?
  echo "❌ 错误: 训练进程启动后立即退出 (exit code: $launch_status), 请查看 phase4a_bev2x.log" >&2
  exit 1
fi
echo "✅ 训练已启动"
echo ""
echo "========================================================================"
echo "训练信息:"
echo " PID: $TRAIN_PID"
echo " 日志: phase4a_bev2x.log"
echo " 输出: runs/phase4a_bev2x/"
echo ""
echo "监控命令:"
echo " 实时日志: tail -f phase4a_bev2x.log"
echo " GPU状态: watch -n 60 nvidia-smi"
echo " 训练进度: watch -n 300 'tail -30 phase4a_bev2x.log | grep Epoch'"
echo ""
echo "检查命令:"
# ps -p looks up exactly this PID; "ps aux | grep PID" also matches the grep
# process itself and any unrelated line containing the same digits.
echo " 检查进程: ps -p $TRAIN_PID"
echo " 检查checkpoint: ls -lh runs/phase4a_bev2x/*.pth"
echo "========================================================================"
echo ""
# NOTE(review): the completion date below is hard-coded, so it is only
# meaningful for the original launch date — confirm whether it should be
# computed from the current date instead.
echo "Phase 4A训练已开始预计2025年11月11日完成"
echo ""