#!/bin/bash
#
# Phase 4A launcher for the BEV 2x training run.
#
# Flow: print a banner, run pre-flight checks (checkpoint, config file,
# run directory, GPU status), ask for interactive confirmation, then start
# the training command in the background and print monitoring hints.
#
# All paths used by this script are relative to the current working
# directory, so it has to be started from the directory that contains
# runs/, configs/, tools/ and pretrained/.

# Startup banner describing the run that is about to be launched.
# Quoted delimiter: the text is emitted literally, with no expansion.
cat <<'BANNER'
========================================================================
Phase 4A: BEV 2x训练启动
========================================================================
配置: BEV分辨率0.15m (2倍提升)
Decoder: 4层完整版 [256, 256, 128, 128]
基础模型: epoch_23.pth
目标: 20 epochs
========================================================================
BANNER

# 1. Check the base checkpoint.
# The run is initialised from this file, so abort early when it is missing.
if [[ ! -f "runs/enhanced_from_epoch19/epoch_23.pth" ]]; then
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  echo "❌ 错误: epoch_23.pth不存在!" >&2
  exit 1
fi
echo "✅ Checkpoint存在: epoch_23.pth"

# 2. Check the training config file.
# CONFIG is kept as a script-level variable so later steps can refer to the
# exact path that was validated here.
CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml"
if [[ ! -f "$CONFIG" ]]; then
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  echo "❌ 错误: 配置文件不存在!" >&2
  exit 1
fi
echo "✅ 配置文件存在"

# 3. Create the run directory.
# mkdir -p succeeds when the directory already exists; any other failure
# (permissions, a regular file in the way, ...) must stop the script instead
# of printing a success message for a directory that is not there.
if ! mkdir -p runs/phase4a_bev2x; then
  echo "❌ 错误: 无法创建运行目录 runs/phase4a_bev2x" >&2
  exit 1
fi
echo "✅ 运行目录创建"

# 4. Check the GPUs.
# Prints one CSV line per GPU: index, name, total memory, free memory.
# If nvidia-smi is missing or fails, stop here like the other pre-flight
# checks do, rather than continuing to the confirmation prompt with no GPU
# information.
echo ""
echo "GPU状态:"
if ! nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv,noheader; then
  echo "❌ 错误: nvidia-smi执行失败, 无法获取GPU状态!" >&2
  exit 1
fi
echo ""

# 5. Confirm the launch.
# Show the run parameters, then require an explicit single-key confirmation.
# Quoted delimiter: the text is emitted literally, with no expansion.
cat <<'SUMMARY'
========================================================================
即将启动训练,配置信息:
 - GPU数量: 6
 - Batch size: 1 per GPU
 - Workers: 0
 - 学习率: 2e-5
 - Epochs: 20
 - 预计时间: ~12.5天
 - 预计显存: ~28-29GB/GPU
========================================================================

SUMMARY

# -n 1: return after one keypress; -r: do not treat backslash as an escape.
# The answer is left in the default REPLY variable.
read -p "确认启动? (y/n) " -n 1 -r
echo  # finish the prompt line, since -n 1 returns without a newline
# Anything other than a single y/Y (including EOF on stdin) cancels the run.
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "取消启动"
  exit 0
fi

# 6. Launch the training run.
echo ""
echo "🚀 启动训练..."
echo ""

# Launch the config file that the pre-flight step validated (CONFIG), so the
# checked path and the launched path cannot drift apart. The literal path is
# only a fallback for the case where CONFIG is not set.
train_config="${CONFIG:-configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml}"

# nohup + '&': run in the background, ignoring hangup signals.
# stdout and stderr are both written to phase4a_bev2x.log.
nohup torchpack dist-run -np 6 python tools/train.py \
  "$train_config" \
  --model.encoders.camera.backbone.init_cfg.checkpoint=pretrained/swint-nuimages-pretrained.pth \
  --load_from runs/enhanced_from_epoch19/epoch_23.pth \
  --run-dir runs/phase4a_bev2x \
  > phase4a_bev2x.log 2>&1 &

# PID of the background job started above; used by the summary that follows.
TRAIN_PID=$!

# A command that cannot start (missing binary, bad arguments) exits almost
# immediately. Wait briefly and probe the PID with signal 0 so that failure
# is reported here instead of being announced as a successful launch.
sleep 2
if ! kill -0 "$TRAIN_PID" 2>/dev/null; then
  echo "❌ 错误: 训练进程启动后立即退出, 请查看 phase4a_bev2x.log" >&2
  exit 1
fi

# Post-launch summary: where the run lives and how to monitor it.
# Unquoted delimiter on purpose: $TRAIN_PID (set right after the background
# launch) is expanded; everything else in the text is literal.
# The process check suggests `ps -p PID` rather than `ps aux | grep PID`,
# because the grep form always matches its own command line.
# NOTE(review): the completion date in the last line is hard-coded; it is
# only accurate for the launch it was written for — update it on a re-run.
cat <<SUMMARY
✅ 训练已启动

========================================================================
训练信息:
 PID: $TRAIN_PID
 日志: phase4a_bev2x.log
 输出: runs/phase4a_bev2x/

监控命令:
 实时日志: tail -f phase4a_bev2x.log
 GPU状态: watch -n 60 nvidia-smi
 训练进度: watch -n 300 'tail -30 phase4a_bev2x.log | grep Epoch'

检查命令:
 检查进程: ps -p $TRAIN_PID
 检查checkpoint: ls -lh runs/phase4a_bev2x/*.pth
========================================================================

Phase 4A训练已开始,预计2025年11月11日完成

SUMMARY