#!/bin/bash
#
# Phase 4A Stage 1 - launch script for training with GCA on the shared BEV layer
# Date: 2025-11-06
# Features:
#   1. GCA on the shared BEV layer (benefits both detection and segmentation)
#   2. Validation samples reduced by 50% (6019 -> 3010)
#   3. Lower evaluation frequency (interval=10)
#   4. Training continues from epoch_5

# Abort as soon as any command outside a condition returns non-zero.
set -e

# Title banner: the stage name framed by two identical rule lines.
banner_rule="=============================================="
printf '%s\n' \
  "$banner_rule" \
  "Phase 4A Stage 1 - 共享BEV层GCA优化版" \
  "$banner_rule"

# Step 1 - environment check: the project checkout must exist; every later
# step runs from inside it.
project_root="/workspace/bevfusion"

printf '%s\n' "" "=== 1. 环境检查 ==="
[[ -d "$project_root" ]] || {
  echo "❌ 错误: ${project_root} 不存在"
  exit 1
}

cd "$project_root"

# Step 2 - disk space check: warn and ask for confirmation when less than
# 30GB is available on the filesystem holding /workspace.
echo ""
echo "=== 2. 磁盘空间检查 ==="

#######################################
# Print the free space of the filesystem containing a path, in whole GiB.
# Arguments: $1 - path located on the filesystem to inspect
# Outputs:   the integer GiB count on stdout; nothing if df produced no data
#######################################
available_gb() {
  # -P keeps each filesystem on a single line (long device names are not
  # wrapped) and -k forces 1024-byte units regardless of BLOCKSIZE or
  # POSIXLY_CORRECT, so field 4 of line 2 is always "available KiB".
  df -Pk "$1" | awk 'NR == 2 { print int($4 / 1024 / 1024) }'
}

AVAIL_GB=$(available_gb /workspace)

if [[ ! "$AVAIL_GB" =~ ^[0-9]+$ ]]; then
  # df failed or printed something unexpected. Say so explicitly instead of
  # letting a broken integer comparison skip the check without a trace.
  echo "⚠️ 警告: 无法获取 /workspace 的可用空间,跳过磁盘空间检查" >&2
else
  echo "可用空间: ${AVAIL_GB}GB"

  if (( AVAIL_GB < 30 )); then
    echo "⚠️ 警告: 磁盘空间不足30GB"
    echo "建议: 清理.eval_hook缓存"
    echo "执行: rm -rf /workspace/bevfusion/runs/*/.eval_hook/"
    # Single-keypress answer; anything other than y/Y aborts the launch.
    read -p "是否继续? (y/n) " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
      exit 1
    fi
  fi
fi

# Step 3 - checkpoint check: prefer epoch_5.pth of the previous run; if it is
# missing, fall back to the most recently modified epoch_*.pth in that run.
echo ""
echo "=== 3. Checkpoint检查 ==="

CKPT_DIR="/workspace/bevfusion/runs/run-326653dc-2334d461"
LATEST_CKPT="${CKPT_DIR}/epoch_5.pth"

#######################################
# Print the most recently modified epoch_*.pth file in a directory.
# Arguments: $1 - directory to search
# Outputs:   path of the newest checkpoint on stdout; nothing if none exist
#######################################
newest_checkpoint() {
  local candidate newest=""
  # Glob instead of parsing `ls -t`, so unusual file names cannot be split
  # or mangled; an unmatched pattern is skipped by the -f test.
  for candidate in "$1"/epoch_*.pth; do
    [[ -f "$candidate" ]] || continue
    if [[ -z "$newest" || "$candidate" -nt "$newest" ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

if [ ! -f "$LATEST_CKPT" ]; then
  echo "❌ 错误: 未找到 epoch_5.pth"
  echo "尝试查找最新checkpoint..."
  LATEST_CKPT=$(newest_checkpoint "$CKPT_DIR")
  if [ -z "$LATEST_CKPT" ]; then
    echo "❌ 未找到任何checkpoint"
    exit 1
  fi
fi
echo "✅ 使用checkpoint: $LATEST_CKPT"

# Step 4 - print a static summary of the training configuration. All values
# below are fixed text; nothing is read from the YAML config.
summary_rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
printf '%s\n' \
  "" \
  "=== 4. 训练配置摘要 ===" \
  "$summary_rule" \
  "配置文件: multitask_BEV2X_phase4a_stage1_gca.yaml" \
  "输出目录: /data/runs/phase4a_stage1_gca" \
  "起始epoch: 5" \
  "目标epoch: 20" \
  "剩余epochs: 15" \
  "$summary_rule" \
  "学习率: 2.0e-5" \
  "BEV分辨率: 360×360 → 600×600" \
  "$summary_rule" \
  "✨ 新特性:" \
  " 1. 共享BEV层GCA (in_channels=512, reduction=4)" \
  " 位置: Decoder Neck之后,任务头之前" \
  " 受益: 检测头 ✅ + 分割头 ✅" \
  "" \
  " 2. Validation样本优化" \
  " 原样本: 6,019个" \
  " 新样本: 3,010个 (load_interval=2)" \
  " 减少: 50%" \
  "" \
  " 3. Evaluation频率优化" \
  " 原频率: 每5 epochs" \
  " 新频率: 每10 epochs" \
  " 减少: 50%" \
  "$summary_rule"

# Step 5 - GPU status: one CSV row per GPU with its index, name, total
# memory and free memory (no header row).
printf '%s\n' "" "=== 5. GPU状态 ==="
gpu_fields="index,name,memory.total,memory.free"
nvidia-smi --query-gpu="$gpu_fields" --format=csv,noheader

# Step 6 - preventive cleanup: remove leftover .eval_hook directories under
# the runs directory before training starts.
echo ""
echo "=== 6. 清理旧缓存 ==="

#######################################
# Count the top-most .eval_hook directories below a root directory.
# Arguments: $1 - root directory to search
# Outputs:   the count on stdout (0 when the root does not exist)
#######################################
count_eval_hook_dirs() {
  # stderr is dropped on purpose: a missing root simply means nothing to clean.
  find "$1" -type d -name ".eval_hook" -prune -print 2>/dev/null | wc -l
}

#######################################
# Delete every top-most .eval_hook directory below a root directory.
# Arguments: $1 - root directory to search
# Returns:   non-zero if find or any rm invocation failed
#######################################
remove_eval_hook_dirs() {
  # -prune stops find from descending into a directory that is about to be
  # deleted; without it find reports errors for the paths it just removed.
  find "$1" -type d -name ".eval_hook" -prune -exec rm -rf -- {} +
}

EVAL_HOOK_COUNT=$(count_eval_hook_dirs /workspace/bevfusion/runs)
if [ "$EVAL_HOOK_COUNT" -gt 0 ]; then
  echo "发现 $EVAL_HOOK_COUNT 个.eval_hook目录,正在删除..."
  # Cleanup stays best-effort (a failure never aborts the launch), but a
  # failure is reported instead of being announced as a success.
  if remove_eval_hook_dirs /workspace/bevfusion/runs; then
    echo "✅ 已清理"
  else
    echo "⚠️ 警告: 部分.eval_hook目录未能删除" >&2
  fi
else
  echo "✅ 无需清理"
fi

# Step 7 - final confirmation: recap what is about to run and require an
# explicit y/Y keypress before training is launched.
printf '%s\n' \
  "" \
  "=== 7. 准备启动训练 ===" \
  "即将开始训练,剩余 15 epochs (epoch 6-20)" \
  "预计完成时间: ~7天 (FP32)" \
  "" \
  "架构亮点:" \
  " ✨ 共享BEV层GCA - 检测和分割都用全局增强的BEV特征" \
  " 📉 Evaluation开销减少75%" \
  " 🎯 目标: Divider Dice < 0.45, mAP > 0.69" \
  ""

# Read exactly one character; the bare echo ends the prompt line afterwards.
read -p "确认启动? (y/n) " -n 1 -r
echo
case "$REPLY" in
  [Yy]) ;;
  *)
    echo "❌ 用户取消"
    exit 1
    ;;
esac

# Step 8 - launch distributed training on 8 processes via torchpack, using
# the checkpoint selected earlier in this script.
printf '%s\n' "" "=== 8. 启动训练 ==="
printf '开始时间: %s\n' "$(date)"
echo ""

# NOTE(review): the same checkpoint is passed as both --load_from and
# --resume-from; confirm that tools/train.py expects both options together.
# NOTE(review): the summary printed earlier names /data/runs/phase4a_stage1_gca
# as the output directory, but no run-directory option is passed here -
# presumably it is configured elsewhere; verify.
train_cmd=(
  torchpack dist-run
  -np 8
  python tools/train.py
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_gca.yaml
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
  --load_from "$LATEST_CKPT"
  --resume-from "$LATEST_CKPT"
)
"${train_cmd[@]}"

# Reached only when the training command succeeded (the script runs under
# `set -e`).
printf '%s\n' "" "=== 训练完成 ==="
printf '结束时间: %s\n' "$(date)"