bev-project/archive/scripts_old/START_PHASE4A_SHARED_GCA.sh

141 lines
4.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4A Stage 1 - launch script for GCA training on the shared BEV layer
# Date: 2025-11-06
# Features:
#   1. GCA on the shared BEV layer (benefits both detection and segmentation)
#   2. Validation samples reduced by 50% (6019 -> 3010)
#   3. Lower evaluation frequency (interval=10)
#   4. Training continues from epoch_5

# Stop at the first command that returns a non-zero status.
set -e

# Opening banner: title framed by two identical rules.
banner_rule="=============================================="
printf '%s\n' \
  "$banner_rule" \
  "Phase 4A Stage 1 - 共享BEV层GCA优化版" \
  "$banner_rule"
# Environment check: the workspace directory must exist before we enter it.
printf '\n'
printf '%s\n' "=== 1. 环境检查 ==="
[ -d "/workspace/bevfusion" ] || {
  echo "❌ 错误: /workspace/bevfusion 不存在"
  exit 1
}
# All relative paths used later in the script are resolved from here.
cd /workspace/bevfusion
# Disk space check: report the free space on /workspace and ask for
# confirmation before continuing when less than 30 GB is available.
echo ""
echo "=== 2. 磁盘空间检查 ==="

#######################################
# Extracts the available space from `df -Pk` output.
# Inputs:  stdin - output of `df -Pk <path>` (header line + one data row)
# Outputs: the "Available" column of the data row converted from KiB to
#          whole GiB (truncated); nothing when there is no data row
#######################################
df_available_gb() {
  awk 'NR == 2 { print int($4 / 1024 / 1024) }'
}

# -P keeps every filesystem on a single line (plain df may wrap long device
# names, which shifts the columns); -k pins the unit to 1024-byte blocks
# regardless of BLOCKSIZE/POSIXLY_CORRECT in the environment.
AVAIL_GB=$(df -Pk /workspace | df_available_gb)

if [[ "$AVAIL_GB" =~ ^[0-9]+$ ]]; then
  echo "可用空间: ${AVAIL_GB}GB"
  if [ "$AVAIL_GB" -lt 30 ]; then
    echo "⚠️ 警告: 磁盘空间不足30GB"
    echo "建议: 清理.eval_hook缓存"
    echo "执行: rm -rf /workspace/bevfusion/runs/*/.eval_hook/"
    # Single-keystroke answer; anything other than y/Y aborts the launch.
    read -p "是否继续? (y/n) " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
      exit 1
    fi
  fi
else
  # df failed or produced unexpected output. The check stays advisory (as it
  # was before), but the reason is now stated instead of surfacing as a
  # cryptic "integer expression expected" error.
  echo "⚠️ 警告: 无法获取 /workspace 的可用空间, 跳过磁盘空间检查" >&2
fi
# Checkpoint check: prefer epoch_5.pth; when it is missing, fall back to the
# most recently modified epoch_*.pth in the same run directory, and abort if
# there is none.
echo ""
echo "=== 3. Checkpoint检查 ==="
CKPT_DIR="/workspace/bevfusion/runs/run-326653dc-2334d461"
LATEST_CKPT="$CKPT_DIR/epoch_5.pth"
if [ ! -f "$LATEST_CKPT" ]; then
  echo "❌ 错误: 未找到 epoch_5.pth"
  echo "尝试查找最新checkpoint..."
  # Pick the newest file by modification time with a glob and -nt instead of
  # parsing `ls -t` output, which is fragile with unusual file names.
  LATEST_CKPT=""
  for candidate in "$CKPT_DIR"/epoch_*.pth; do
    # An unmatched glob stays literal, and dangling links are useless as
    # checkpoints: skip anything that does not exist.
    [ -e "$candidate" ] || continue
    if [[ -z "$LATEST_CKPT" || "$candidate" -nt "$LATEST_CKPT" ]]; then
      LATEST_CKPT=$candidate
    fi
  done
  if [ -z "$LATEST_CKPT" ]; then
    echo "❌ 未找到任何checkpoint"
    exit 1
  fi
fi
echo "✅ 使用checkpoint: $LATEST_CKPT"
# Print the training configuration summary. The text is static, so it is
# emitted from a single quoted here-document (no expansion takes place).
cat <<'EOF'

=== 4. 训练配置摘要 ===
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
配置文件: multitask_BEV2X_phase4a_stage1_gca.yaml
输出目录: /data/runs/phase4a_stage1_gca
起始epoch: 5
目标epoch: 20
剩余epochs: 15
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
学习率: 2.0e-5
BEV分辨率: 360×360 → 600×600
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✨ 新特性:
 1. 共享BEV层GCA (in_channels=512, reduction=4)
 位置: Decoder Neck之后任务头之前
 受益: 检测头 ✅ + 分割头 ✅

 2. Validation样本优化
 原样本: 6,019个
 新样本: 3,010个 (load_interval=2)
 减少: 50%

 3. Evaluation频率优化
 原频率: 每5 epochs
 新频率: 每10 epochs
 减少: 50%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF
# GPU check: query nvidia-smi for the index, name, total memory and free
# memory fields, printed as CSV without a header row.
printf '\n%s\n' "=== 5. GPU状态 ==="
gpu_fields="index,name,memory.total,memory.free"
nvidia-smi --query-gpu="$gpu_fields" --format=csv,noheader
# Preventive cleanup of old .eval_hook cache directories.
echo ""
echo "=== 6. 清理旧缓存 ==="

#######################################
# Removes every directory named ".eval_hook" below a root directory and
# reports the outcome.
# Arguments: $1 - root of the directory tree to scan
# Outputs:   progress messages on stdout, a warning on stderr if removal fails
# Returns:   always 0 - the cleanup is best-effort and never aborts the launch
#######################################
purge_eval_hooks() {
  local runs_root=$1
  local hook_count

  # -prune stops find from descending into a matched directory, so each cache
  # directory is counted (and later removed) exactly once. A missing root
  # simply yields a count of 0.
  hook_count=$(find "$runs_root" -type d -name ".eval_hook" -prune -print 2>/dev/null | wc -l)
  hook_count=$((hook_count)) # normalise any padding some wc builds emit

  if [ "$hook_count" -gt 0 ]; then
    echo "发现 $hook_count 个.eval_hook目录正在删除..."
    # Without -prune, find would try to enter the directory it has just
    # deleted and fail with "No such file or directory". With it, any error
    # reported here is a real one, so it is no longer silenced.
    if find "$runs_root" -type d -name ".eval_hook" -prune -exec rm -rf -- {} +; then
      echo "✅ 已清理"
    else
      echo "⚠️ 警告: 清理.eval_hook时出现错误" >&2
    fi
  else
    echo "✅ 无需清理"
  fi
  return 0
}

purge_eval_hooks /workspace/bevfusion/runs
# Final confirmation: show what is about to run and require an explicit
# y/Y keystroke before the training is launched.
printf '%s\n' \
  "" \
  "=== 7. 准备启动训练 ===" \
  "即将开始训练,剩余 15 epochs (epoch 6-20)" \
  "预计完成时间: ~7天 (FP32)" \
  "" \
  "架构亮点:" \
  " ✨ 共享BEV层GCA - 检测和分割都用全局增强的BEV特征" \
  " 📉 Evaluation开销减少75%" \
  " 🎯 目标: Divider Dice < 0.45, mAP > 0.69" \
  ""
read -p "确认启动? (y/n) " -n 1 -r
echo
case "$REPLY" in
  [Yy])
    # Confirmed: fall through to the launch section.
    ;;
  *)
    echo "❌ 用户取消"
    exit 1
    ;;
esac
# Launch the training run and print a start and an end timestamp around it.
printf '\n%s\n' "=== 8. 启动训练 ==="
printf '开始时间: %s\n' "$(date)"
printf '\n'

# The command is assembled as an array: one argument per element, nothing is
# re-split by the shell.
# NOTE(review): --load_from and --resume-from both receive the same
# checkpoint; how the two interact is decided inside tools/train.py, which is
# not visible from this script - confirm that passing both is intended.
train_cmd=(
  torchpack dist-run
  -np 8
  python tools/train.py
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_gca.yaml
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
  --load_from "$LATEST_CKPT"
  --resume-from "$LATEST_CKPT"
)
"${train_cmd[@]}"

printf '\n%s\n' "=== 训练完成 ==="
printf '结束时间: %s\n' "$(date)"