bev-project/scripts/training/START_PHASE4A_TASK_GCA.sh

202 lines
7.3 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4A Stage 1 - launch script for task-specific GCA training
# Date: 2025-11-06
#
# Core idea: task-specific GCA
# - Detection GCA: selects the detection-optimal features from the 512 channels
# - Segmentation GCA: selects the segmentation-optimal features from the 512 channels
# - Each task takes what it needs, avoiding a compromise between the two
set -e

# Startup banner. A single printf emits every argument on its own line,
# which produces the same output as one echo per line.
printf '%s\n' \
  "══════════════════════════════════════════════════════════" \
  "Phase 4A Stage 1 - Task-specific GCA优化版" \
  "══════════════════════════════════════════════════════════" \
  "" \
  "✨ 架构亮点:" \
  " Decoder Neck → 原始BEV (512通道完整信息)" \
  " ├─ 检测GCA → 检测最优BEV → 检测头 ✅" \
  " └─ 分割GCA → 分割最优BEV → 分割头 ✅" \
  "" \
  " 优势: 每个任务根据自己需求选择特征" \
  " vs Shared GCA: 避免统一选择的折中问题" \
  "══════════════════════════════════════════════════════════"
# Environment setup: require the project checkout, enter it, and export the
# tool/library search paths that the rest of the script relies on.
echo ""
echo "=== 1. 环境配置 (参考Phase 3成功经验) ==="
if [ ! -d "/workspace/bevfusion" ]; then
  echo "❌ 错误: /workspace/bevfusion 不存在"
  exit 1
fi
# The directory can exist and still be inaccessible; fail with a clear message
# instead of relying on `set -e` alone.
cd /workspace/bevfusion || {
  echo "❌ 错误: 无法进入 /workspace/bevfusion"
  exit 1
}
# Key step: environment variables (the original note says this setup was
# validated in Phase 3).
export PATH="/opt/conda/bin:$PATH"
# Append the previous value only when it is non-empty. A bare trailing ':'
# leaves an empty entry, which the dynamic loader interprets as the current
# working directory.
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
# Same guard for PYTHONPATH so no empty path component is introduced.
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
echo "✅ 环境变量已设置:"
echo " PATH: /opt/conda/bin:..."
echo " LD_LIBRARY_PATH: 已设置"
echo " PYTHONPATH: /workspace/bevfusion"
# Environment verification: confirm that torch and mmcv import under the conda
# interpreter and that the torchpack launcher is on PATH. Any failure aborts.
echo ""
echo "=== 2. 环境验证 ==="
/opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)" || {
  echo "❌ PyTorch导入失败"
  exit 1
}
/opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" || {
  echo "❌ mmcv导入失败"
  exit 1
}
# `command -v` is a shell builtin, so the lookup does not depend on the external
# `which` binary being installed; resolve once and reuse the result.
torchpack_path=$(command -v torchpack) || {
  echo "❌ torchpack未找到"
  exit 1
}
echo "✅ torchpack: $torchpack_path"
# Disk space check: warn and ask for confirmation when /workspace has less
# than 30 GB available.
# NOTE(review): this heading reuses section number "2" (also used by the
# environment verification step); left unchanged to keep the output stable.
echo ""
echo "=== 2. 磁盘空间检查 ==="
# -P forces the one-line-per-filesystem POSIX layout (long device names do not
# wrap onto a second line) and -k pins the unit to 1024-byte blocks, so column 4
# of the second line is always the available space in KiB.
AVAIL_GB=$(df -Pk /workspace | awk 'NR == 2 {print int($4/1024/1024)}')
if [[ ! "$AVAIL_GB" =~ ^[0-9]+$ ]]; then
  # df failed or produced unexpected output. Keep the check best-effort, as
  # before, but say so instead of tripping a "[: integer expression expected".
  echo "⚠️ 警告: 无法获取 /workspace 的可用空间, 跳过磁盘空间检查"
else
  echo "可用空间: ${AVAIL_GB}GB"
  if [ "$AVAIL_GB" -lt 30 ]; then
    echo "⚠️ 警告: 磁盘空间不足30GB"
    read -p "是否继续? (y/n) " -n 1 -r
    echo
    if [[ ! $REPLY =~ ^[Yy]$ ]]; then
      exit 1
    fi
  fi
fi
# Checkpoint check: abort unless the hard-coded epoch-5 checkpoint exists as a
# regular file, then show its size. LATEST_CKPT is used again at launch time.
printf '%s\n' "" "=== 3. Checkpoint检查 ==="
LATEST_CKPT="/workspace/bevfusion/runs/run-326653dc-2334d461/epoch_5.pth"
[ -f "$LATEST_CKPT" ] || {
  echo "❌ 错误: 未找到 epoch_5.pth"
  exit 1
}
printf '%s\n' "✅ 使用checkpoint: $LATEST_CKPT"
ls -lh "$LATEST_CKPT"
# Configuration summary: purely informational text, nothing here is read back
# by the script.
# NOTE(review): the output directory printed below is not passed to the
# training command in this script; presumably the YAML config sets it - confirm.
printf '%s\n' \
  "" \
  "=== 4. Task-specific GCA配置摘要 ===" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  "配置文件: multitask_BEV2X_phase4a_stage1_task_gca.yaml" \
  "输出目录: /data/runs/phase4a_stage1_task_gca" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  "" \
  "起始epoch: 5" \
  "目标epoch: 20" \
  "剩余epochs: 15" \
  "" \
  "✨ Task-specific GCA配置:" \
  " 检测GCA (object):" \
  " - in_channels: 512" \
  " - reduction: 4" \
  " - 作用: 选择对检测最优的特征" \
  " - 强化: 物体边界、中心点、空间关系" \
  "" \
  " 分割GCA (map):" \
  " - in_channels: 512" \
  " - reduction: 4" \
  " - 作用: 选择对分割最优的特征" \
  " - 强化: 语义纹理、连续性、全局语义" \
  "" \
  " 总参数增加: 262,144 (0.26M)" \
  " 计算增加: ~1.6ms (0.06%)" \
  "" \
  "📊 Evaluation优化:" \
  " - Validation样本: 3,010 (减少50%)" \
  " - Evaluation频率: 每10 epochs" \
  " - 总开销: 减少75%" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# GPU status: list index, name, total and free memory for each GPU as
# header-less CSV. Under `set -e` a failing nvidia-smi aborts the script.
printf '%s\n' "" "=== 5. GPU状态 ==="
gpu_query_fields="index,name,memory.total,memory.free"
nvidia-smi --query-gpu="$gpu_query_fields" --format=csv,noheader
# Cache cleanup: remove leftover ".eval_hook" directories under the runs tree.
# Cleanup stays best-effort (a failure never aborts the launch), but the
# outcome is now reported truthfully.
echo ""
echo "=== 6. 清理旧缓存 ==="
# -prune stops find from descending into a matched directory, so nested matches
# are not double-counted here and deleted directories are not re-visited below.
EVAL_HOOK_COUNT=$(find /workspace/bevfusion/runs -name ".eval_hook" -type d -prune -print 2>/dev/null | wc -l)
if [ "$EVAL_HOOK_COUNT" -gt 0 ]; then
  echo "发现 $EVAL_HOOK_COUNT 个.eval_hook目录正在删除..."
  # Without -prune, find tries to enter each directory right after rm deleted
  # it and reports spurious errors. '{} +' batches all paths into as few rm
  # calls as possible, and '--' protects against names starting with '-'.
  if find /workspace/bevfusion/runs -name ".eval_hook" -type d -prune -exec rm -rf -- {} +; then
    echo "✅ 已清理"
  else
    echo "⚠️ 部分.eval_hook目录清理失败 (继续执行)"
  fi
else
  echo "✅ 无需清理"
fi
# Launch confirmation: print the training plan and require an explicit
# single-key "y"/"Y" before anything is started.
echo ""
echo "=== 7. 准备启动训练 ==="
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "训练周期: Epoch 6-20 (15 epochs)"
echo "预计时间: ~7天 (FP32)"
echo "预计完成: 2025-11-13"
echo ""
echo "预期性能提升:"
echo " 检测: mAP 0.68 → 0.70 (+2.9%)"
echo " 分割: mIoU 0.55 → 0.61 (+10%)"
echo " Divider: Dice 0.52 → 0.42 (-19%)"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
# `read` returns non-zero on EOF (for example stdin redirected from /dev/null
# under nohup). Under `set -e` that used to terminate the script without any
# message; report it explicitly instead.
if ! read -p "确认启动 Task-specific GCA训练? (y/n) " -n 1 -r; then
  echo
  echo "❌ 无法读取确认输入 (stdin不可用), 训练未启动"
  exit 1
fi
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 用户取消"
  exit 1
fi
# Launch info: record the start time, config file and checkpoint, then describe
# the partial-loading strategy (only matching weights are loaded; the new
# task_gca modules are randomly initialised).
# NOTE(review): the text below says --load_from loads model weights only, yet
# also says training starts from epoch 6; confirm tools/train.py really
# continues the epoch counter when given --load_from.
printf '%s\n' \
  "" \
  "=== 8. 启动训练 ===" \
  "开始时间: $(date)" \
  "配置文件: multitask_BEV2X_phase4a_stage1_task_gca.yaml" \
  "Checkpoint: $LATEST_CKPT" \
  ""
printf '%s\n' \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  "加载策略:" \
  " ✅ 从epoch_5.pth加载已有权重:" \
  " - encoders.camera (Swin Transformer骨干)" \
  " - encoders.lidar (Sparse Encoder)" \
  " - fuser (ConvFuser)" \
  " - decoder.backbone + neck" \
  " - heads.object (TransFusion检测头)" \
  " - heads.map (EnhancedBEVSeg分割头)" \
  "" \
  " ✨ 新增模块随机初始化:" \
  " - task_gca['object'] (检测GCA131K参数)" \
  " - task_gca['map'] (分割GCA131K参数)" \
  "" \
  " 📝 使用--load_from (非--resume-from):" \
  " - 只加载模型权重,忽略不匹配的键" \
  " - 训练从epoch 6开始 (继续之前的进度)" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  ""
# Run training through `torchpack dist-run -np 8`, loading weights from
# LATEST_CKPT. The command is assembled as an array so each argument stays a
# separate word. Under `set -e` a failed run aborts before the completion
# message below is printed.
train_cmd=(
  torchpack dist-run
  -np 8
  /opt/conda/bin/python tools/train.py
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_task_gca.yaml
  --load_from "$LATEST_CKPT"
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)
"${train_cmd[@]}"
printf '%s\n' "" "=== 训练完成 ===" "结束时间: $(date)"