#!/bin/bash
#
# Phase 4A Stage 1 training launcher (GCA module integrated, lighter evaluation).
# Date: 2025-11-06
#
# Features:
#   1. GCA global-context module integrated into the segmentation head
#   2. Validation set reduced by 50% (6019 -> 3010 samples)
#   3. Evaluation runs less often (interval: 5 -> 10)
#   4. Training continues from epoch_5

# Abort on the first unhandled command failure.
set -e

banner_rule="=============================================="
echo "$banner_rule"
echo "Phase 4A Stage 1 训练 (GCA优化版)"
echo "$banner_rule"

# Environment check: the BEVFusion checkout must exist before anything else.
printf '\n%s\n' "=== 1. 环境检查 ==="
[[ -d /workspace/bevfusion ]] || {
  echo "❌ 错误: /workspace/bevfusion 不存在"
  exit 1
}

# Run from the repository root; the training command uses relative paths.
cd /workspace/bevfusion

# Disk-space check (advisory: low space only warns, it never aborts).
echo ""
echo "=== 2. 磁盘空间检查 ==="

# Print the free space of the filesystem holding $1, in whole GiB.
# -P keeps every filesystem on one output line (no wrapping for long device
# names) and -k pins the unit to 1 KiB blocks, so field 4 of line 2 is always
# the available size in KiB.
# Returns non-zero when df fails or does not yield a number.
available_gb() {
  local avail_kb
  avail_kb=$(df -Pk -- "$1" | awk 'NR == 2 { print $4 }') || return 1
  [[ "$avail_kb" =~ ^[0-9]+$ ]] || return 1
  echo $(( avail_kb / 1024 / 1024 ))
}

if AVAIL_GB=$(available_gb /workspace); then
  echo "可用空间: ${AVAIL_GB}GB"
  if (( AVAIL_GB < 30 )); then
    echo "⚠️ 警告: 磁盘空间不足30GB,建议清理"
    echo "当前可用: ${AVAIL_GB}GB"
  fi
else
  # An empty value used to reach `[ "" -lt 30 ]`, which printed
  # "integer expression expected" and silently skipped the warning.
  AVAIL_GB=""
  echo "⚠️ 警告: 无法获取 /workspace 的可用磁盘空间,跳过检查" >&2
fi

# Checkpoint selection: prefer epoch_5.pth; otherwise fall back to the most
# recently modified epoch_*.pth in the same run directory.
echo ""
echo "=== 3. Checkpoint检查 ==="
LATEST_CKPT="/workspace/bevfusion/runs/run-326653dc-2334d461/epoch_5.pth"
if [[ ! -f "$LATEST_CKPT" ]]; then
  echo "❌ 错误: 未找到 epoch_5.pth"
  echo "尝试查找最新checkpoint..."
  # Find the newest file with a glob and -nt instead of parsing `ls -t`
  # output: no word-splitting on names, and only regular files qualify.
  ckpt_dir=${LATEST_CKPT%/*}
  LATEST_CKPT=""
  for candidate in "$ckpt_dir"/epoch_*.pth; do
    # An unmatched glob stays literal, so skip anything that is not a file.
    [[ -f "$candidate" ]] || continue
    if [[ -z "$LATEST_CKPT" || "$candidate" -nt "$LATEST_CKPT" ]]; then
      LATEST_CKPT=$candidate
    fi
  done
  if [[ -z "$LATEST_CKPT" ]]; then
    echo "❌ 未找到任何checkpoint"
    exit 1
  fi
fi
echo "✅ 使用checkpoint: $LATEST_CKPT"

# Configuration summary. Informational only: every value printed here is a
# literal in this script, not something read back from the YAML config.
printf '%s\n' \
  "" \
  "=== 4. 训练配置摘要 ===" \
  "配置文件: multitask_BEV2X_phase4a_stage1.yaml" \
  "起始epoch: 5" \
  "目标epoch: 20" \
  "学习率: 2.0e-5" \
  "BEV分辨率: 600×600" \
  "Validation样本: 3,010 (减少50%)" \
  "Evaluation频率: 每10 epochs" \
  "新增特性: GCA全局上下文模块 ✨"

# GPU status: list each visible GPU with its total and free memory.
echo ""
echo "=== 5. GPU状态 ==="
# Without this guard, a missing nvidia-smi aborts the script under `set -e`
# with only bash's generic "command not found"; fail with a clear message.
if ! command -v nvidia-smi >/dev/null 2>&1; then
  echo "❌ 错误: 未找到 nvidia-smi,无法检查GPU状态" >&2
  exit 1
fi
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv,noheader

# Precautionary cleanup: remove a leftover .eval_hook directory from the run
# directory, if one exists.
echo ""
echo "=== 6. 清理旧缓存 ==="
stale_hook_dir="/workspace/bevfusion/runs/run-326653dc-2334d461/.eval_hook"
if [[ ! -d "$stale_hook_dir" ]]; then
  echo "✅ 无需清理"
else
  echo "发现旧的.eval_hook,正在删除..."
  rm -rf "${stale_hook_dir}/"
  echo "✅ 已清理"
fi

# Final confirmation before starting the run.
echo ""
echo "=== 7. 准备启动训练 ==="
echo "即将开始训练,剩余 15 epochs (epoch 6-20)"
echo "预计完成时间: ~7天 (FP32)"
echo ""
# read returns non-zero on EOF (stdin closed or /dev/null, e.g. under nohup).
# With `set -e` that used to end the script silently right after the prompt,
# so treat an unreadable answer as "not confirmed" and say so.
if ! read -p "确认启动? (y/n) " -n 1 -r; then
  echo
  echo "❌ 无法读取确认输入 (stdin不可用),已取消" >&2
  exit 1
fi
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 用户取消"
  exit 1
fi

# Launch training and print start/end timestamps around it.
echo ""
echo "=== 8. 启动训练 ==="
echo "开始时间: $(date)"
echo ""

# Build the command as an array, one logical argument group per line.
# NOTE(review): the same checkpoint is passed to both --load_from and
# --resume-from; confirm which one tools/train.py actually honours.
train_cmd=(
  torchpack dist-run
  -np 8
  python tools/train.py
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1.yaml
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
  --load_from "$LATEST_CKPT"
  --resume-from "$LATEST_CKPT"
)
"${train_cmd[@]}"

echo ""
echo "=== 训练完成 ==="
echo "结束时间: $(date)"