bev-project/archive/scripts_old/START_PHASE4A_WITH_GCA.sh

112 lines
3.0 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Launch script for Phase 4A Stage 1 training (GCA module integrated,
# evaluation tuned).
# Date: 2025-11-06
# Highlights:
#   1. GCA global-context module integrated into the segmentation head
#   2. Validation samples cut by 50% (6019 -> 3010)
#   3. Evaluation runs less often (interval: 5 -> 10)
#   4. Training continues from epoch_5

# Stop at the first command that fails.
set -o errexit

# Opening banner: title framed by two identical rule lines.
banner_rule="=============================================="
printf '%s\n' "$banner_rule" "Phase 4A Stage 1 训练 (GCA优化版)" "$banner_rule"
# Section 1: the project checkout must exist; all later relative paths are
# resolved from inside it.
printf '\n%s\n' "=== 1. 环境检查 ==="
project_root="/workspace/bevfusion"
[[ -d "$project_root" ]] || {
  echo "❌ 错误: ${project_root} 不存在"
  exit 1
}
cd "$project_root"
# Section 2: report free disk space under /workspace and warn when it is low.

# Print the space available on the filesystem holding $1, in whole GiB.
# `df -Pk` forces the POSIX one-line-per-filesystem layout in 1024-byte
# units, so field 4 of the second line is always "Available" in KiB, however
# long the device name is and whatever BLOCKSIZE/POSIXLY_CORRECT are set to.
# Prints nothing when df produces no data line (e.g. the path is missing).
available_gb() {
  df -Pk "$1" | awk 'NR == 2 { print int($4 / 1024 / 1024) }'
}

echo ""
echo "=== 2. 磁盘空间检查 ==="
AVAIL_GB=$(available_gb /workspace)
if [[ "$AVAIL_GB" =~ ^[0-9]+$ ]]; then
  echo "可用空间: ${AVAIL_GB}GB"
  # Warning only: low space does not stop the launch.
  if (( AVAIL_GB < 30 )); then
    echo "⚠️ 警告: 磁盘空间不足30GB建议清理"
    echo "当前可用: ${AVAIL_GB}GB"
  fi
else
  # df failed or returned something non-numeric; say so instead of feeding
  # an empty value to an integer comparison.
  echo "⚠️ 警告: 无法获取 /workspace 的磁盘空间信息" >&2
fi
# Section 3: choose the checkpoint to continue from.
# epoch_5.pth is preferred; if it is missing, fall back to the most recently
# modified epoch_*.pth in the same run directory. Aborts when none exists.
# LATEST_CKPT is consumed later by the training command.
echo ""
echo "=== 3. Checkpoint检查 ==="
CKPT_DIR="/workspace/bevfusion/runs/run-326653dc-2334d461"
LATEST_CKPT="${CKPT_DIR}/epoch_5.pth"
if [[ ! -f "$LATEST_CKPT" ]]; then
  echo "❌ 错误: 未找到 epoch_5.pth"
  echo "尝试查找最新checkpoint..."
  # Pick the newest match by modification time using a glob and -nt rather
  # than parsing `ls -t` output, which is fragile with unusual file names.
  LATEST_CKPT=""
  for candidate in "${CKPT_DIR}"/epoch_*.pth; do
    # An unmatched glob stays as the literal pattern; skip anything that is
    # not an existing file.
    [[ -f "$candidate" ]] || continue
    if [[ -z "$LATEST_CKPT" || "$candidate" -nt "$LATEST_CKPT" ]]; then
      LATEST_CKPT="$candidate"
    fi
  done
  if [[ -z "$LATEST_CKPT" ]]; then
    echo "❌ 未找到任何checkpoint"
    exit 1
  fi
fi
echo "✅ 使用checkpoint: $LATEST_CKPT"
# Section 4: print a summary of the training setup.
# These lines are fixed text for the operator; they are not read from the
# config file, so keep them in sync by hand.
summary_lines=(
  ""
  "=== 4. 训练配置摘要 ==="
  "配置文件: multitask_BEV2X_phase4a_stage1.yaml"
  "起始epoch: 5"
  "目标epoch: 20"
  "学习率: 2.0e-5"
  "BEV分辨率: 600×600"
  "Validation样本: 3,010 (减少50%)"
  "Evaluation频率: 每10 epochs"
  "新增特性: GCA全局上下文模块 ✨"
)
printf '%s\n' "${summary_lines[@]}"
# Section 5: list each GPU's index, name, total memory and free memory as
# header-less CSV.
printf '\n%s\n' "=== 5. GPU状态 ==="
gpu_fields="index,name,memory.total,memory.free"
nvidia-smi --query-gpu="${gpu_fields}" --format=csv,noheader
# Section 6: precautionary removal of a leftover .eval_hook directory in the
# run folder before training starts.
printf '\n%s\n' "=== 6. 清理旧缓存 ==="
stale_eval_dir="/workspace/bevfusion/runs/run-326653dc-2334d461/.eval_hook"
if [[ ! -d "$stale_eval_dir" ]]; then
  echo "✅ 无需清理"
else
  echo "发现旧的.eval_hook正在删除..."
  rm -rf "${stale_eval_dir}/"
  echo "✅ 已清理"
fi
# Section 7: ask the operator for an explicit go-ahead before the long run.
echo ""
echo "=== 7. 准备启动训练 ==="
echo "即将开始训练,剩余 15 epochs (epoch 6-20)"
echo "预计完成时间: ~7天 (FP32)"
echo ""
# `read` returns non-zero at end-of-file (stdin closed or redirected from
# /dev/null, as happens under nohup or a scheduler). With `set -e` active that
# would end the script right after the prompt with no explanation, so the
# failure is caught here and reported.
if ! read -p "确认启动? (y/n) " -n 1 -r; then
  echo
  echo "❌ 无法读取确认输入 (stdin 不可用)" >&2
  exit 1
fi
# -n 1 returns after one keystroke without a newline; finish the prompt line.
echo
# Only a single y or Y counts as consent.
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "❌ 用户取消"
  exit 1
fi
# Section 8: run distributed training, bracketed by start and end timestamps.
printf '\n%s\n' "=== 8. 启动训练 ==="
echo "开始时间: $(date)"
echo ""

# Arguments for tools/train.py, held in an array so each one stays a single
# word when expanded.
# NOTE(review): the same checkpoint is passed to both --load_from and
# --resume-from; confirm the trainer needs both.
train_args=(
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1.yaml
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
  --load_from "$LATEST_CKPT"
  --resume-from "$LATEST_CKPT"
)

# Launched through torchpack with -np 8.
torchpack dist-run -np 8 python tools/train.py "${train_args[@]}"

printf '\n%s\n' "=== 训练完成 ==="
echo "结束时间: $(date)"