#!/bin/bash
#
# Phase 4A launcher: starts the BEV 2x training run in the background.
#
# Steps: verify the base checkpoint and the config file exist, create the run
# directory, print GPU status, ask for interactive confirmation, then launch
# `torchpack dist-run` under nohup with all output redirected to a log file.
#
# Must be run from the repository root: every path below is relative.
# Exit status: 0 when training was launched or the user declined,
#              1 when a preflight check or the launch itself failed.

# Single source of truth for every path/count used both in the checks and in
# the launch command, so what is verified is exactly what is used.
readonly CHECKPOINT="runs/enhanced_from_epoch19/epoch_23.pth"
readonly CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml"
readonly CAMERA_PRETRAINED="pretrained/swint-nuimages-pretrained.pth"
readonly RUN_DIR="runs/phase4a_bev2x"
readonly LOG_FILE="phase4a_bev2x.log"
readonly NUM_GPUS=6
readonly RULE="========================================================================"

# Print an error message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

echo "$RULE"
echo "Phase 4A: BEV 2x训练启动"
echo "$RULE"
echo "配置: BEV分辨率0.15m (2倍提升)"
echo "Decoder: 4层完整版 [256, 256, 128, 128]"
echo "基础模型: epoch_23.pth"
echo "目标: 20 epochs"
echo "$RULE"

# 1. Check the base checkpoint
if [[ ! -f "$CHECKPOINT" ]]; then
  die "❌ 错误: epoch_23.pth不存在!"
fi
echo "✅ Checkpoint存在: epoch_23.pth"

# 2. Check the config file
if [[ ! -f "$CONFIG" ]]; then
  die "❌ 错误: 配置文件不存在!"
fi
echo "✅ 配置文件存在"

# 3. Create the run directory (report success only if mkdir really succeeded)
mkdir -p "$RUN_DIR" || die "❌ 错误: 无法创建运行目录: $RUN_DIR"
echo "✅ 运行目录创建"

# 4. Show GPU status (informational only; a failure here does not abort)
echo ""
echo "GPU状态:"
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv,noheader
echo ""

# 5. Ask for confirmation
echo "$RULE"
echo "即将启动训练,配置信息:"
echo " - GPU数量: $NUM_GPUS"
echo " - Batch size: 1 per GPU"
echo " - Workers: 0"
echo " - 学习率: 2e-5"
echo " - Epochs: 20"
echo " - 预计时间: ~12.5天"
echo " - 预计显存: ~28-29GB/GPU"
echo "$RULE"
echo ""
# -n 1 returns after a single keypress, so emit the newline ourselves.
read -p "确认启动? (y/n) " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "取消启动"
  exit 0
fi

# 6. Launch training in the background, detached from the terminal
echo ""
echo "🚀 启动训练..."
echo ""
nohup torchpack dist-run -np "$NUM_GPUS" python tools/train.py \
  "$CONFIG" \
  --model.encoders.camera.backbone.init_cfg.checkpoint="$CAMERA_PRETRAINED" \
  --load_from "$CHECKPOINT" \
  --run-dir "$RUN_DIR" \
  > "$LOG_FILE" 2>&1 &
TRAIN_PID=$!

# Backgrounding always "succeeds", so probe the process shortly afterwards to
# catch immediate failures (e.g. torchpack not on PATH) before claiming success.
sleep 3
if ! kill -0 "$TRAIN_PID" 2>/dev/null; then
  echo "❌ 错误: 训练进程启动后立即退出, 日志末尾:" >&2
  tail -n 20 -- "$LOG_FILE" >&2
  exit 1
fi

echo "✅ 训练已启动"
echo ""
echo "$RULE"
echo "训练信息:"
echo " PID: $TRAIN_PID"
echo " 日志: $LOG_FILE"
echo " 输出: $RUN_DIR/"
echo ""
echo "监控命令:"
echo " 实时日志: tail -f $LOG_FILE"
echo " GPU状态: watch -n 60 nvidia-smi"
echo " 训练进度: watch -n 300 'tail -30 $LOG_FILE | grep Epoch'"
echo ""
echo "检查命令:"
echo " 检查进程: ps aux | grep $TRAIN_PID"
echo " 检查checkpoint: ls -lh $RUN_DIR/*.pth"
echo "$RULE"
echo ""
echo "Phase 4A训练已开始,预计2025年11月11日完成"
echo ""