#!/bin/bash
# Phase 4A: BEV 2x training - full environment setup.
#
# Checks that the base checkpoint and the training config exist, creates the
# run directory, launches distributed training in the background under nohup
# (so it survives the launching shell), and finally verifies that the
# training process is still alive a few seconds after start.
#
# All relative paths below are resolved against /workspace/bevfusion.
set -euo pipefail

# Environment setup.
# ${VAR:+:$VAR} appends the previous value only when it is non-empty. This
# avoids a trailing ':' (an empty search-path entry, which the dynamic loader
# interprets as the current directory) and keeps the exports safe under
# `set -u` when the variable is not set at all.
export PATH="/opt/conda/bin${PATH:+:$PATH}"
export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"

cd /workspace/bevfusion

# Paths used by this run (relative to the working directory set above).
readonly CHECKPOINT="runs/enhanced_from_epoch19/epoch_23.pth"
readonly CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml"
readonly RUN_DIR="runs/phase4a_bev2x"
readonly LOG_FILE="phase4a_bev2x.log"
readonly RULE="========================================================================"

echo "$RULE"
echo "Phase 4A: BEV 2x分辨率提升训练"
echo "$RULE"
echo "配置: BEV 0.15m分辨率 (2倍提升)"
echo "Decoder: 4层完整版 [256, 256, 128, 128]"
echo "基础模型: epoch_23.pth (NDS 0.6941, mAP 0.6446)"
echo "目标: 20 epochs"
echo "$RULE"
echo ""

# Verify the base checkpoint that training is loaded from.
if [[ ! -f "$CHECKPOINT" ]]; then
  echo "❌ 错误: epoch_23.pth不存在!" >&2
  exit 1
fi
echo "✅ Checkpoint存在"

# Verify the training config file.
if [[ ! -f "$CONFIG" ]]; then
  echo "❌ 错误: 配置文件不存在!" >&2
  exit 1
fi
echo "✅ 配置文件存在"

# Create the output directory.
mkdir -p "$RUN_DIR"
echo "✅ 输出目录创建"

# Informational summary only: apart from the process count passed to
# torchpack below, none of these values are set by this script.
echo ""
echo "训练配置:"
echo " - GPU数量: 6"
echo " - Batch size: 1 per GPU"
echo " - Workers: 0"
echo " - 学习率: 2e-5"
echo " - Epochs: 20"
echo " - 预计时间: 12.5天"
echo " - 预计显存: ~28-29GB/GPU"
echo ""

echo "启动训练..."
echo "日志文件: $LOG_FILE"
echo ""

# Launch training. The command is built as an array so every argument is
# passed as exactly one word regardless of its content.
train_cmd=(
  torchpack dist-run -np 6 python tools/train.py
  "$CONFIG"
  --model.encoders.camera.backbone.init_cfg.checkpoint=pretrained/swint-nuimages-pretrained.pth
  --load_from "$CHECKPOINT"
  --run-dir "$RUN_DIR"
)
# NOTE: '>' truncates any existing log from a previous launch.
nohup "${train_cmd[@]}" > "$LOG_FILE" 2>&1 &
TRAIN_PID=$!

echo "✅ 训练已启动"
echo ""
echo "$RULE"
echo "训练信息:"
echo " PID: $TRAIN_PID"
echo " 日志: $LOG_FILE"
echo " 输出: $RUN_DIR/"
echo ""
echo "监控命令:"
echo " 实时日志: tail -f $LOG_FILE"
echo " 训练进度: bash monitor_phase4a.sh"
echo " GPU状态: nvidia-smi"
echo ""
echo "预计完成: 2025年11月12日"
echo "$RULE"
echo ""

# Wait 3 seconds, then check whether the launcher process is still alive.
# A background job failing does not trip `set -e`, so this explicit check is
# the only early signal of an immediate startup failure.
sleep 3
if ps -p "$TRAIN_PID" > /dev/null; then
  echo "✅ 训练进程运行中 (PID: $TRAIN_PID)"
else
  echo "⚠️ 训练进程未运行,请检查日志" >&2
  tail -20 "$LOG_FILE"
fi