#!/bin/bash
#
# Phase 4A Stage 1 - task-specific GCA, optimized variant (numerical-stability fix)
# Date: 2025-11-13
#
# Core idea: task-specific GCA
#   - Detection GCA: picks the detection-optimal features out of the 512 channels
#   - Segmentation GCA: picks the segmentation-optimal features out of the 512 channels
#   - Each task takes what it needs instead of settling for a compromise
#
# Numerical-stability fixes:
#   - Learning rate: 2.0e-5 -> 1.0e-5
#   - Gradient clipping: 35 -> 25
#   - FP16 loss scale: add init_scale=512
#
# NOTE(review): none of these values are passed on the training command line
# in this script; presumably they are set in the YAML config - confirm.

# Abort on the first command that fails.
set -e

# Startup banner: summarizes the architecture and the stability tweaks
# associated with this run. The delimiter is quoted so the text is emitted
# literally, with no expansion.
cat <<'BANNER'
══════════════════════════════════════════════════════════
Phase 4A Stage 1 - Task-specific GCA优化版 (数值稳定性修复)
══════════════════════════════════════════════════════════

✨ 架构亮点:
 Decoder Neck → 原始BEV (512通道,完整信息)
 ├─ 检测GCA → 检测最优BEV → 检测头 ✅
 └─ 分割GCA → 分割最优BEV → 分割头 ✅

 优势: 每个任务根据自己需求选择特征
 vs Shared GCA: 避免统一选择的折中问题

🔧 数值稳定性修复:
 - 学习率降低至1.0e-5 (从2.0e-5)
 - 梯度裁剪优化至25 (从35)
 - FP16 loss scale init_scale=512
══════════════════════════════════════════════════════════
BANNER

# Pre-flight checks: make sure the repository directory, the checkpoint to
# load, and the training config all exist before anything is launched.
# Failures are reported on stderr and end the script with status 1.
echo ""
echo "=== 1. 环境配置 (参考Phase 3成功经验) ==="
if [[ ! -d "/workspace/bevfusion" ]]; then
  echo "❌ 错误: /workspace/bevfusion 不存在" >&2
  exit 1
fi

# Checkpoint whose weights are handed to the trainer via --load_from.
LOAD_CKPT="/workspace/bevfusion/runs/run-326653dc-74fea435/epoch_2.pth"
if [[ ! -f "$LOAD_CKPT" ]]; then
  echo "❌ 错误: 未找到 $LOAD_CKPT" >&2
  echo "请确保昨天的epoch_2.pth文件存在" >&2
  exit 1
fi

# Training configuration passed to tools/train.py.
CONFIG_FILE="/workspace/bevfusion/configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_task_gca.yaml"
if [[ ! -f "$CONFIG_FILE" ]]; then
  echo "❌ 错误: 未找到配置文件 $CONFIG_FILE" >&2
  exit 1
fi

echo "✅ 将从 $LOAD_CKPT 加载权重"
ls -lh "$LOAD_CKPT"
echo "✅ 配置文件: $CONFIG_FILE"

# Create the directory that receives the start-info log, the training log
# and the PID file (no-op if it already exists).
LOG_DIR="/data/runs/phase4a_stage1_task_gca_fixed"
mkdir -p "$LOG_DIR"

echo ""
echo "=== 2. 启动训练 (后台运行,数值稳定性优化) ==="

# Key step: export the runtime environment (modelled on the earlier
# working launch script).
export PATH="/opt/conda/bin:$PATH"
# ${VAR:+:$VAR} appends the previous value only when it is non-empty, so an
# unset variable does not leave a trailing ':' (an empty search-path entry,
# which is interpreted as the current directory).
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"

# GPU selection and NCCL transport settings.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ib0

echo "✅ 环境变量已设置:"
echo " PATH: /opt/conda/bin:..."
echo " LD_LIBRARY_PATH: 已设置"
echo " PYTHONPATH: /workspace/bevfusion"
# This script never sets PYTORCH_CUDA_ALLOC_CONF; show an explicit marker
# instead of a blank value when the caller did not provide one.
echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF:-(未设置)}"

# Verify the toolchain before launching: torch and mmcv must import under
# the conda interpreter, and torchpack must be resolvable on PATH.
# Failures are reported on stderr and end the script with status 1.
echo ""
echo "=== 环境验证 ==="
if ! /opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)"; then
  echo "❌ PyTorch导入失败" >&2
  exit 1
fi
if ! /opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)"; then
  echo "❌ mmcv导入失败" >&2
  exit 1
fi
# 'command -v' is a shell builtin, so the lookup does not depend on an
# external 'which' binary; the resolved path is looked up once and reused.
if ! torchpack_bin="$(command -v torchpack)"; then
  echo "❌ torchpack未找到" >&2
  exit 1
fi
echo "✅ torchpack: $torchpack_bin"

# Record start-up information and launch training in the background.
# Reads LOG_DIR, LOAD_CKPT and CONFIG_FILE; writes TRAIN_PID and
# "$LOG_DIR/train_pid.txt".

# Take the timestamp once so every start-info line goes to the same file and
# the training log carries a matching stamp.
run_stamp="$(date +%Y%m%d_%H%M%S)"
start_log="$LOG_DIR/train_start_${run_stamp}.log"
train_log="$LOG_DIR/train_${run_stamp}.log"

{
  echo "开始时间: $(date)"
  echo "GPU数量: 8"
  echo "Checkpoint: $LOAD_CKPT"
  echo "配置: $CONFIG_FILE"
} | tee "$start_log"

# tools/train.py is given as a relative path, so run from the repository
# root regardless of where this script was invoked.
cd /workspace/bevfusion || {
  echo "❌ 错误: 无法进入 /workspace/bevfusion" >&2
  exit 1
}

# Start training; stdout and stderr are appended to the training log.
torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py \
  "$CONFIG_FILE" \
  --load_from "$LOAD_CKPT" \
  --data.samples_per_gpu 1 \
  --data.workers_per_gpu 0 \
  >> "$train_log" 2>&1 &

# PID of the background launcher process.
TRAIN_PID=$!
echo "训练进程ID: $TRAIN_PID"

# Persist the PID to a file for later use.
echo "$TRAIN_PID" > "$LOG_DIR/train_pid.txt"

# Final summary: where the logs are, how to monitor or stop the run, and
# what this variant is expected to improve. One printf argument per output
# line; TRAIN_PID and LOG_DIR are expanded, the '*' globs stay literal
# because they are inside double quotes.
printf '%s\n' \
  "" \
  "=== 3. 监控设置 ===" \
  "✅ 训练已启动 (PID: $TRAIN_PID)" \
  "✅ 日志目录: $LOG_DIR" \
  "✅ 实时日志: tail -f $LOG_DIR/train_*.log" \
  "" \
  "📊 监控命令:" \
  " # 查看训练状态" \
  " ps aux | grep train" \
  " " \
  " # 查看最新日志" \
  " tail -50 $LOG_DIR/train_*.log" \
  " " \
  " # 查看GPU使用情况" \
  " nvidia-smi" \
  " " \
  " # 停止训练" \
  " kill $TRAIN_PID" \
  "" \
  "🎯 预期改善:" \
  " - 消除NaN梯度问题" \
  " - 提高训练稳定性" \
  " - 维持Task-GCA架构优势" \
  "" \
  "══════════════════════════════════════════════════════════" \
  "🚀 Phase 4A Stage 1 - 数值稳定性修复版启动完成!" \
  "══════════════════════════════════════════════════════════"