bev-project/archive/scripts_old/START_PHASE4A_TASK_GCA_BACK...

151 lines
5.3 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4A Stage 1 - Task-specific GCA optimized version (numerical-stability fixes)
# Date: 2025-11-13
#
# ✨✨✨ Core innovation: task-specific GCA ✨✨✨
# - Detection GCA: selects detection-optimal features out of the 512 channels
# - Segmentation GCA: selects segmentation-optimal features out of the 512 channels
# - Each task picks what it needs, avoiding a shared-selection compromise
#
# 🔧 Numerical-stability fixes applied in this run's config:
# - Learning rate: 2.0e-5 → 1.0e-5
# - Gradient clipping: 35 → 25
# - FP16 loss scale: added init_scale=512
# NOTE(review): consider `set -euo pipefail`; adding -u would require first
# defaulting $PYTORCH_CUDA_ALLOC_CONF (echoed later but never exported here).
set -e
# Startup banner: the task-specific GCA architecture summary and the
# numerical-stability changes for this run. Quoted heredoc delimiter keeps
# every line literal (no expansion), and the output matches the original
# echo sequence byte-for-byte.
cat <<'BANNER'
══════════════════════════════════════════════════════════
Phase 4A Stage 1 - Task-specific GCA优化版 (数值稳定性修复)
══════════════════════════════════════════════════════════

✨ 架构亮点:
 Decoder Neck → 原始BEV (512通道完整信息)
 ├─ 检测GCA → 检测最优BEV → 检测头 ✅
 └─ 分割GCA → 分割最优BEV → 分割头 ✅

 优势: 每个任务根据自己需求选择特征
 vs Shared GCA: 避免统一选择的折中问题

🔧 数值稳定性修复:
 - 学习率降低至1.0e-5 (从2.0e-5)
 - 梯度裁剪优化至25 (从35)
 - FP16 loss scale init_scale=512
══════════════════════════════════════════════════════════
BANNER
# Environment sanity checks: repo directory, resume checkpoint, config file.
# Each missing prerequisite aborts with the same message and exit code as before.
echo ""
echo "=== 1. 环境配置 (参考Phase 3成功经验) ==="

[[ -d /workspace/bevfusion ]] || {
  echo "❌ 错误: /workspace/bevfusion 不存在"
  exit 1
}

# Checkpoint from the previous day's run; training resumes from its weights.
LOAD_CKPT="/workspace/bevfusion/runs/run-326653dc-74fea435/epoch_2.pth"
[[ -f "$LOAD_CKPT" ]] || {
  echo "❌ 错误: 未找到 $LOAD_CKPT"
  echo "请确保昨天的epoch_2.pth文件存在"
  exit 1
}

CONFIG_FILE="/workspace/bevfusion/configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_task_gca.yaml"
[[ -f "$CONFIG_FILE" ]] || {
  echo "❌ 错误: 未找到配置文件 $CONFIG_FILE"
  exit 1
}

echo "✅ 将从 $LOAD_CKPT 加载权重"
ls -lh "$LOAD_CKPT"
echo "✅ 配置文件: $CONFIG_FILE"

# Directory that will hold this run's logs and PID file.
LOG_DIR="/data/runs/phase4a_stage1_task_gca_fixed"
mkdir -p "$LOG_DIR"
echo ""
echo "=== 2. 启动训练 (后台运行,数值稳定性优化) ==="
# Key: environment variables mirror the previously successful Phase 3 script.
export PATH=/opt/conda/bin:$PATH
export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export PYTHONPATH=/workspace/bevfusion:$PYTHONPATH
# GPU selection: all eight GPUs participate in the distributed run.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# NOTE(review): NCCL_IB_DISABLE=1 disables the InfiniBand transport, yet
# NCCL_SOCKET_IFNAME=ib0 points the socket transport at the IB interface —
# confirm this combination is intentional (IP-over-IB?) and not a leftover.
export NCCL_IB_DISABLE=1
export NCCL_SOCKET_IFNAME=ib0
echo "✅ 环境变量已设置:"
echo " PATH: /opt/conda/bin:..."
echo " LD_LIBRARY_PATH: 已设置"
echo " PYTHONPATH: /workspace/bevfusion"
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is never exported in this script, so
# this line prints empty — either export it above or drop the line.
echo " PYTORCH_CUDA_ALLOC_CONF: $PYTORCH_CUDA_ALLOC_CONF"
# Verify the training stack is importable before spending time launching.
echo ""
echo "=== 环境验证 ==="
/opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)" || {
  echo "❌ PyTorch导入失败"
  exit 1
}
/opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" || {
  echo "❌ mmcv导入失败"
  exit 1
}
# Fix: use the POSIX-recommended `command -v` instead of the non-portable
# `which`; redirect to /dev/null so the path is printed only once (below).
command -v torchpack >/dev/null || {
  echo "❌ torchpack未找到"
  exit 1
}
echo "✅ torchpack: $(command -v torchpack)"
# Record launch metadata to a start-log file.
# Fix: compute the timestamp ONCE. The original evaluated
# $(date +%Y%m%d_%H%M%S) independently on every line, so if the clock ticked
# over a second between the `tee` (create) and the `tee -a` (append) calls,
# the entries were scattered across two different log files.
START_TS=$(date +%Y%m%d_%H%M%S)
START_LOG="$LOG_DIR/train_start_${START_TS}.log"
echo "开始时间: $(date)" | tee "$START_LOG"
echo "GPU数量: 8" | tee -a "$START_LOG"
echo "Checkpoint: $LOAD_CKPT" | tee -a "$START_LOG"
echo "配置: $CONFIG_FILE" | tee -a "$START_LOG"
# Launch distributed training (8 ranks) in the background; stdout and stderr
# both append to a timestamped log file under $LOG_DIR.
LOG_FILE="$LOG_DIR/train_$(date +%Y%m%d_%H%M%S).log"
torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py \
  "$CONFIG_FILE" \
  --load_from "$LOAD_CKPT" \
  --data.samples_per_gpu 1 \
  --data.workers_per_gpu 0 \
  >> "$LOG_FILE" 2>&1 &

# Remember the background job's PID so the operator can monitor or stop it.
TRAIN_PID=$!
echo "训练进程ID: $TRAIN_PID"
echo "$TRAIN_PID" > "$LOG_DIR/train_pid.txt"

# Operator cheatsheet. Unquoted heredoc delimiter so $TRAIN_PID / $LOG_DIR
# expand; everything else (pipes, globs, '#') stays literal in the output.
cat <<DONE

=== 3. 监控设置 ===
✅ 训练已启动 (PID: $TRAIN_PID)
✅ 日志目录: $LOG_DIR
✅ 实时日志: tail -f $LOG_DIR/train_*.log

📊 监控命令:
 # 查看训练状态
 ps aux | grep train
 
 # 查看最新日志
 tail -50 $LOG_DIR/train_*.log
 
 # 查看GPU使用情况
 nvidia-smi
 
 # 停止训练
 kill $TRAIN_PID

🎯 预期改善:
 - 消除NaN梯度问题
 - 提高训练稳定性
 - 维持Task-GCA架构优势

══════════════════════════════════════════════════════════
🚀 Phase 4A Stage 1 - 数值稳定性修复版启动完成!
══════════════════════════════════════════════════════════
DONE