#!/bin/bash
# Phase 4A Stage 1 - launch script for task-specific GCA training
# (background-run version).
# Date: 2025-11-06
#
# Core idea: task-specific GCA
#   - Detection GCA: picks the detection-optimal features out of the 512 channels
#   - Segmentation GCA: picks the segmentation-optimal features out of the 512 channels
#   - Each task takes what it needs instead of sharing one compromise selection

# Abort on the first unhandled command failure.
set -e

# Start-up banner. The quoted delimiter keeps the text fully literal.
cat <<'EOF'
══════════════════════════════════════════════════════════
Phase 4A Stage 1 - Task-specific GCA优化版 (后台运行)
══════════════════════════════════════════════════════════

✨ 架构亮点:
 Decoder Neck → 原始BEV (512通道,完整信息)
 ├─ 检测GCA → 检测最优BEV → 检测头 ✅
 └─ 分割GCA → 分割最优BEV → 分割头 ✅

 优势: 每个任务根据自己需求选择特征
 vs Shared GCA: 避免统一选择的折中问题
══════════════════════════════════════════════════════════
EOF

# --- 1. Environment setup ---
echo ""
echo "=== 1. 环境配置 (参考Phase 3成功经验) ==="
if [ ! -d "/workspace/bevfusion" ]; then
  echo "❌ 错误: /workspace/bevfusion 不存在"
  exit 1
fi

# Everything below (tools/train.py, configs/...) is addressed relative to the
# repository root, so a failed cd must stop the script.
cd /workspace/bevfusion || {
  echo "❌ 错误: 无法进入 /workspace/bevfusion"
  exit 1
}

# Key step: environment variables (the values validated in Phase 3).
# The ${VAR:+:$VAR} form appends the previous value only when it is non-empty.
# A plain "...:$VAR" leaves a trailing ':' when VAR is unset, and an empty
# LD_LIBRARY_PATH entry makes the dynamic loader search the current directory.
export PATH="/opt/conda/bin:$PATH"
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64

echo "✅ 环境变量已设置:"
echo " PATH: /opt/conda/bin:..."
echo " LD_LIBRARY_PATH: 已设置"
echo " PYTHONPATH: /workspace/bevfusion"
# The export above is commented out, so this variable is normally unset;
# show an explicit placeholder instead of an empty value.
echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF:-<未设置>}"

# --- 2. Environment verification ---
# Each check stops the script with a message when a required component is
# missing.
echo ""
echo "=== 2. 环境验证 ==="
if ! /opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)"; then
  echo "❌ PyTorch导入失败"
  exit 1
fi
if ! /opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)"; then
  echo "❌ mmcv导入失败"
  exit 1
fi
# command -v is the shell builtin lookup; resolve the path once and reuse it
# for the report line.
if ! TORCHPACK_BIN=$(command -v torchpack); then
  echo "❌ torchpack未找到"
  exit 1
fi
echo "✅ torchpack: $TORCHPACK_BIN"

# --- 3. Disk space check ---

# Print the free space, in whole GiB, of the filesystem holding path $1.
# Prints nothing when df cannot report on the path.
# -P forces the POSIX one-line-per-filesystem layout (long device names are
# not wrapped), -k fixes the unit to 1024-byte blocks; column 4 is "Available".
available_gb() {
  df -Pk "$1" 2>/dev/null | awk 'NR == 2 { print int($4 / 1024 / 1024) }'
}

echo ""
echo "=== 3. 磁盘空间检查 ==="
AVAIL_GB=$(available_gb /workspace)

if [[ "$AVAIL_GB" =~ ^[0-9]+$ ]]; then
  echo "可用空间: ${AVAIL_GB}GB"
  # Advisory only: low disk space warns but does not stop the launch.
  if (( AVAIL_GB < 30 )); then
    echo "⚠️ 警告: 磁盘空间不足30GB"
    echo "建议清理后再继续"
  fi
else
  echo "⚠️ 警告: 无法获取 /workspace 的可用磁盘空间"
fi

# --- 4. Checkpoint check ---
# NOTE(review): LATEST_CKPT is only verified and listed here; the training
# command is given LOAD_CKPT (checked in step 5), not this file. Confirm that
# this hard requirement is still intended.
echo ""
echo "=== 4. Checkpoint检查 ==="
LATEST_CKPT="/workspace/bevfusion/runs/run-326653dc-2334d461/epoch_5.pth"
if [ ! -f "$LATEST_CKPT" ]; then
  echo "❌ 错误: 未找到 epoch_5.pth"
  exit 1
fi
echo "✅ 使用checkpoint: $LATEST_CKPT"
ls -lh "$LATEST_CKPT"

# --- 5. Checkpoint whose weights are loaded ---
# LOAD_CKPT is what the training command receives via --load_from.
echo ""
echo "=== 5. Resume Checkpoint ==="
LOAD_CKPT="/workspace/bevfusion/runs/run-326653dc-b7d0a4a4/epoch_8.pth"
if [ ! -f "$LOAD_CKPT" ]; then
  echo "❌ 错误: 未找到 $LOAD_CKPT"
  exit 1
fi
echo "✅ 将从 $LOAD_CKPT 加载权重"
ls -lh "$LOAD_CKPT"

# --- 6. Configuration summary ---
# Purely informational static text; nothing here is read from the YAML config.
# NOTE(review): the epoch numbers below are hard-coded, while the weights that
# are actually loaded come from LOAD_CKPT (an epoch_8.pth file) — confirm the
# "start epoch 5" wording is still accurate.
cat <<'EOF'

=== 6. Task-specific GCA配置摘要 ===
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
配置文件: multitask_BEV2X_phase4a_stage1_task_gca.yaml
输出目录: /data/runs/phase4a_stage1_task_gca
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

起始epoch: 5
目标epoch: 20
剩余epochs: 15

✨ Task-specific GCA配置:
 检测GCA (object):
 - in_channels: 512
 - reduction: 4
 - 作用: 选择对检测最优的特征
 - 强化: 物体边界、中心点、空间关系

 分割GCA (map):
 - in_channels: 512
 - reduction: 4
 - 作用: 选择对分割最优的特征
 - 强化: 语义纹理、连续性、全局语义

 总参数增加: 262,144 (0.26M)
 计算增加: ~1.6ms (0.06%)

📊 Evaluation优化:
 - Validation样本: 3,010 (减少50%)
 - Evaluation频率: 每10 epochs
 - 总开销: 减少75%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
EOF

# --- 7. GPU status ---
# Lists index, name, total and free memory for every GPU. A failing nvidia-smi
# stops the launch with an explicit message rather than a silent `set -e` abort.
echo ""
echo "=== 7. GPU状态 ==="
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv,noheader || {
  echo "❌ 错误: nvidia-smi 执行失败"
  exit 1
}

# --- 8. Remove stale evaluation caches ---

# Delete every directory named ".eval_hook" found under $1.
# Sets the global EVAL_HOOK_COUNT to the number of directories found.
# Best effort: a missing root or a failed delete is reported, never fatal.
clean_eval_hooks() {
  local root=$1
  local dir
  local -a hook_dirs=()

  # One NUL-delimited scan collects the targets. -prune stops find from
  # descending into a directory that is about to be deleted.
  while IFS= read -r -d '' dir; do
    hook_dirs+=("$dir")
  done < <(find "$root" -name ".eval_hook" -type d -prune -print0 2>/dev/null)

  EVAL_HOOK_COUNT=${#hook_dirs[@]}
  if (( EVAL_HOOK_COUNT > 0 )); then
    echo "发现 $EVAL_HOOK_COUNT 个.eval_hook目录,正在删除..."
    if rm -rf -- "${hook_dirs[@]}"; then
      echo "✅ 已清理"
    else
      echo "⚠️ 警告: 部分.eval_hook目录删除失败"
    fi
  else
    echo "✅ 无需清理"
  fi
}

echo ""
echo "=== 8. 清理旧缓存 ==="
clean_eval_hooks /workspace/bevfusion/runs

# --- 9. Prepare the log directory ---
# The log file name carries a launch timestamp, so each run gets its own log.
echo ""
echo "=== 9. 准备日志目录 ==="
LOG_DIR="/data/runs/phase4a_stage1_task_gca"
mkdir -p "$LOG_DIR" || {
  echo "❌ 错误: 无法创建日志目录 $LOG_DIR"
  exit 1
}
LOG_FILE="${LOG_DIR}/train_$(date +%Y%m%d_%H%M%S).log"
echo "日志文件: $LOG_FILE"

# --- 10. Launch training in the background ---
# Reads LOAD_CKPT (step 5) and LOG_FILE (step 9); sets TRAIN_PID.
echo ""
echo "=== 10. 启动后台训练 ==="
# Unquoted delimiter: $(date), $LOAD_CKPT and $LOG_FILE are expanded.
cat <<EOF
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
开始时间: $(date)
配置文件: multitask_BEV2X_phase4a_stage1_task_gca.yaml
Checkpoint: $LOAD_CKPT
日志文件: $LOG_FILE

训练周期: Epoch 6-20 (15 epochs)
预计时间: ~7天 (FP32)
预计完成: 2025-11-13

预期性能提升:
 检测: mAP 0.68 → 0.70 (+2.9%)
 分割: mIoU 0.55 → 0.61 (+10%)
 Divider: Dice Loss 0.52 → 0.42 (-19%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

EOF

# nohup plus '&' detaches the job from this shell; stdout and stderr both go
# to the log file. The relative paths rely on the cd to the repository root.
nohup torchpack dist-run \
  -np 8 \
  /opt/conda/bin/python tools/train.py \
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_task_gca.yaml \
  --load_from "$LOAD_CKPT" \
  --data.samples_per_gpu 1 \
  --data.workers_per_gpu 0 \
  > "$LOG_FILE" 2>&1 &

# PID of the background job started just above.
TRAIN_PID=$!

# Print the launch summary and copy-paste monitoring commands.
# Reads TRAIN_PID and LOG_FILE; the unquoted delimiter expands them in place.
# Hint 3 uses `ps -p` because `ps aux | grep <pid>` also matches the grep
# process itself and any unrelated line that happens to contain the digits.
cat <<EOF

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $LOG_FILE

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $LOG_FILE

2. 查看关键指标:
 tail -f $LOG_FILE | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps -p $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

输出位置:
 Checkpoints: /data/runs/phase4a_stage1_task_gca/epoch_*.pth
 日志: $LOG_FILE

══════════════════════════════════════════════════════════

EOF

# Give the launcher a few seconds so an immediate start-up failure is caught.
sleep 5

# kill -0 sends no signal; it only tests whether the PID can still be signalled.
if kill -0 "$TRAIN_PID" 2>/dev/null; then
  echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"
  echo ""
  echo "最新日志 (前50行):"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  head -n 50 "$LOG_FILE" 2>/dev/null || echo "日志文件尚未生成,请稍候..."
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
else
  echo "❌ 训练进程可能已退出,请查看日志:"
  echo " cat $LOG_FILE"
  # Exit non-zero so callers do not mistake a dead launch for a running job;
  # the success line below is printed only when the process is alive.
  exit 1
fi

echo ""
echo "🎉 后台训练已启动!请使用上述监控命令查看进度。"