bev-project/archive/scripts_old/START_PHASE4A_TASK_GCA_BACK...

244 lines
9.0 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4A Stage 1 - launch script for Task-specific GCA training
# (background-run version)
# Date: 2025-11-06
#
# Core idea: task-specific GCA
# - Detection GCA: selects the detection-optimal features from the 512 channels
# - Segmentation GCA: selects the segmentation-optimal features from the 512 channels
# - Each task takes what it needs, avoiding a compromise

# Abort the launch as soon as any unguarded command fails.
set -e

# Intro banner. The delimiter is quoted so the text is emitted verbatim,
# with no parameter or command expansion.
cat <<'BANNER'
══════════════════════════════════════════════════════════
Phase 4A Stage 1 - Task-specific GCA优化版 (后台运行)
══════════════════════════════════════════════════════════

✨ 架构亮点:
 Decoder Neck → 原始BEV (512通道完整信息)
 ├─ 检测GCA → 检测最优BEV → 检测头 ✅
 └─ 分割GCA → 分割最优BEV → 分割头 ✅

 优势: 每个任务根据自己需求选择特征
 vs Shared GCA: 避免统一选择的折中问题
══════════════════════════════════════════════════════════
BANNER
# Environment check: the BEVFusion checkout must exist; every later step
# runs from inside it.
BEVFUSION_HOME=/workspace/bevfusion
printf '\n'
printf '%s\n' "=== 1. 环境配置 (参考Phase 3成功经验) ==="
[ -d "$BEVFUSION_HOME" ] || {
  printf '%s\n' "❌ 错误: $BEVFUSION_HOME 不存在"
  exit 1
}
cd "$BEVFUSION_HOME"
# Key step: set the environment variables (the original note says this setup
# was validated in Phase 3).
export PATH=/opt/conda/bin:$PATH
# Append the inherited value only when it is non-empty. A plain "prefix:$VAR"
# leaves a trailing ':' when VAR is unset, i.e. an empty search entry, which
# the dynamic loader interprets as the current working directory.
export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export PYTHONPATH=/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}
# Allocator tuning is intentionally left disabled; the summary below therefore
# prints whatever value (possibly none) was inherited from the caller.
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64
echo "✅ 环境变量已设置:"
echo " PATH: /opt/conda/bin:..."
echo " LD_LIBRARY_PATH: 已设置"
echo " PYTHONPATH: /workspace/bevfusion"
# ':-' keeps this line safe even if the script is ever run with `set -u`.
echo " PYTORCH_CUDA_ALLOC_CONF: ${PYTORCH_CUDA_ALLOC_CONF:-}"
# Verify the runtime: the conda Python must be able to import torch and mmcv,
# and the torchpack launcher must be resolvable on PATH. Any failure aborts.
echo ""
echo "=== 2. 环境验证 ==="
/opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)" || {
  echo "❌ PyTorch导入失败"
  exit 1
}
/opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" || {
  echo "❌ mmcv导入失败"
  exit 1
}
# `command -v` is a shell builtin, so the lookup does not depend on the
# external `which` program being installed. The path is resolved once and
# reused instead of being printed twice.
TORCHPACK_BIN=$(command -v torchpack) || {
  echo "❌ torchpack未找到"
  exit 1
}
echo "✅ torchpack: $TORCHPACK_BIN"
# Disk space check (advisory only: a low-space result warns but never aborts).
echo ""
echo "=== 3. 磁盘空间检查 ==="
# -P forces the POSIX layout (exactly one line per file system) and -k forces
# 1 KiB units, so column 4 of the second line is always "available KiB"
# regardless of environment block-size settings.
# Best effort: a failed probe is handled by the non-numeric branch below.
AVAIL_KB=$(df -Pk /workspace 2>/dev/null | awk 'NR == 2 { print $4 }') || true
case "$AVAIL_KB" in
  '' | *[!0-9]*)
    # df produced nothing usable (e.g. /workspace is missing); report that
    # instead of feeding an empty string to a numeric test.
    AVAIL_GB=0
    echo "⚠️ 警告: 无法获取 /workspace 的可用空间"
    ;;
  *)
    AVAIL_GB=$((AVAIL_KB / 1024 / 1024))
    echo "可用空间: ${AVAIL_GB}GB"
    if [ "$AVAIL_GB" -lt 30 ]; then
      echo "⚠️ 警告: 磁盘空间不足30GB"
      echo "建议清理后再继续"
    fi
    ;;
esac
# Checkpoint check: abort unless the epoch_5 checkpoint file is present.
# NOTE(review): within this script LATEST_CKPT is only validated and listed
# here; the training command is given LOAD_CKPT instead. Confirm this check
# (and the "using checkpoint" message) is still intended.
printf '\n'
printf '%s\n' "=== 4. Checkpoint检查 ==="
LATEST_CKPT="/workspace/bevfusion/runs/run-326653dc-2334d461/epoch_5.pth"
[ -f "$LATEST_CKPT" ] || {
  printf '%s\n' "❌ 错误: 未找到 epoch_5.pth"
  exit 1
}
printf '%s\n' "✅ 使用checkpoint: $LATEST_CKPT"
ls -lh "$LATEST_CKPT"
# Checkpoint whose weights the training command loads (passed via --load_from
# further down). Abort if it is missing.
printf '\n'
printf '%s\n' "=== 5. Resume Checkpoint ==="
LOAD_CKPT="/workspace/bevfusion/runs/run-326653dc-b7d0a4a4/epoch_8.pth"
if [ -f "$LOAD_CKPT" ]; then
  printf '%s\n' "✅ 将从 $LOAD_CKPT 加载权重"
  ls -lh "$LOAD_CKPT"
else
  printf '%s\n' "❌ 错误: 未找到 $LOAD_CKPT"
  exit 1
fi
# Print the configuration summary. This is static, informational text only;
# none of the values below are read from the config file or the checkpoint.
# NOTE(review): the summary states a start epoch of 5 while the weights loaded
# by this script come from an epoch_8 file; confirm the numbers are current.
cat <<'SUMMARY'

=== 6. Task-specific GCA配置摘要 ===
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
配置文件: multitask_BEV2X_phase4a_stage1_task_gca.yaml
输出目录: /data/runs/phase4a_stage1_task_gca
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

起始epoch: 5
目标epoch: 20
剩余epochs: 15

✨ Task-specific GCA配置:
 检测GCA (object):
 - in_channels: 512
 - reduction: 4
 - 作用: 选择对检测最优的特征
 - 强化: 物体边界、中心点、空间关系

 分割GCA (map):
 - in_channels: 512
 - reduction: 4
 - 作用: 选择对分割最优的特征
 - 强化: 语义纹理、连续性、全局语义

 总参数增加: 262,144 (0.26M)
 计算增加: ~1.6ms (0.06%)

📊 Evaluation优化:
 - Validation样本: 3,010 (减少50%)
 - Evaluation频率: 每10 epochs
 - 总开销: 减少75%
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
SUMMARY
# GPU check: list index, name, total memory and free memory for each GPU as
# header-less CSV.
printf '\n'
printf '%s\n' "=== 7. GPU状态 ==="
gpu_query="index,name,memory.total,memory.free"
nvidia-smi --query-gpu="$gpu_query" --format=csv,noheader
# Clean up stale evaluation caches.

#######################################
# Delete every directory named ".eval_hook" beneath a root directory.
# Arguments: $1 - root directory to scan
# Outputs:   progress messages on stdout; a warning on stderr if a delete fails
# Returns:   always 0 (cleanup is best effort and must not block the launch)
#######################################
remove_eval_hook_dirs() {
  local root=$1
  local count
  # -prune stops find from descending into a matched directory, so it never
  # tries to walk a tree that rm has just removed.
  count=$(find "$root" -type d -name ".eval_hook" -prune -print 2>/dev/null | wc -l)
  if [ "$count" -gt 0 ]; then
    echo "发现 $count 个.eval_hook目录正在删除..."
    # '+' batches the paths into as few rm invocations as possible; '--'
    # guards against paths that begin with '-'.
    if find "$root" -type d -name ".eval_hook" -prune -exec rm -rf -- {} +; then
      echo "✅ 已清理"
    else
      echo "⚠️ 警告: 部分.eval_hook目录删除失败" >&2
    fi
  else
    echo "✅ 无需清理"
  fi
  return 0
}

echo ""
echo "=== 8. 清理旧缓存 ==="
remove_eval_hook_dirs /workspace/bevfusion/runs
# Prepare the log directory and derive a timestamped log file name.
printf '\n'
printf '%s\n' "=== 9. 准备日志目录 ==="
LOG_DIR="/data/runs/phase4a_stage1_task_gca"
mkdir -p "$LOG_DIR"
# Timestamp format: YYYYmmdd_HHMMSS
launch_stamp=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$LOG_DIR/train_$launch_stamp.log"
printf '%s\n' "日志文件: $LOG_FILE"
# Announce the launch. The here-doc delimiter is unquoted on purpose so that
# $(date), $LOAD_CKPT and $LOG_FILE are expanded; all other text is static.
cat <<LAUNCH_INFO

=== 10. 启动后台训练 ===
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
开始时间: $(date)
配置文件: multitask_BEV2X_phase4a_stage1_task_gca.yaml
Checkpoint: $LOAD_CKPT
日志文件: $LOG_FILE

训练周期: Epoch 6-20 (15 epochs)
预计时间: ~7天 (FP32)
预计完成: 2025-11-13

预期性能提升:
 检测: mAP 0.68 → 0.70 (+2.9%)
 分割: mIoU 0.55 → 0.61 (+10%)
 Divider: Dice Loss 0.52 → 0.42 (-19%)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

LAUNCH_INFO
# Run the training in the background under nohup so it survives the launching
# shell. The command is assembled as an array, one argument per element.
train_cmd=(
  torchpack dist-run
  -np 8
  /opt/conda/bin/python tools/train.py
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_task_gca.yaml
  --load_from "$LOAD_CKPT"
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)
# stdout and stderr both go to the log file.
nohup "${train_cmd[@]}" > "$LOG_FILE" 2>&1 &
# Remember the PID of the background job for the status messages below.
TRAIN_PID=$!
# Print the PID, the log location and ready-to-copy monitoring commands.
# Unquoted delimiter: $TRAIN_PID and $LOG_FILE are expanded, the rest is
# literal text (the '*' and '|' characters are not interpreted in a here-doc).
cat <<MONITOR_HELP

══════════════════════════════════════════════════════════
✅ 训练已在后台启动!
══════════════════════════════════════════════════════════

进程ID: $TRAIN_PID
日志文件: $LOG_FILE

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
监控命令:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

1. 查看实时日志:
 tail -f $LOG_FILE

2. 查看关键指标:
 tail -f $LOG_FILE | grep -E 'Epoch|loss/map/divider|loss/object'

3. 检查进程状态:
 ps aux | grep $TRAIN_PID

4. GPU监控:
 nvidia-smi -l 5

5. 停止训练:
 kill $TRAIN_PID

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

输出位置:
 Checkpoints: /data/runs/phase4a_stage1_task_gca/epoch_*.pth
 日志: $LOG_FILE

══════════════════════════════════════════════════════════

MONITOR_HELP
# Post-launch health check.
# Wait a few seconds so the launcher has time to come up (or fail fast)
# before it is probed.
sleep 5
# Check whether the background process is still alive.
if ps -p "$TRAIN_PID" > /dev/null 2>&1; then
  echo "✅ 训练进程正在运行 (PID: $TRAIN_PID)"
  echo ""
  echo "最新日志 (前50行):"
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  # Best effort: the log file may not exist yet this early.
  head -n 50 "$LOG_FILE" 2>/dev/null || echo "日志文件尚未生成,请稍候..."
  echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
else
  echo "❌ 训练进程可能已退出,请查看日志:"
  echo " cat $LOG_FILE"
  # The process is already gone: stop here with a non-zero status so that
  # neither the success message below nor a zero exit code misreports the
  # launch to the operator or to a calling script.
  exit 1
fi
echo ""
echo "🎉 后台训练已启动!请使用上述监控命令查看进度。"