#!/bin/bash
#
# Phase 4A: BEV 2x-resolution training launcher (full environment setup).
#
# Steps:
#   1. Put the conda toolchain and the BEVFusion checkout on the search paths.
#   2. Verify that the base checkpoint and the training config exist.
#   3. Create the run directory.
#   4. Start a detached `torchpack dist-run` training job (stdout and stderr
#      go to LOG_FILE) and print monitoring hints.
#   5. After 3 seconds, check that the training process is still alive.
#
# Usage: run without arguments.
# Exit status: 0 if the training process is alive 3 seconds after launch;
#              non-zero if a precondition fails or the process already died.

set -euo pipefail

# Paths below (except WORKDIR) are relative to WORKDIR.
readonly WORKDIR="/workspace/bevfusion"
readonly BASE_CKPT="runs/enhanced_from_epoch19/epoch_23.pth"
readonly BACKBONE_CKPT="pretrained/swint-nuimages-pretrained.pth"
readonly CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml"
readonly RUN_DIR="runs/phase4a_bev2x"
readonly LOG_FILE="phase4a_bev2x.log"
readonly NUM_GPUS=6

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Environment variables.
# `${VAR:+:${VAR}}` appends the previous value only when it is non-empty, so
# an unset variable neither trips `set -u` nor leaves a trailing ':' behind
# (ld.so treats an empty LD_LIBRARY_PATH element as the current directory).
export PATH="/opt/conda/bin${PATH:+:${PATH}}"
export LD_LIBRARY_PATH="/opt/conda/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTHONPATH="${WORKDIR}${PYTHONPATH:+:${PYTHONPATH}}"

# Every relative path below depends on this; `set -e` aborts if it fails.
cd "$WORKDIR"

echo "========================================================================"
echo "Phase 4A: BEV 2x分辨率提升训练"
echo "========================================================================"
echo "配置: BEV 0.15m分辨率 (2倍提升)"
echo "Decoder: 4层完整版 [256, 256, 128, 128]"
echo "基础模型: epoch_23.pth (NDS 0.6941, mAP 0.6446)"
echo "目标: 20 epochs"
echo "========================================================================"
echo ""

# Check the base checkpoint.
[[ -f "$BASE_CKPT" ]] || die "❌ 错误: epoch_23.pth不存在!"
echo "✅ Checkpoint存在"

# Check the config file.
[[ -f "$CONFIG" ]] || die "❌ 错误: 配置文件不存在!"
echo "✅ 配置文件存在"

# Create the output directory.
mkdir -p -- "$RUN_DIR"
echo "✅ 输出目录创建"

echo ""
echo "训练配置:"
echo " - GPU数量: ${NUM_GPUS}"
echo " - Batch size: 1 per GPU"
echo " - Workers: 0"
echo " - 学习率: 2e-5"
echo " - Epochs: 20"
echo " - 预计时间: 12.5天"
echo " - 预计显存: ~28-29GB/GPU"
echo ""

echo "启动训练..."
echo "日志文件: $LOG_FILE"
echo ""

# Launch training detached from the terminal. The log file is truncated on
# every launch because the redirection uses '>'.
nohup torchpack dist-run -np "$NUM_GPUS" python tools/train.py \
  "$CONFIG" \
  --model.encoders.camera.backbone.init_cfg.checkpoint="$BACKBONE_CKPT" \
  --load_from "$BASE_CKPT" \
  --run-dir "$RUN_DIR" \
  > "$LOG_FILE" 2>&1 &

TRAIN_PID=$!

echo "✅ 训练已启动"
echo ""
echo "========================================================================"
echo "训练信息:"
echo " PID: $TRAIN_PID"
echo " 日志: $LOG_FILE"
echo " 输出: ${RUN_DIR}/"
echo ""
echo "监控命令:"
echo " 实时日志: tail -f $LOG_FILE"
echo " 训练进度: bash monitor_phase4a.sh"
echo " GPU状态: nvidia-smi"
echo ""
# NOTE(review): the completion date is a fixed string, not computed from the
# launch time - update it by hand when re-launching.
echo "预计完成: 2025年11月12日"
echo "========================================================================"
echo ""

# Wait 3 seconds, then check whether the training process survived start-up.
sleep 3
# `kill -0` sends no signal; it only tests that the PID exists. Being a shell
# builtin, it also works in images that do not ship `ps`.
if kill -0 "$TRAIN_PID" 2>/dev/null; then
  echo "✅ 训练进程运行中 (PID: $TRAIN_PID)"
else
  echo "⚠️ 训练进程未运行,请检查日志" >&2
  # Best effort: still report the failure below even if the log is unreadable.
  tail -n 20 -- "$LOG_FILE" >&2 || true
  exit 1
fi