#!/bin/bash
# BEVFusion Phase 4A Stage 1 - FP16-optimized training launcher.
#
# Goals stated by the original author:
#   - GPU memory optimization: 29GB -> 20GB
#   - Training speed-up: 33% (run finishes in about 6.5 days)
#
# The script verifies the environment, asks for confirmation, then starts an
# 8-process distributed training run and reports its exit status.

# Abort on the first unhandled command failure.
set -e

# Put the conda toolchain and the torch/CUDA shared libraries first on the
# search paths. The ${VAR:+:$VAR} form appends the previous value only when it
# is non-empty, so an unset variable does not leave a trailing ':' behind
# (a zero-length LD_LIBRARY_PATH entry is interpreted as the current directory).
export PATH=/opt/conda/bin:$PATH
export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
export PYTHONPATH=/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}

# All relative paths below (config file, tools/train.py, log file) are
# resolved from the project root.
cd /workspace/bevfusion

# Print the run summary banner: the optimizations applied and the expected
# effect. The quoted delimiter keeps the text literal (no expansion).
cat <<'EOF'
========================================================================
Phase 4A Stage 1: FP16优化训练 (8 GPUs)
========================================================================
优化方案:
 ✓ FP16混合精度训练
 ✓ Batch: 1→4/GPU (总batch 8→32)
 ✓ Workers: 0→2/GPU
 ✓ 学习率: 2e-5→4e-5

预期效果:
 显存: 29GB → 20GB (节省9GB)
 速度: +33% (7.5h/epoch vs 11h)
 完成: 6.5天 (vs 9.5天)
========================================================================
EOF

# Environment verification: abort early if the Python stack is not usable.
echo ""
echo "【环境检查】"

# Report the PyTorch version and whether a CUDA device with major compute
# capability >= 7 is visible (the script labels this "FP16 support").
# Both probes share one interpreter start; a failed import aborts the script.
python -c "import torch
print('✓ PyTorch:', torch.__version__)
fp16_ok = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 7
print('✓ FP16支持:', 'YES' if fp16_ok else 'NO')" || exit 1

# Importing an op from mmcv.ops (not just mmcv) — presumably to confirm the
# compiled extension loads, not only the pure-Python package.
python -c "from mmcv.ops import nms_match; import mmcv; print('✓ mmcv:', mmcv.__version__)" || exit 1

# Check the GPU count. nvidia-smi prints the total count once per GPU, so only
# the first line is kept. A failure of the query leaves GPU_COUNT empty
# instead of aborting: the mismatch below is a warning, not a fatal error.
GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -n 1) || GPU_COUNT=""
echo "✓ GPU数量: $GPU_COUNT"

# Warn when the count is not exactly 8. The numeric test runs first so that an
# empty or non-numeric value (nvidia-smi missing or failing) also triggers the
# warning rather than an "integer expression expected" error that would
# silently skip it.
if [[ ! "$GPU_COUNT" =~ ^[0-9]+$ ]] || [ "$GPU_COUNT" -ne 8 ]; then
  echo "⚠️ 警告: 检测到${GPU_COUNT:-0}张GPU,配置为8卡训练"
fi

# Confirm that the checkpoint later passed to --load_from exists before
# asking the user to start a multi-day run.
resume_ckpt="/data/runs/phase4a_stage1/epoch_1.pth"
if [ ! -f "$resume_ckpt" ]; then
  # Diagnostics go to stderr so they are not lost when stdout is redirected.
  echo "❌ 找不到 $resume_ckpt" >&2
  exit 1
fi
echo "✓ epoch_1.pth已就绪"
echo ""

# Training configuration (path is relative to the project root) and a
# timestamped log file name so repeated runs never overwrite each other's log.
CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_fp16_batch4.yaml"
LOG_FILE="phase4a_stage1_fp16_batch4_$(date +%Y%m%d_%H%M%S).log"

# Show the effective settings before asking for confirmation.
echo "【训练配置】"
printf ' 配置文件: %s\n' "$CONFIG"
printf ' 日志文件: %s\n' "$LOG_FILE"
echo " 输出目录: /data/runs/phase4a_stage1_fp16_batch4"
echo ""

# Ask for a single-key confirmation. `read` returns non-zero on EOF (for
# example when stdin is not a terminal); with errexit active that would kill
# the script silently, so the failure is tolerated and handled as "not
# confirmed" through the empty REPLY.
REPLY=""
read -p "是否开始训练? (y/n) " -n 1 -r || true
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
  echo "已取消"
  exit 0
fi

echo ""
echo "========================================================================"
echo "开始FP16优化训练..."
echo "========================================================================"
echo ""

# Launch training: 8 processes via torchpack, overriding the camera backbone
# pretrained checkpoint and loading weights from the phase4a stage1 epoch-1
# checkpoint. Arguments are held in an array so each one stays a single word.
train_cmd=(
  /opt/conda/bin/torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py
  "${CONFIG}"
  --model.encoders.camera.backbone.init_cfg.checkpoint /data/pretrained/swint-nuimages-pretrained.pth
  --load_from /data/runs/phase4a_stage1/epoch_1.pth
)

# errexit is suspended around the pipeline so that a failure of either stage
# cannot abort the script before the training status has been recorded.
set +e
# The per-command environment repeats the search paths exported at the top of
# the script, keeping the launch self-contained.
LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} \
PATH=/opt/conda/bin:$PATH \
PYTHONPATH=/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH} \
  "${train_cmd[@]}" 2>&1 | tee "$LOG_FILE"
# $? would be tee's status (almost always 0); PIPESTATUS[0] is the status of
# the training command itself. It must be read immediately after the pipeline.
TRAIN_EXIT_CODE=${PIPESTATUS[0]}
set -e

# Final report: outcome, GPU memory snapshot on success, artifact locations.
echo ""
echo "========================================================================"
if [ "$TRAIN_EXIT_CODE" -eq 0 ]; then
  echo "✅ 训练完成!"
  echo ""
  echo "检查显存占用:"
  # Best-effort diagnostic: a failing nvidia-smi must not turn a successful
  # training run into a non-zero script exit under errexit.
  nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv || true
  echo ""
else
  echo "❌ 训练异常退出 (exit code: $TRAIN_EXIT_CODE)"
fi
echo "========================================================================"
echo ""
echo "日志文件: $LOG_FILE"
echo "Checkpoints: /data/runs/phase4a_stage1_fp16_batch4/"
echo ""

# Propagate the training status to the caller.
exit "$TRAIN_EXIT_CODE"