bev-project/archive/scripts_old/START_OPTIMIZED_TRAINING.sh

106 lines
3.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# BEVFusion Phase 4A Stage 1 - FP16-optimized training launcher.
# GPU memory optimization: 29GB → 20GB
# Training speed-up: 33% (finishes in 6.5 days)

# Abort as soon as any unguarded command fails.
set -e

# Put the conda toolchain first on PATH.
export PATH=/opt/conda/bin:$PATH

# Shared-library search path: torch's bundled libs, conda libs, CUDA runtime.
# The inherited value is appended only when it is non-empty. Appending it
# unconditionally would leave a trailing ':' when it is empty/unset, and the
# dynamic loader interprets such a zero-length entry as the current directory.
export LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}

# Make the bevfusion checkout importable; same guard against an empty entry.
export PYTHONPATH=/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}
# All relative paths used later (config file, tools/train.py, log file)
# are resolved from the bevfusion checkout.
cd /workspace/bevfusion

# Print the run banner: the optimizations applied and their expected effect.
# One printf call emits every argument on its own line; an empty argument
# yields a blank line.
printf '%s\n' \
  "========================================================================" \
  "Phase 4A Stage 1: FP16优化训练 (8 GPUs)" \
  "========================================================================" \
  "优化方案:" \
  " ✓ FP16混合精度训练" \
  " ✓ Batch: 1→4/GPU (总batch 8→32)" \
  " ✓ Workers: 0→2/GPU" \
  " ✓ 学习率: 2e-5→4e-5" \
  "" \
  "预期效果:" \
  " 显存: 29GB → 20GB (节省9GB)" \
  " 速度: +33% (7.5h/epoch vs 11h)" \
  " 完成: 6.5天 (vs 9.5天)" \
  "========================================================================"
# Environment verification: report the PyTorch version, whether FP16 is
# usable, and the mmcv version.
printf '\n%s\n' "【环境检查】"

# Prints the installed PyTorch version.
python -c "import torch; print('✓ PyTorch:', torch.__version__)"

# Reports YES only when CUDA is available and the device's major compute
# capability is at least 7; otherwise NO.
python -c "import torch; print('✓ FP16支持:', 'YES' if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 7 else 'NO')"

# Importing nms_match from mmcv.ops must succeed before the mmcv version is
# printed; stop the script with status 1 if it does not.
if ! python -c "from mmcv.ops import nms_match; import mmcv; print('✓ mmcv:', mmcv.__version__)"; then
  exit 1
fi
# Check GPUs: read the GPU count reported by nvidia-smi (first output row).
GPU_COUNT=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -1)

# Normalise the result to a plain non-negative integer. If nvidia-smi is
# missing or prints something non-numeric, the raw value would make the
# numeric comparison below error out, and the mismatch warning would be
# silently skipped. Treat such cases as "0 GPUs detected".
GPU_COUNT=${GPU_COUNT//[[:space:]]/}
if ! [[ "$GPU_COUNT" =~ ^[0-9]+$ ]]; then
  GPU_COUNT=0
fi

echo "✓ GPU数量: $GPU_COUNT"

# The launch command is hard-wired to 8 processes, so warn (without aborting)
# when the detected count differs.
if [ "$GPU_COUNT" -ne 8 ]; then
  echo "⚠️ 警告: 检测到${GPU_COUNT}张GPU配置为8卡训练"
fi
# Make sure the epoch-1 checkpoint exists as a regular file; abort with
# status 1 when it is missing.
[[ -f "/data/runs/phase4a_stage1/epoch_1.pth" ]] || {
  echo "❌ 找不到 /data/runs/phase4a_stage1/epoch_1.pth"
  exit 1
}

# Checkpoint found: confirm, then leave a blank separator line.
printf '%s\n\n' "✓ epoch_1.pth已就绪"
# Training configuration file (path relative to the current directory).
CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1_fp16_batch4.yaml"

# Per-run log file name, made unique with a YYYYmmdd_HHMMSS timestamp.
run_stamp=$(date +%Y%m%d_%H%M%S)
LOG_FILE="phase4a_stage1_fp16_batch4_${run_stamp}.log"

# Show the effective settings, followed by a blank line.
printf '%s\n' \
  "【训练配置】" \
  " 配置文件: $CONFIG" \
  " 日志文件: $LOG_FILE" \
  " 输出目录: /data/runs/phase4a_stage1_fp16_batch4" \
  ""
# Ask for a single-keystroke confirmation; the answer lands in $REPLY.
read -r -n 1 -p "是否开始训练? (y/n) "
# Finish the prompt line, since read -n 1 returns without a newline.
echo

# Only a lone 'y' or 'Y' continues; anything else cancels with status 0.
case "$REPLY" in
  [Yy]) ;;
  *)
    echo "已取消"
    exit 0
    ;;
esac
echo ""
echo "========================================================================"
echo "开始FP16优化训练..."
echo "========================================================================"
echo ""

# Launch training: torchpack starts 8 processes running tools/train.py with
# the selected config, the camera backbone's pretrained checkpoint and the
# epoch-1 weights given to --load_from. stdout and stderr are merged and
# duplicated into $LOG_FILE by tee.
#
# The trainer's own exit status is taken from PIPESTATUS[0]. A plain $? after
# the pipeline is tee's status, which is normally 0 even when training
# crashed, so failures would be reported as success. PIPESTATUS must be read
# by the very next command after the pipeline.
LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
PATH=/opt/conda/bin:$PATH \
PYTHONPATH=/workspace/bevfusion:$PYTHONPATH \
/opt/conda/bin/torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py \
  "${CONFIG}" \
  --model.encoders.camera.backbone.init_cfg.checkpoint /data/pretrained/swint-nuimages-pretrained.pth \
  --load_from /data/runs/phase4a_stage1/epoch_1.pth \
  2>&1 | tee "$LOG_FILE"
TRAIN_EXIT_CODE=${PIPESTATUS[0]}
# Final summary: report the outcome recorded in TRAIN_EXIT_CODE, point at the
# log file and checkpoint directory, then exit with that same code.
printf '%s\n' \
  "" \
  "========================================================================"

if [ "$TRAIN_EXIT_CODE" -eq 0 ]; then
  # Success: announce completion and list per-GPU memory usage.
  printf '%s\n' "✅ 训练完成!" "" "检查显存占用:"
  nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv
  printf '\n'
else
  # Failure: surface the recorded exit code.
  printf '%s\n' "❌ 训练异常退出 (exit code: $TRAIN_EXIT_CODE)"
fi

printf '%s\n' \
  "========================================================================" \
  "" \
  "日志文件: $LOG_FILE" \
  "Checkpoints: /data/runs/phase4a_stage1_fp16_batch4/" \
  ""

exit "$TRAIN_EXIT_CODE"