#!/bin/bash
# Phase 4A Stage 1 - resume stable FP32 training.
# Reuses the configuration that succeeded previously.

# Stop at the first command that exits non-zero.
set -e

# Work from the project root.
cd /workspace/bevfusion

# Startup banner: one printf call, one argument per output line.
printf '%s\n' \
  "==========================================================================" \
  "Phase 4A Stage 1: 恢复FP32稳定训练" \
  "==========================================================================" \
  "配置: 8×V100S, 600×600 BEV, FP32精度, Batch=1/GPU" \
  "起点: epoch_1.pth" \
  "" \
  "配置特点:" \
  " ✓ 双任务: 3D检测 + BEV分割" \
  " ✓ 分割头: EnhancedBEVSegmentationHead (4层Decoder)" \
  " ✓ Deep Supervision + Dice Loss" \
  " ✓ 已验证稳定有效" \
  "" \
  "预计时间: ~9天完成10 epochs" \
  "=========================================================================="

# Environment variables.
# Prepend the conda binaries, the PyTorch/conda/CUDA shared-library
# directories and the project root to the corresponding search paths.
# ${VAR:+:${VAR}} appends the previous value only when it is non-empty, so an
# unset variable no longer leaves a trailing ":" behind. For LD_LIBRARY_PATH
# such an empty entry makes the dynamic loader search the current directory.
# NOTE(review): an empty PYTHONPATH entry is presumably treated the same way
# by Python - confirm.
export PATH="/opt/conda/bin:${PATH}"
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:${PYTHONPATH}}"

# Environment check.
# Print the version of a Python module as seen by the conda interpreter.
#   $1 - label shown in front of the version
#   $2 - name of the module to import
report_module_version() {
  /opt/conda/bin/python -c "import $2; print('✓ $1:', $2.__version__)"
}

printf '%s\n' "" "【环境检查】"
report_module_version "PyTorch" torch
report_module_version "mmcv" mmcv

# nvidia-smi -L prints one line per GPU, so the line count is the GPU count.
gpu_count=$(nvidia-smi -L | wc -l)
echo "✓ GPU数量: ${gpu_count}"

# Checkpoint check.
# Ensure a required file exists. When it does, print the label followed by the
# human-readable size reported by `ls -lh`; when it does not, print the error
# message and terminate the script with status 1.
#   $1 - path of the file to check
#   $2 - label printed in front of the size
#   $3 - message printed when the file is missing
check_required_file() {
  if [ ! -f "$1" ]; then
    echo "$3"
    exit 1
  fi
  # Field 5 of `ls -l` output is the file size.
  ls -lh "$1" | awk -v label="$2" '{print label, $5}'
}

echo ""
echo "【Checkpoint检查】"
check_required_file /data/runs/phase4a_stage1/epoch_1.pth \
  "✓ epoch_1.pth:" "❌ epoch_1.pth 不存在!"
check_required_file /data/pretrained/swint-nuimages-pretrained.pth \
  "✓ 预训练模型:" "❌ 预训练模型不存在!"

# Make sure the work directory exists.
mkdir -p /data/runs/phase4a_stage1

# Timestamped log file name (created in the current working directory).
LOG_FILE="phase4a_stage1_fp32_resume_$(date +%Y%m%d_%H%M%S).log"

echo ""
echo "【开始FP32训练】"
echo "配置文件: multitask_BEV2X_phase4a_stage1.yaml"
echo "日志文件: $LOG_FILE"
echo "输出目录: /data/runs/phase4a_stage1"
echo ""

# Launch FP32 training.
# The command is kept in an array so every argument stays a separate word
# without relying on backslash line continuations.
# NOTE(review): "-np 8" presumably matches the 8 GPUs of this run - confirm.
train_cmd=(
  torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1.yaml
  --model.encoders.camera.backbone.init_cfg.checkpoint /data/pretrained/swint-nuimages-pretrained.pth
  --load_from /data/runs/phase4a_stage1/epoch_1.pth
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
  --cfg-options work_dir=/data/runs/phase4a_stage1
)

# stdout and stderr are both shown on the terminal and saved to the log file.
"${train_cmd[@]}" 2>&1 | tee "$LOG_FILE"

# The pipeline's own exit status is tee's, so a crashed training run would
# otherwise look like a success. PIPESTATUS[0] holds the status of the training
# command itself; it must be read right after the pipeline.
train_status=${PIPESTATUS[0]}
if [ "$train_status" -ne 0 ]; then
  echo "❌ 训练失败 (exit code: $train_status), 日志: $LOG_FILE" >&2
  exit "$train_status"
fi

# Closing banner: print the log file name and the checkpoint directory.
printf '%s\n' \
  "" \
  "==========================================================================" \
  "训练结束!" \
  "日志: $LOG_FILE" \
  "Checkpoints: /data/runs/phase4a_stage1/" \
  "=========================================================================="