bev-project/archive/scripts_old/RESTART_FP32_STABLE.sh

83 lines
2.8 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4A Stage 1 - resume stable FP32 training.
# Reuses the configuration that previously ran successfully.
set -e
cd /workspace/bevfusion
# Startup banner: summarizes the run configuration for the operator.
# The heredoc delimiter is quoted so the text is emitted literally.
cat <<'BANNER'
==========================================================================
Phase 4A Stage 1: 恢复FP32稳定训练
==========================================================================
配置: 8×V100S, 600×600 BEV, FP32精度, Batch=1/GPU
起点: epoch_1.pth

配置特点:
 ✓ 双任务: 3D检测 + BEV分割
 ✓ 分割头: EnhancedBEVSegmentationHead (4层Decoder)
 ✓ Deep Supervision + Dice Loss
 ✓ 已验证稳定有效

预计时间: ~9天完成10 epochs
==========================================================================
BANNER
# Environment variables: put the conda toolchain, the torch/CUDA shared
# libraries and the project root in front of whatever is already set.
PATH="/opt/conda/bin:${PATH}"
LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
PYTHONPATH="/workspace/bevfusion:${PYTHONPATH}"
export PATH LD_LIBRARY_PATH PYTHONPATH
# Environment check: print the versions of the Python packages the training
# imports and the number of GPUs listed by nvidia-smi.
printf '\n%s\n' "【环境检查】"
# Each entry is "<module to import>|<label to print>".
for probe in "torch|PyTorch" "mmcv|mmcv"; do
  module=${probe%%|*}
  label=${probe##*|}
  /opt/conda/bin/python -c "import ${module}; print('✓ ${label}:', ${module}.__version__)"
done
# nvidia-smi -L prints one line per GPU, so the line count is the GPU count.
gpu_count=$(nvidia-smi -L | wc -l)
printf '✓ GPU数量: %d\n' "$gpu_count"
# Checkpoint check: both the starting checkpoint and the pretrained camera
# backbone weights must exist before training is launched.
echo ""
echo "【Checkpoint检查】"

#######################################
# Abort the script unless a required file exists; otherwise print its size.
# Arguments:
#   $1 - path of the file to check
#   $2 - label printed in front of the human-readable size on success
#   $3 - message printed when the file is missing
# Outputs:
#   Writes the size line or the missing-file message to stdout.
# Returns:
#   Exits the whole script with status 1 when the file is missing.
#######################################
require_checkpoint() {
  local ckpt_path=$1
  local ok_label=$2
  local missing_msg=$3
  if [ ! -f "$ckpt_path" ]; then
    echo "$missing_msg"
    exit 1
  fi
  # Field 5 of `ls -lh` is the human-readable file size.
  ls -lh "$ckpt_path" | awk -v tag="$ok_label" '{print tag, $5}'
}

require_checkpoint "/data/runs/phase4a_stage1/epoch_1.pth" \
  "✓ epoch_1.pth:" "❌ epoch_1.pth 不存在!"
require_checkpoint "/data/pretrained/swint-nuimages-pretrained.pth" \
  "✓ 预训练模型:" "❌ 预训练模型不存在!"
# Make sure the work_dir (output directory) exists.
mkdir -p /data/runs/phase4a_stage1
# Build a timestamped log file name. It is a bare file name, so the log is
# created relative to the current working directory.
run_stamp=$(date +%Y%m%d_%H%M%S)
LOG_FILE="phase4a_stage1_fp32_resume_${run_stamp}.log"
# Tell the operator which config, log file and output directory are in use.
printf '%s\n' \
  "" \
  "【开始FP32训练】" \
  "配置文件: multitask_BEV2X_phase4a_stage1.yaml" \
  "日志文件: $LOG_FILE" \
  "输出目录: /data/runs/phase4a_stage1" \
  ""
# Launch FP32 training through `torchpack dist-run -np 8`.
#   - Config: the Phase 4A Stage 1 multitask YAML.
#   - The camera backbone init checkpoint is overridden to the pretrained
#     SwinT weights under /data/pretrained.
#   - Weights are loaded from epoch_1.pth via --load_from.
#     NOTE(review): --load_from is used rather than a resume-style flag;
#     whether optimizer/epoch state is restored depends on tools/train.py —
#     confirm this matches the intended "resume" semantics.
#   - samples_per_gpu=1, workers_per_gpu=0, work_dir=/data/runs/phase4a_stage1.
# stdout and stderr are merged and mirrored to both the terminal and
# $LOG_FILE by tee.
#
# A pipeline's exit status is normally that of its LAST command (tee), so a
# crashed training run would look like a success and the script would go on
# to report completion and exit 0. Enable pipefail and capture the status
# explicitly so a training failure is reported and propagated to the caller.
set -o pipefail
train_status=0
torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py \
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1.yaml \
  --model.encoders.camera.backbone.init_cfg.checkpoint /data/pretrained/swint-nuimages-pretrained.pth \
  --load_from /data/runs/phase4a_stage1/epoch_1.pth \
  --data.samples_per_gpu 1 \
  --data.workers_per_gpu 0 \
  --cfg-options work_dir=/data/runs/phase4a_stage1 \
  2>&1 | tee "$LOG_FILE" || train_status=$?
if [ "$train_status" -ne 0 ]; then
  # Diagnostics go to stderr; the exit code mirrors the failing pipeline stage.
  echo "❌ 训练失败 (退出码: ${train_status}), 日志: $LOG_FILE" >&2
  exit "$train_status"
fi
# Closing banner: report where the log and checkpoints were written.
# The heredoc delimiter is unquoted on purpose so $LOG_FILE is expanded.
cat <<SUMMARY

==========================================================================
训练结束!
日志: $LOG_FILE
Checkpoints: /data/runs/phase4a_stage1/
==========================================================================
SUMMARY