bev-project/archive_scripts/START_PHASE4A_FIXED.sh

82 lines
3.0 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4A: BEV 2x resolution-upgrade training launcher (environment-fix variant).
set -e

# Environment for the conda toolchain, the torch/CUDA shared libraries and the
# BEVFusion sources.
#
# Each pre-existing value is appended with ${VAR:+:$VAR} so that an unset or
# empty variable does not leave a trailing ':' behind. For LD_LIBRARY_PATH an
# empty entry is interpreted by the dynamic linker as the current working
# directory, which would make library lookup depend on where the script is run.
export PATH="/opt/conda/bin${PATH:+:$PATH}"
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#######################################
# Make sure the libtorch alias libraries exist in the torch lib directory.
#
# Each alias is handled independently and is only created when nothing exists
# at that path. An alias that is already a regular file or a valid symlink is
# left untouched, so a real library is never replaced by a symlink, and a
# directory where only some aliases are present still gets the missing ones.
# Links are created by absolute path, so the caller's working directory is
# not changed.
#
# Arguments: $1 - torch lib directory (optional; defaults to the conda
#                 python3.8 site-packages location)
# Outputs:   one progress line on stdout when at least one link is created;
#            an error line on stderr when the directory is missing
# Returns:   0 on success, 1 if the directory is missing or ln fails
#######################################
ensure_torch_symlinks() {
  local lib_dir=${1:-/opt/conda/lib/python3.8/site-packages/torch/lib}
  local pair alias_name target
  local announced=0

  if [[ ! -d "$lib_dir" ]]; then
    echo "❌ 未找到torch库目录: $lib_dir" >&2
    return 1
  fi

  # Entries are "alias:target"; the target is relative to lib_dir.
  for pair in \
    "libtorch_cuda_cu.so:libtorch_cuda.so" \
    "libtorch_cuda_cpp.so:libtorch_cuda.so" \
    "libtorch_cpu_cpp.so:libtorch_cpu.so"; do
    alias_name=${pair%%:*}
    target=${pair#*:}

    # -e is false for a missing path and for a dangling symlink; both cases
    # are (re)created below. Anything that resolves is kept as-is.
    if [[ -e "$lib_dir/$alias_name" ]]; then
      continue
    fi

    if [[ "$announced" -eq 0 ]]; then
      echo "创建必要的符号链接..."
      announced=1
    fi
    ln -sf -- "$target" "$lib_dir/$alias_name" || return 1
  done
  return 0
}

# A failure here aborts the script through the `set -e` enabled at the top.
ensure_torch_symlinks
# Work from the BEVFusion workspace; the relative paths used later in the
# script (tools/train.py, configs/..., pretrained/..., runs/...) are resolved
# against this directory.
cd /workspace/bevfusion

# Startup banner: a static summary of the Phase 4A run, printed verbatim.
# The here-document delimiter is quoted so nothing inside is expanded.
cat <<'BANNER'
========================================================================
Phase 4A: BEV 2x分辨率提升训练
========================================================================
配置: BEV 0.15m分辨率 (2倍提升)
Decoder: 4层完整版 [256, 256, 128, 128]
从epoch_23.pth加载所有权重
========================================================================

关键配置:
 - BEV分辨率: 0.3m → 0.15m (720×720)
 - GT标签: 0.25m → 0.125m (800×800)
 - Decoder: 2层 → 4层
 - Deep Supervision: 启用
 - Dice Loss: 启用
 - GPU数量: 4 (显存优化)
 - Batch: 1/GPU (显存限制)
 - Workers: 4 (参考Phase 3成功配置)

预期性能提升:
 - Stop Line IoU: 0.27 → 0.42+ (+55%)
 - Divider IoU: 0.19 → 0.35+ (+84%)
 - 整体mIoU: 0.41 → 0.54+ (+32%)

预计训练时间: 12.5天
========================================================================

BANNER
# Verify the Python environment before starting the run.
printf '%s\n' "环境验证..."

# PyTorch: print the installed version. There is no handler on this command,
# so an import failure stops the script through `set -e`.
python -c "import torch; print('✓ PyTorch:', torch.__version__)"

# mmcv: import nms_match from mmcv.ops (presumably to exercise the compiled
# ops, not just the pure-Python package — confirm) and print the version.
# On failure, print a hint and abort with status 1.
if ! python -c "from mmcv.ops import nms_match; import mmcv; print('✓ mmcv:', mmcv.__version__)"; then
  printf '%s\n' "❌ mmcv加载失败请检查环境"
  exit 1
fi

printf '%s\n' "✓ 环境验证成功"
printf '\n'
# Timestamped log file, created in the current working directory, so that
# repeated runs do not overwrite each other's output.
LOG_FILE="phase4a_bev2x_$(date +%Y%m%d_%H%M%S).log"
echo "开始训练..."
echo "日志文件: $LOG_FILE"
echo ""

# Without pipefail the status of `trainer | tee` is tee's status, so a crashed
# training run would be reported as a success. Enable it so the trainer's
# failure is what the pipeline returns.
set -o pipefail

# Launch training through torchpack dist-run (-np 4), with the library and
# PATH settings applied as a per-command prefix. stdout and stderr are merged
# and mirrored to both the console and $LOG_FILE.
# NOTE(review): --data.workers_per_gpu is 0 here, while the startup banner
# printed earlier in this script says "Workers: 4" — confirm which value is
# intended and align the two.
train_status=0
LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
PATH="/opt/conda/bin${PATH:+:$PATH}" \
/opt/conda/bin/torchpack dist-run -np 4 /opt/conda/bin/python tools/train.py \
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml \
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth \
  --load_from runs/enhanced_from_epoch19/epoch_23.pth \
  --data.samples_per_gpu 1 \
  --data.workers_per_gpu 0 \
  2>&1 | tee "$LOG_FILE" || train_status=$?

echo ""
echo "========================================================================"
if [[ "$train_status" -ne 0 ]]; then
  # Report the failure on stderr and propagate the trainer's exit status so
  # callers (schedulers, wrapper scripts) can detect it.
  echo "训练失败 (退出码: $train_status)!日志保存在: $LOG_FILE" >&2
  echo "========================================================================"
  exit "$train_status"
fi
echo "训练完成!日志保存在: $LOG_FILE"
echo "========================================================================"