82 lines
3.0 KiB
Bash
Executable File
82 lines
3.0 KiB
Bash
Executable File
#!/bin/bash
# Phase 4A: BEV 2x resolution training - environment-fix variant.
#
# Prepares the conda / CUDA / PyTorch search paths, then (further down)
# validates the Python environment and launches distributed training.

set -e

#######################################
# Prepend a colon-separated path list to an environment variable and export it.
# The separator is only added when the variable already has a value, so an
# initially unset/empty variable does not end up with a trailing empty entry
# (the dynamic loader treats an empty LD_LIBRARY_PATH entry as the current
# working directory).
# Arguments:
#   $1 - name of the variable to extend (e.g. LD_LIBRARY_PATH)
#   $2 - path list to put in front of the current value
# Outputs:
#   none; exports the updated variable
#######################################
prepend_path_var() {
  local var_name=$1
  local prefix=$2
  local current=${!var_name-}
  export "${var_name}=${prefix}${current:+:${current}}"
}

# Set up the complete environment
prepend_path_var PATH /opt/conda/bin
prepend_path_var LD_LIBRARY_PATH /opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64
prepend_path_var PYTHONPATH /workspace/bevfusion

#######################################
# Make sure the libtorch alias symlinks exist.
# Each alias is checked on its own, so a partially-populated directory gets
# the missing links. The check uses -e (which follows symlinks): an alias that
# already resolves to a file - whether a valid symlink or a real library - is
# left untouched, while a missing alias or a dangling symlink is (re)created.
# Arguments:
#   $1 - directory that holds the torch shared libraries
# Outputs:
#   prints a one-time notice to stdout when at least one link is created
# Returns:
#   status of the last ln invocation (non-zero aborts the script under set -e)
#######################################
ensure_torch_symlinks() {
  local lib_dir=$1
  local announced=0
  local spec link_name target

  for spec in \
    "libtorch_cuda_cu.so:libtorch_cuda.so" \
    "libtorch_cuda_cpp.so:libtorch_cuda.so" \
    "libtorch_cpu_cpp.so:libtorch_cpu.so"; do
    link_name=${spec%%:*}
    target=${spec#*:}

    if [ -e "${lib_dir}/${link_name}" ]; then
      continue
    fi

    if [ "$announced" -eq 0 ]; then
      echo "创建必要的符号链接..."
      announced=1
    fi

    # The target is relative, so it resolves inside lib_dir without a cd.
    ln -sf "$target" "${lib_dir}/${link_name}"
  done
}

ensure_torch_symlinks "/opt/conda/lib/python3.8/site-packages/torch/lib"

# Everything below uses paths relative to the repository root.
cd /workspace/bevfusion

# Print the run summary in one printf call: each argument becomes one output
# line, and an empty argument yields a blank line.
# NOTE(review): the summary advertises "Workers: 4", but the launch command
# later in this script passes --data.workers_per_gpu 0 - confirm which value
# is intended.
banner_rule="========================================================================"
printf '%s\n' \
  "$banner_rule" \
  "Phase 4A: BEV 2x分辨率提升训练" \
  "$banner_rule" \
  "配置: BEV 0.15m分辨率 (2倍提升)" \
  "Decoder: 4层完整版 [256, 256, 128, 128]" \
  "从epoch_23.pth加载所有权重" \
  "$banner_rule" \
  "" \
  "关键配置:" \
  " - BEV分辨率: 0.3m → 0.15m (720×720)" \
  " - GT标签: 0.25m → 0.125m (800×800)" \
  " - Decoder: 2层 → 4层" \
  " - Deep Supervision: 启用" \
  " - Dice Loss: 启用" \
  " - GPU数量: 4 (显存优化)" \
  " - Batch: 1/GPU (显存限制)" \
  " - Workers: 4 (参考Phase 3成功配置)" \
  "" \
  "预期性能提升:" \
  " - Stop Line IoU: 0.27 → 0.42+ (+55%)" \
  " - Divider IoU: 0.19 → 0.35+ (+84%)" \
  " - 整体mIoU: 0.41 → 0.54+ (+32%)" \
  "" \
  "预计训练时间: 12.5天" \
  "$banner_rule" \
  ""

# Verify the environment before committing to a long training run.
# Both imports get an explicit failure branch so the operator always sees which
# package failed; diagnostics go to stderr.
echo "环境验证..."
python -c "import torch; print('✓ PyTorch:', torch.__version__)" || {
  echo "❌ PyTorch加载失败,请检查环境" >&2
  exit 1
}
# Imports a symbol from mmcv.ops rather than mmcv alone - presumably so that a
# broken compiled-ops build is caught here (TODO confirm).
python -c "from mmcv.ops import nms_match; import mmcv; print('✓ mmcv:', mmcv.__version__)" || {
  echo "❌ mmcv加载失败,请检查环境" >&2
  exit 1
}
echo "✓ 环境验证成功"
echo ""

# Create the log file name; the launch timestamp keeps successive runs from
# overwriting each other's logs.
LOG_FILE="phase4a_bev2x_$(date +%Y%m%d_%H%M%S).log"

echo "开始训练..."
echo "日志文件: $LOG_FILE"
echo ""

# Launch training with the environment variables wrapped around the command.
# The output is piped through tee, so without pipefail the pipeline would
# report tee's (successful) status and a crashed run would still be announced
# as finished. pipefail makes the pipeline carry the trainer's status, which is
# captured here instead of letting set -e abort without a message.
# NOTE(review): --data.workers_per_gpu is 0 here although the summary printed
# earlier in this script says "Workers: 4" - confirm which value is intended.
set -o pipefail
train_status=0
LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
PATH=/opt/conda/bin:$PATH \
/opt/conda/bin/torchpack dist-run -np 4 /opt/conda/bin/python tools/train.py \
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a.yaml \
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth \
  --load_from runs/enhanced_from_epoch19/epoch_23.pth \
  --data.samples_per_gpu 1 \
  --data.workers_per_gpu 0 \
  2>&1 | tee "$LOG_FILE" || train_status=$?

if [ "$train_status" -ne 0 ]; then
  echo "" >&2
  echo "❌ 训练失败 (退出码: $train_status),日志保存在: $LOG_FILE" >&2
  exit "$train_status"
fi

echo ""
echo "========================================================================"
echo "训练完成!日志保存在: $LOG_FILE"
echo "========================================================================"