bev-project/archive_scripts/start_enhanced_from_epoch19.sh

73 lines
2.3 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Train the Enhanced variant, starting from the epoch_19 checkpoint.
# Uses 4 GPUs to avoid a distributed-synchronization deadlock.
#
# Strict mode:
#   -e        abort on the first command that fails
#   -u        treat expansion of an unset variable as an error
#   pipefail  a pipeline fails when any of its stages fails. The training
#             command below is piped into `tee`; without this option the
#             pipeline's status is tee's, so a crashed training run would be
#             ignored and the script would still report completion and exit 0.
set -euo pipefail
# Put the conda binaries first on PATH.
export PATH=/opt/conda/bin:$PATH
# All relative paths used later (configs/, pretrained/, runs/, the log file)
# are resolved against this directory.
cd /workspace/bevfusion
# Startup banner: task, configuration, mitigations and expected results.
# Each argument after the format string is printed on its own line.
banner_rule='=========================================='
printf '%s\n' \
  "$banner_rule" \
  'BEVFusion Enhanced训练 - 从Epoch 19继续' \
  '任务: 3D检测 + BEV分割Enhanced Head' \
  "$banner_rule" \
  '' \
  '配置信息:' \
  ' GPU数量: 4x Tesla V100S (减少以避免死锁)' \
  ' 基础模型: epoch_19.pth' \
  ' 分割头: EnhancedBEVSegmentationHead' \
  ' 特性: ASPP + 注意力 + Deep Supervision' \
  '' \
  '优化措施:' \
  ' - 使用4个GPU而非8个降低同步复杂度' \
  ' - workers_per_gpu=0避免共享内存问题' \
  ' - 较小学习率(微调)' \
  '' \
  '预计训练时间: 6-8小时完成剩余1个epoch' \
  '预期性能提升:' \
  ' - 分割mIoU: 36% → 48-52%' \
  ' - 检测mAP: 保持65-68%' \
  '' \
  "$banner_rule" \
  ''
# Build a timestamped log file name (YYYYMMDD_HHMMSS) so runs never overwrite
# each other's logs.
LOG_FILE="training_enhanced_from_epoch19_$(date +%Y%m%d_%H%M%S).log"
printf '%s\n' \
  "训练日志将保存到: $LOG_FILE" \
  '' \
  '开始训练...' \
  ''
# Launch training through torchpack with -np 4 (presumably one process per
# GPU). The config path and its overrides are collected in an array so each
# one sits on its own line without backslash continuations; every element is
# passed as exactly one argument.
train_args=(
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask.yaml
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
  --load_from runs/run-326653dc-74184412/epoch_19.pth
  --model.heads.map.type EnhancedBEVSegmentationHead
  --model.heads.map.in_channels 512
  --model.heads.map.hidden_channels 256
  --model.heads.map.num_classes 6
  --model.heads.map.use_aspp true
  --model.heads.map.use_channel_attention true
  --model.heads.map.use_spatial_attention true
  --model.heads.map.deep_supervision true
  --model.heads.map.focal_alpha 0.25
  --model.heads.map.focal_gamma 2.0
  --model.heads.map.use_dice_loss true
  --model.heads.map.dice_weight 0.5
  # Quoted so the bracketed list reaches the trainer as a single argument.
  --model.heads.map.class_weights '[1.0, 3.0, 1.5, 4.0, 2.0, 3.0]'
  --model.loss_scale.map 5.0
  --optimizer.lr 1.0e-4
  --data.workers_per_gpu 0
  --data.samples_per_gpu 2
)
# Merge stderr into stdout and mirror the combined stream to the log file
# while still showing it on the terminal.
torchpack dist-run -np 4 python tools/train.py "${train_args[@]}" 2>&1 | tee "$LOG_FILE"
# Closing banner: a blank line, then the completion message and the log file
# path framed by separator rules.
printf '%s\n' \
  '' \
  '==========================================' \
  '训练完成!' \
  "日志文件: $LOG_FILE" \
  '=========================================='