73 lines
2.3 KiB
Bash
Executable File
73 lines
2.3 KiB
Bash
Executable File
#!/bin/bash
#
# Launch BEVFusion multitask training with the EnhancedBEVSegmentationHead,
# initialising from the epoch_19 checkpoint of a previous run.
#
# Runs on 4 GPUs (per the original author's note: to avoid a distributed
# synchronisation deadlock). All console output of the trainer is mirrored
# into a timestamped log file in the working directory.
#
# Exit status: non-zero if any setup step or the training pipeline fails.

# -e          : abort on the first failing command.
# -u          : treat use of an unset variable as an error.
# -o pipefail : make "trainer | tee" fail when the trainer fails; without it
#               the pipeline reports tee's status and a crashed run would
#               still fall through to the "training finished" banner.
set -euo pipefail

export PATH=/opt/conda/bin:$PATH
cd /workspace/bevfusion

# ---------------------------------------------------------------------------
# Banner: summary of the run configuration (informational only).
# ---------------------------------------------------------------------------
echo "=========================================="
echo "BEVFusion Enhanced训练 - 从Epoch 19继续"
echo "任务: 3D检测 + BEV分割(Enhanced Head)"
echo "=========================================="
echo ""
echo "配置信息:"
echo " GPU数量: 4x Tesla V100S (减少以避免死锁)"
echo " 基础模型: epoch_19.pth"
echo " 分割头: EnhancedBEVSegmentationHead"
echo " 特性: ASPP + 注意力 + Deep Supervision"
echo ""
echo "优化措施:"
echo " - 使用4个GPU而非8个(降低同步复杂度)"
echo " - workers_per_gpu=0(避免共享内存问题)"
echo " - 较小学习率(微调)"
echo ""
echo "预计训练时间: 6-8小时完成剩余1个epoch"
echo "预期性能提升:"
echo " - 分割mIoU: 36% → 48-52%"
echo " - 检测mAP: 保持65-68%"
echo ""
echo "=========================================="
echo ""

# Build a timestamped log file name so repeated runs never overwrite each other.
LOG_FILE="training_enhanced_from_epoch19_$(date +%Y%m%d_%H%M%S).log"

echo "训练日志将保存到: $LOG_FILE"
echo ""
echo "开始训练..."
echo ""

# Arguments handed to tools/train.py, kept in an array so every value stays a
# single word (notably the bracketed class-weights list) and so the groups can
# be commented. Order is identical to the original command line.
train_args=(
  # Base config file.
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask.yaml

  # Checkpoints: camera backbone init weights and the epoch_19 checkpoint
  # this run starts from.
  --model.encoders.camera.backbone.init_cfg.checkpoint pretrained/swint-nuimages-pretrained.pth
  --load_from runs/run-326653dc-74184412/epoch_19.pth

  # Segmentation ("map") head: type and layer sizes.
  --model.heads.map.type EnhancedBEVSegmentationHead
  --model.heads.map.in_channels 512
  --model.heads.map.hidden_channels 256
  --model.heads.map.num_classes 6

  # Segmentation head feature switches.
  --model.heads.map.use_aspp true
  --model.heads.map.use_channel_attention true
  --model.heads.map.use_spatial_attention true
  --model.heads.map.deep_supervision true

  # Segmentation loss settings; class_weights has one entry per class (6).
  --model.heads.map.focal_alpha 0.25
  --model.heads.map.focal_gamma 2.0
  --model.heads.map.use_dice_loss true
  --model.heads.map.dice_weight 0.5
  --model.heads.map.class_weights '[1.0, 3.0, 1.5, 4.0, 2.0, 3.0]'
  --model.loss_scale.map 5.0

  # Optimiser and data loading (banner: small LR for fine-tuning,
  # workers_per_gpu=0 to avoid shared-memory problems).
  --optimizer.lr 1.0e-4
  --data.workers_per_gpu 0
  --data.samples_per_gpu 2
)

# Launch training on 4 processes; stderr is merged into stdout so the log file
# captures both. With pipefail + errexit a trainer failure stops the script
# here with a non-zero status.
torchpack dist-run -np 4 python tools/train.py "${train_args[@]}" \
  2>&1 | tee "$LOG_FILE"

# Only reached when the training pipeline succeeded.
echo ""
echo "=========================================="
echo "训练完成!"
echo "日志文件: $LOG_FILE"
echo "=========================================="