#!/bin/bash
#
# Phase 4A Stage 1: restart training from the weights stored in epoch_1.pth.
#
# The checkpoint is passed to the trainer with --load_from; this is a fresh
# run that starts from those weights, not a resume.

# -e: abort on the first failing command.
# -u: treat references to unset variables as errors.
# -o pipefail: a pipeline fails if any stage fails, so a crash in a command
#   piped into `tee` is not hidden by tee's own exit status.
set -euo pipefail

# Put the conda toolchain, its libraries and the project checkout first on the
# search paths. The ${VAR:+:${VAR}} form appends the previous value only when
# one exists: it is safe under `set -u` and avoids a trailing ':' (an empty
# LD_LIBRARY_PATH entry is treated by the dynamic loader as the current
# directory).
export PATH="/opt/conda/bin:${PATH}"
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:${PYTHONPATH}}"

# Work from the project root: tools/train.py, the config path and the log
# file used later in this script are all given relative to this directory.
cd /workspace/bevfusion

# Summary of the run that is about to start.
echo "========================================================================"
echo "Phase 4A Stage 1: 从epoch_1.pth重新开始训练 (8 GPUs)"
echo "========================================================================"
echo "加载权重: epoch_1.pth (已训练过600×600的模型)"
echo "训练Epochs: 1-10"
echo "输出目录: /data/runs/phase4a_stage1"
echo "GPU配置: 8×Tesla V100S-32GB"
echo "========================================================================"

# Environment check: stop before launching anything if PyTorch or mmcv
# cannot be imported. Both checks abort explicitly on failure so they behave
# the same regardless of the shell's errexit setting.
python -c "import torch; print('✓ PyTorch:', torch.__version__)" || exit 1
# Importing nms_match from mmcv.ops goes beyond a plain `import mmcv`;
# presumably this verifies that the compiled ops are loadable — TODO confirm.
python -c "from mmcv.ops import nms_match; import mmcv; print('✓ mmcv:', mmcv.__version__)" || exit 1
echo "✓ 环境验证成功"

# Make sure the checkpoint that the training command loads is present before
# starting the job. The path is kept in one variable so the test and the
# error message cannot drift apart.
checkpoint="/data/runs/phase4a_stage1/epoch_1.pth"
if [[ ! -f "${checkpoint}" ]]; then
  # Diagnostics go to stderr.
  echo "❌ 找不到 ${checkpoint}" >&2
  exit 1
fi
echo "✓ epoch_1.pth已就绪"

# Log file name carrying a start timestamp (YYYYmmdd_HHMMSS), so runs started
# at different seconds write to distinct files. The name is relative, so the
# log is created in the current working directory.
LOG_FILE="phase4a_stage1_new_$(date +%Y%m%d_%H%M%S).log"
readonly LOG_FILE

echo ""
echo "开始训练..."
echo "日志文件: $LOG_FILE"
echo ""

# Launch training on 8 processes, initialising from epoch_1.pth via
# --load_from (a fresh run starting from those weights, not a resume).
#
# PATH, LD_LIBRARY_PATH and PYTHONPATH are exported near the top of this
# script and are inherited by the launcher, so they are not repeated as
# per-command assignments here.
#
# The command is held in an array so each argument stays a separate word
# without relying on backslash line continuations.
train_cmd=(
  /opt/conda/bin/torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py
  configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_BEV2X_phase4a_stage1.yaml
  --model.encoders.camera.backbone.init_cfg.checkpoint /data/pretrained/swint-nuimages-pretrained.pth
  --load_from /data/runs/phase4a_stage1/epoch_1.pth
  --data.samples_per_gpu 1
  --data.workers_per_gpu 0
)

# Without pipefail the pipeline's status is tee's, which is 0 even when the
# training process crashed, and the script would go on to report success.
set -o pipefail
train_status=0
# stdout and stderr are both shown on the terminal and copied to the log.
"${train_cmd[@]}" 2>&1 | tee "$LOG_FILE" || train_status=$?
if (( train_status != 0 )); then
  echo "❌ 训练失败 (退出码 ${train_status}),日志: $LOG_FILE" >&2
  exit "${train_status}"
fi

# Closing summary: where the log and the checkpoints of this run can be found.
echo ""
echo "========================================================================"
echo "训练完成!日志: $LOG_FILE"
echo "Checkpoints: /data/runs/phase4a_stage1/"
echo "========================================================================"