bev-project/archive_scripts/一键剪枝和微调.sh

151 lines
4.3 KiB
Bash
Raw Normal View History

#!/bin/bash
# One-click driver for the BEVFusion prune + fine-tune workflow.
#
# This opening section prints the banner, defines the settings shared by the
# later stages, verifies that the baseline checkpoint exists and creates the
# output directories.

# Stop at the first command that fails.
set -e

# Every relative path below is resolved from the project root.
cd /workspace/bevfusion

# Banner with the baseline figures and the pruning goal.
printf '%s\n' \
  "========================================================================" \
  "BEVFusion 模型剪枝+微调 完整流程" \
  "========================================================================" \
  "" \
  "Baseline: Epoch 23" \
  " - 参数量: 45.72M" \
  " - NDS: 0.6941, mAP: 0.6446, mIoU: 0.4130" \
  "" \
  "目标: 剪枝30% → 32M参数" \
  "预期: 精度损失<2%" \
  ""

# Configuration (these names are read by the later stages of the script).
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
CHECKPOINT="runs/enhanced_from_epoch19/epoch_23.pth"
PRUNED_OUTPUT="pruning_results/bevfusion_pruned_32M.pth"
TARGET_RATIO=0.70 # keep 70% of the parameters, i.e. prune 30%
FINETUNE_DIR="runs/pruned_finetune_${TIMESTAMP}"

# Refuse to continue when the baseline checkpoint is missing.
[[ -f "$CHECKPOINT" ]] || {
  printf '%s\n' "错误: Checkpoint不存在: $CHECKPOINT"
  exit 1
}

# Create the output directories (pruning artifacts and the fine-tune run dir).
for out_dir in pruning_results "$FINETUNE_DIR"; do
  mkdir -p "$out_dir"
done
# Stage 1: run the pruning tool and mirror its combined stdout/stderr into a
# timestamped log file under pruning_results/.
echo "========================================================================"
echo "阶段1: 模型剪枝预计15分钟"
echo "========================================================================"
echo ""

# By default a pipeline's exit status is that of its LAST command (tee here),
# so a crash of the pruning tool would go unnoticed and the script would carry
# on with a missing or broken pruned checkpoint. With pipefail the pipeline
# fails as soon as any stage fails. Testing it directly in `if` also keeps
# `set -e` from aborting before the error message can be printed.
set -o pipefail
if ! /opt/conda/bin/python tools/pruning/prune_bevfusion_builtin.py \
    --checkpoint "$CHECKPOINT" \
    --output "$PRUNED_OUTPUT" \
    --target-ratio "$TARGET_RATIO" \
    2>&1 | tee "pruning_results/pruning_log_${TIMESTAMP}.txt"; then
  echo "错误: 剪枝失败"
  exit 1
fi

echo ""
echo "✅ 剪枝完成!"
echo " 输出: $PRUNED_OUTPUT"
echo ""
# Stage 2: ask the operator whether to start fine-tuning now, later, or after
# inspecting the pruned checkpoint.
echo "========================================================================"
echo "阶段2: 微调训练预计12-15小时"
echo "========================================================================"
echo ""
echo "是否立即开始微调训练?"
echo ""
echo "微调配置:"
echo " - Epochs: 3"
echo " - 学习率: 5e-6 (很小)"
echo " - GPU: 8张"
echo " - 预计时间: 12-15小时"
echo ""
echo "选项:"
echo " [1] 立即开始微调(后台运行)"
echo " [2] 稍后手动启动"
echo " [3] 查看剪枝结果后再决定"
echo ""

# -r keeps backslashes in the answer literal. Because the script runs under
# `set -e`, a failing read (for example EOF on stdin) would terminate it
# without any message; fall back to an empty answer so the invalid-choice
# branch below reports the problem instead.
read -r -p "请选择 [1/2/3]: " choice || choice=""

case "$choice" in
  1)
    echo ""
    echo "启动微调训练..."
    CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_enhanced_phase1_HIGHRES.yaml"

    # Launch fine-tuning in the background, detached from terminal hang-ups.
    # Output is redirected straight into the log file rather than piped
    # through tee: with `cmd | tee &`, $! is the PID of tee (the last pipeline
    # stage), so the PID file would not identify the training job, and tee
    # itself is not covered by nohup. With a plain redirection, $! is the PID
    # of the nohup'd launcher process.
    nohup /opt/conda/bin/torchpack dist-run -np 8 /opt/conda/bin/python tools/train.py \
      "$CONFIG" \
      --load_from "$PRUNED_OUTPUT" \
      --run-dir "$FINETUNE_DIR" \
      --cfg-options \
      max_epochs=3 \
      optimizer.lr=5.0e-6 \
      data.samples_per_gpu=2 \
      data.workers_per_gpu=0 \
      > "${FINETUNE_DIR}/finetune.log" 2>&1 &
    FINETUNE_PID=$!

    # Persist the launcher PID so the job can be located or stopped later.
    echo "$FINETUNE_PID" > pruning_results/finetune.pid

    echo ""
    echo "✅ 微调训练已启动(后台运行)"
    echo " PID: $FINETUNE_PID"
    echo " 日志: ${FINETUNE_DIR}/finetune.log"
    echo ""
    echo "监控命令:"
    echo " tail -f ${FINETUNE_DIR}/finetune.log | grep 'Epoch'"
    echo ""
    ;;
  2)
    # Only print the command the operator can run by hand later.
    echo ""
    echo "稍后手动启动微调。"
    echo ""
    echo "启动命令:"
    echo " torchpack dist-run -np 8 python tools/train.py \\"
    echo " configs/.../multitask_enhanced_phase1_HIGHRES.yaml \\"
    echo " --load_from $PRUNED_OUTPUT \\"
    echo " --cfg-options max_epochs=3 optimizer.lr=5.0e-6"
    echo ""
    ;;
  3)
    # Only print the command for inspecting the pruned checkpoint.
    echo ""
    echo "查看剪枝结果:"
    echo " python tools/analysis/analyze_checkpoint.py $PRUNED_OUTPUT"
    echo ""
    echo "如果满意,再启动微调。"
    echo ""
    ;;
  *)
    # Anything else (including an empty answer) is rejected.
    echo "无效选择,退出。"
    exit 1
    ;;
esac
# Closing summary: list the files produced by the pruning stage and the
# suggested follow-up steps. The here-document is unquoted on purpose so that
# $PRUNED_OUTPUT and ${TIMESTAMP} are expanded.
cat <<EOF
========================================================================
剪枝流程完成
========================================================================

已生成文件:
 - $PRUNED_OUTPUT
 - pruning_results/pruning_log_${TIMESTAMP}.txt

下一步:
 1. 等待微调完成(如已启动)
 2. 评估剪枝+微调后的模型
 3. 进行INT8量化

EOF