bev-project/archive/scripts_old/EVAL_EPOCH23_FIXED.sh

96 lines
3.2 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Epoch 23 evaluation (fixed version): runs in the same Docker container as
# the training job, in parallel with it, using GPUs 4-7.
# Key fixes: workers=0, CUDA_VISIBLE_DEVICES=4,5,6,7

# Abort as soon as a simple command fails.
set -e

# Put the conda binaries and the torch / conda / CUDA shared libraries first
# on the search paths, and make /workspace/bevfusion importable from Python.
# ${VAR:+:$VAR} appends the previous value only when it is non-empty. A bare
# "...:$VAR" leaves a trailing ':' when VAR is unset, and an empty entry in
# PATH or LD_LIBRARY_PATH stands for the current working directory.
export PATH="/opt/conda/bin${PATH:+:$PATH}"
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"

# Every relative path used further down is resolved from this directory.
cd /workspace/bevfusion
# Startup banner: names the checkpoint being evaluated, the GPUs used and the
# dataloader worker setting, followed by one blank line.
banner_rule="========================================================================"
printf '%s\n' \
  "$banner_rule" \
  "Phase 3 Epoch 23 评估GPU 4-7并行" \
  "$banner_rule" \
  "Checkpoint: epoch_23.pth" \
  "使用GPU: 4-7 (不影响GPU 0-3的训练)" \
  "Workers: 0 (避免共享内存问题)" \
  "$banner_rule" \
  ""
# Inputs of this run: the model config and the checkpoint to evaluate
# (both relative to the current working directory).
CONFIG="configs/nuscenes/det/transfusion/secfpn/camera+lidar/swint_v0p075/multitask_enhanced_phase1_HIGHRES.yaml"
CHECKPOINT="runs/enhanced_from_epoch19/epoch_23.pth"

# Create the evaluation output directory; its name carries the current
# timestamp, so each invocation gets its own directory.
run_stamp=$(date +%Y%m%d_%H%M%S)
EVAL_DIR="eval_results/epoch23_${run_stamp}"
mkdir -p "$EVAL_DIR"

# Show the effective settings, followed by one blank line.
printf '%s\n' \
  "配置: $CONFIG" \
  "Checkpoint: $CHECKPOINT" \
  "输出目录: $EVAL_DIR" \
  ""
# Verify that training is still running: count the `ps aux` lines that contain
# both "train.py" and "python", dropping every line that contains "grep"
# (i.e. the filter processes themselves).
# `grep -c` prints the count but exits non-zero when it is 0, hence `|| true`.
echo "验证训练状态..."
TRAIN_PROCS=$(ps aux | grep "train.py" | grep python | grep -vc grep || true)
if (( TRAIN_PROCS > 0 )); then
  echo "✓ 训练正常运行中 ($TRAIN_PROCS 进程)"
else
  echo "⚠️ 警告: 未检测到训练进程"
fi
echo ""
echo "开始评估..."
echo "预计时间: 2-3小时"
echo "日志文件: $EVAL_DIR/eval.log"
echo ""

# Launch the distributed evaluation. Key settings:
#   - CUDA_VISIBLE_DEVICES=4,5,6,7 : use GPUs 4-7 only
#   - data.workers_per_gpu=0       : avoids the shared-memory problems
#   - -np 4                        : 4 processes, matching the 4 GPUs
# PATH and LD_LIBRARY_PATH are prefixed again for this one command only.
# stdout and stderr are merged, shown on the terminal and copied to eval.log.
CUDA_VISIBLE_DEVICES=4,5,6,7 \
LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
PATH=/opt/conda/bin:$PATH \
  /opt/conda/bin/torchpack dist-run -np 4 /opt/conda/bin/python tools/test.py \
  "$CONFIG" \
  "$CHECKPOINT" \
  --eval bbox \
  --out "$EVAL_DIR/results.pkl" \
  --cfg-options data.workers_per_gpu=0 data.samples_per_gpu=1 \
  2>&1 | tee "$EVAL_DIR/eval.log"
# Without `pipefail` the pipeline's exit status is tee's, so the script's
# `set -e` never notices a failed evaluation. PIPESTATUS must be read by the
# very next command; stop here instead of reporting a successful run.
eval_status=${PIPESTATUS[0]}
if [ "$eval_status" -ne 0 ]; then
  echo "✗ 评估失败 (退出码: $eval_status), 详见日志: $EVAL_DIR/eval.log" >&2
  exit "$eval_status"
fi
# Completion banner with the locations of the result and log files.
summary_rule="========================================================================"
printf '%s\n' \
  "" \
  "$summary_rule" \
  "评估完成!" \
  "$summary_rule" \
  "结果文件: $EVAL_DIR/results.pkl" \
  "日志文件: $EVAL_DIR/eval.log" \
  ""

# Extract the key performance metrics: the last 50 log lines that mention one
# of the metric names, with lines containing "UserWarning" filtered out.
printf '%s\n' "$summary_rule" "性能指标摘要:" "$summary_rule"
grep -E "(NDS|mAP|mIoU|IoU)" "$EVAL_DIR/eval.log" \
  | grep -v "UserWarning" \
  | tail -n 50
printf '%s\n' \
  "" \
  "完整结果请查看: $EVAL_DIR/eval.log" \
  "$summary_rule"
# Verify that training is still running after the evaluation: count the
# `ps aux` lines containing both "train.py" and "python" (lines containing
# "grep" are dropped) and warn when the count is lower than TRAIN_PROCS, the
# count taken before the evaluation started.
# `grep -c` exits non-zero when the count is 0, hence `|| true`.
TRAIN_PROCS_AFTER=$(ps aux | grep "train.py" | grep python | grep -vc grep || true)
printf '%s\n' "" "训练状态确认: $TRAIN_PROCS_AFTER 进程运行中"
if (( TRAIN_PROCS_AFTER < TRAIN_PROCS )); then
  echo "⚠️ 警告: 训练进程数减少,请检查训练状态!"
else
  echo "✓ 训练未受影响"
fi
echo "========================================================================"