bev-project/EVAL_EPOCH1_TEST.sh

126 lines
3.9 KiB
Bash
Raw Permalink Normal View History

2025-11-21 10:50:51 +08:00
#!/bin/bash
# Phase 4B RMT-PPAD epoch-1 evaluation test script: verifies the configuration
# after the fix.
set -e

# Key step: set the environment variables (modeled on a script that worked).
# ${VAR:+:$VAR} appends the previous value only when it is non-empty, so an
# unset variable does not leave a trailing ':' behind. A trailing ':' is an
# empty search-path entry, which the dynamic loader treats as the current
# working directory.
export PATH="/opt/conda/bin:$PATH"
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64

# Select the GPUs visible to the run.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# All relative paths used later in the script are resolved from this directory.
# With `set -e` active, a failed cd aborts the script here.
cd /workspace/bevfusion
# Print the run banner: purpose of the test plus the checkpoint and config
# names. The quoted 'BANNER' delimiter keeps the text literal (no expansion);
# the empty line before the delimiter produces the trailing blank line.
cat <<'BANNER'
========================================================================
Phase 4B RMT-PPAD Epoch 1 评估测试
========================================================================
目的: 验证修复后的test_pipeline是否正常工作
Checkpoint: runs/run-4c8ec7e5-fabdc997/epoch_1.pth (最新)
配置: multitask_BEV2X_phase4b_rmtppad_segmentation.yaml
========================================================================

BANNER
# Verify the environment: PyTorch and mmcv must be importable by the conda
# interpreter, and torchpack must be resolvable on PATH. Any failure aborts
# the script with exit status 1.
echo ""
echo "=== 环境验证 ==="
if ! /opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)"; then
  echo "❌ PyTorch导入失败"
  exit 1
fi
if ! /opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)"; then
  echo "❌ mmcv导入失败"
  exit 1
fi
# `command -v` is the POSIX-specified lookup (unlike `which`). Capturing its
# output once also avoids printing the path twice.
if ! torchpack_path=$(command -v torchpack); then
  echo "❌ torchpack未找到"
  exit 1
fi
echo "✅ torchpack: $torchpack_path"
# Create a timestamped output directory for this evaluation run.
EVAL_DIR="eval_test/epoch1_test_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$EVAL_DIR"

# Config and checkpoint both live in the same training run directory.
run_dir="runs/run-4c8ec7e5-fabdc997"
CONFIG="${run_dir}/configs.yaml"
CHECKPOINT="${run_dir}/epoch_1.pth"
printf '%s\n' \
  "配置文件: $CONFIG" \
  "Checkpoint: $CHECKPOINT" \
  "输出目录: $EVAL_DIR" \
  ""

# Both inputs must exist as regular files; otherwise stop with status 1.
[[ -f "$CONFIG" ]] || {
  printf '%s\n' "❌ 配置文件不存在: $CONFIG"
  exit 1
}
[[ -f "$CHECKPOINT" ]] || {
  printf '%s\n' "❌ Checkpoint不存在: $CHECKPOINT"
  exit 1
}
printf '%s\n' "✓ 文件检查通过" ""
# Detect how many GPUs PyTorch can see. Uses the same conda interpreter as the
# other checks in this script; if the probe fails, the count falls back to 0.
GPU_COUNT=$(/opt/conda/bin/python -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo "0")
# Anything other than a plain non-negative integer (e.g. extra text printed on
# stdout by the probe) is treated as "no GPU" instead of breaking the numeric
# comparison below.
[[ "$GPU_COUNT" =~ ^[0-9]+$ ]] || GPU_COUNT=0
echo "可用GPU数量: $GPU_COUNT"
if [ "$GPU_COUNT" -eq 0 ]; then
  echo "⚠️ 没有GPU使用CPU模式"
  # Fall back to a count of 1 when no GPU is reported.
  GPU_COUNT=1
fi
echo ""
echo "开始评估测试..."
echo "测试模式: 仅处理前10个样本 (data.samples_per_gpu=1, 总共10个batch)"
echo "预计时间: 5-10分钟"
echo "日志文件: $EVAL_DIR/eval_test.log"
echo ""
# Run the evaluation through torchpack's distributed launcher (modeled on the
# training script) with a single process.
# NOTE(review): the messages above say only the first 10 samples are tested,
# but nothing visible in this command limits the sample count — confirm.
#
# Only the first 50 lines are shown on the terminal, while the complete output
# goes to the log file. The trailing `cat >/dev/null` keeps draining the pipe
# after `head` exits; without it, `tee` and then the evaluation itself would be
# killed by SIGPIPE once 50 lines had been printed, truncating the log.
torchpack dist-run \
  -np 1 \
  /opt/conda/bin/python tools/test.py \
  "$CONFIG" \
  "$CHECKPOINT" \
  --eval bbox map \
  --out "$EVAL_DIR/test_results.pkl" \
  --cfg-options data.test.samples_per_gpu=1 data.workers_per_gpu=0 \
  2>&1 | tee "$EVAL_DIR/eval_test.log" | { head -n 50; cat >/dev/null; }
# PIPESTATUS must be read immediately after the pipeline. Element 0 is the
# torchpack exit status, which the pipeline's own status would otherwise hide.
eval_status=${PIPESTATUS[0]}
if [ "$eval_status" -ne 0 ]; then
  echo "⚠️ torchpack exit status: $eval_status" >&2
fi
# Summarise the run: show where the outputs are, then decide success or
# failure by looking for the "Evaluation results" marker in the log.
rule="========================================================================"
log_file="$EVAL_DIR/eval_test.log"

printf '%s\n' \
  "" \
  "$rule" \
  "评估测试完成!" \
  "$rule" \
  "结果文件: $EVAL_DIR/test_results.pkl" \
  "日志文件: $log_file" \
  ""

# Failure path first: print the last matching error lines and stop with
# status 1.
if ! grep -q "Evaluation results" "$log_file"; then
  printf '%s\n' "❌ 评估失败,检查日志文件" "关键错误信息:"
  grep -E "(ERROR|Error|Exception|KeyError|ImportError)" "$log_file" | tail -10
  exit 1
fi

# Success path: print the last 20 metric lines, skipping UserWarning noise.
printf '%s\n' \
  "✅ 评估成功test_pipeline修复有效" \
  "" \
  "$rule" \
  "关键指标 (前10个样本):" \
  "$rule"
grep -E "(NDS|mAP|mIoU|Car|Pedestrian|Divider|Divider Dice|Heatmap Loss)" "$log_file" | grep -v "UserWarning" | tail -20

printf '%s\n' \
  "" \
  "如果测试通过,可以安全启动完整训练!" \
  "$rule"