bev-project/EVAL_EPOCH1_TEST.sh

126 lines
3.9 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/bin/bash
# Phase 4B RMT-PPAD epoch-1 evaluation test script: verifies the repaired
# configuration by running an evaluation against the epoch-1 checkpoint.
set -e

# Key step: set the environment variables (modelled on a known-good script).
# ${VAR:+:$VAR} appends the previous value only when it is non-empty, so an
# unset variable does not leave a trailing ':' behind. For LD_LIBRARY_PATH an
# empty entry is interpreted by the dynamic linker as the current directory,
# which would let libraries be picked up from wherever the script runs.
export PATH=/opt/conda/bin:$PATH
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:$PYTHONPATH}"
#export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64

# Select the GPUs exposed to CUDA.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Every relative path used later in the script is resolved from here.
cd /workspace/bevfusion
# Print the run banner: purpose, checkpoint and configuration under test.
# The quoted delimiter keeps the text literal; the blank line before EOF
# reproduces the trailing empty line.
cat <<'EOF'
========================================================================
Phase 4B RMT-PPAD Epoch 1 评估测试
========================================================================
目的: 验证修复后的test_pipeline是否正常工作
Checkpoint: runs/run-4c8ec7e5-fabdc997/epoch_1.pth (最新)
配置: multitask_BEV2X_phase4b_rmtppad_segmentation.yaml
========================================================================

EOF
# Verify the environment: PyTorch and mmcv must import under the conda
# interpreter and the torchpack launcher must be reachable through PATH.
# Any failed check prints its message and aborts with status 1.
echo ""
echo "=== 环境验证 ==="
/opt/conda/bin/python -c "import torch; print('✅ PyTorch:', torch.__version__)" || {
  echo "❌ PyTorch导入失败"
  exit 1
}
/opt/conda/bin/python -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" || {
  echo "❌ mmcv导入失败"
  exit 1
}
# `command -v` is a shell builtin, so this lookup cannot fail merely because
# the external `which` utility is absent. Capturing the path once also
# avoids printing it twice (once bare, once in the status line).
TORCHPACK_BIN=$(command -v torchpack) || {
  echo "❌ torchpack未找到"
  exit 1
}
echo "✅ torchpack: $TORCHPACK_BIN"
# Choose the inputs and create a timestamped output directory for this run.
EVAL_DIR="eval_test/epoch1_test_$(date +%Y%m%d_%H%M%S)"
CONFIG="runs/run-4c8ec7e5-fabdc997/configs.yaml"
CHECKPOINT="runs/run-4c8ec7e5-fabdc997/epoch_1.pth"
mkdir -p "$EVAL_DIR"

printf '%s\n' \
  "配置文件: $CONFIG" \
  "Checkpoint: $CHECKPOINT" \
  "输出目录: $EVAL_DIR" \
  ""

# Both inputs must exist as regular files; otherwise report and abort.
[[ -f "$CONFIG" ]] || {
  printf '%s\n' "❌ 配置文件不存在: $CONFIG"
  exit 1
}
[[ -f "$CHECKPOINT" ]] || {
  printf '%s\n' "❌ Checkpoint不存在: $CHECKPOINT"
  exit 1
}
printf '%s\n' "✓ 文件检查通过" ""
# Check the GPUs: ask PyTorch how many CUDA devices it sees, using the same
# conda interpreter as the other checks in this script. A failed probe
# (interpreter or torch unavailable) falls back to "0".
GPU_COUNT=$(/opt/conda/bin/python -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo "0")
# Keep only the last output line, then require a plain non-negative integer;
# anything else would make the numeric test below error out, so it is
# treated as "no GPU".
GPU_COUNT=${GPU_COUNT##*$'\n'}
case "$GPU_COUNT" in
  '' | *[!0-9]*) GPU_COUNT=0 ;;
esac
echo "可用GPU数量: $GPU_COUNT"
if [ "$GPU_COUNT" -eq 0 ]; then
  echo "⚠️ 没有GPU使用CPU模式"
  # Fall back to a count of 1 so the value is never zero.
  GPU_COUNT=1
fi
echo ""
echo "开始评估测试..."
echo "测试模式: 仅处理前10个样本 (data.samples_per_gpu=1, 总共10个batch)"
echo "预计时间: 5-10分钟"
echo "日志文件: $EVAL_DIR/eval_test.log"
echo ""

# Run the evaluation through torchpack's distributed launcher with a single
# process (-np 1), following the training script.
# NOTE(review): the messages above announce a run limited to the first 10
# samples, but no option on this command line restricts the sample count —
# confirm whether the config enforces it.
#
# The console shows only the first 50 lines while tee writes the complete
# log. `sed -n '1,50p'` keeps reading after line 50; `head -50` would exit
# there, and the resulting SIGPIPE would stop tee (truncating the log that
# is checked afterwards) and could terminate the evaluation itself.
torchpack dist-run \
  -np 1 \
  /opt/conda/bin/python tools/test.py \
  "$CONFIG" \
  "$CHECKPOINT" \
  --eval bbox map \
  --out "$EVAL_DIR/test_results.pkl" \
  --cfg-options data.test.samples_per_gpu=1 data.workers_per_gpu=0 \
  2>&1 | tee "$EVAL_DIR/eval_test.log" | sed -n '1,50p'
# Announce completion and point at the artifacts of this run.
printf '%s\n' \
  "" \
  "========================================================================" \
  "评估测试完成!" \
  "========================================================================" \
  "结果文件: $EVAL_DIR/test_results.pkl" \
  "日志文件: $EVAL_DIR/eval_test.log" \
  ""

# The run counts as successful only if the log contains "Evaluation results".
# On failure, show the last few error-looking log lines and exit with 1.
if ! grep -q "Evaluation results" "$EVAL_DIR/eval_test.log"; then
  printf '%s\n' "❌ 评估失败,检查日志文件" "关键错误信息:"
  grep -E "(ERROR|Error|Exception|KeyError|ImportError)" "$EVAL_DIR/eval_test.log" | tail -10
  exit 1
fi

# Success path: print the most recent metric lines, skipping UserWarning noise.
printf '%s\n' \
  "✅ 评估成功test_pipeline修复有效" \
  "" \
  "========================================================================" \
  "关键指标 (前10个样本):" \
  "========================================================================"
grep -E "(NDS|mAP|mIoU|Car|Pedestrian|Divider|Divider Dice|Heatmap Loss)" "$EVAL_DIR/eval_test.log" | grep -v "UserWarning" | tail -20

printf '%s\n' \
  "" \
  "如果测试通过,可以安全启动完整训练!" \
  "========================================================================"