#!/bin/bash
# Phase 4B RMT-PPAD epoch-1 evaluation smoke test.
#
# Purpose: check that the repaired test_pipeline configuration works by
# running tools/test.py on the epoch-1 checkpoint through torchpack and then
# scanning the captured log for the evaluation summary.
#
# Usage:   takes no arguments. Expects /opt/conda (python, torchpack) and the
#          repository checkout at /workspace/bevfusion.
# Outputs: eval_test/epoch1_test_<timestamp>/eval_test.log
#          eval_test/epoch1_test_<timestamp>/test_results.pkl
# Exit:    0 when torchpack exits cleanly AND the log contains
#          "Evaluation results"; 1 otherwise.

set -euo pipefail

# Print a message on stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# --- Environment setup (taken from a known-good script) ----------------------
# ${VAR:+:...} appends the previous value only when it is non-empty; a bare
# trailing ':' would add an empty entry (the current directory) to the search
# path, and a plain $VAR would trip `set -u` when the variable is unset.
export PATH="/opt/conda/bin:${PATH}"
export LD_LIBRARY_PATH="/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/conda/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PYTHONPATH="/workspace/bevfusion${PYTHONPATH:+:${PYTHONPATH}}"
# export PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:64

# GPUs made visible to the run.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# One interpreter for every check and for the evaluation itself.
readonly PYTHON_BIN=/opt/conda/bin/python

cd /workspace/bevfusion || die "cannot cd to /workspace/bevfusion"

echo "========================================================================"
echo "Phase 4B RMT-PPAD Epoch 1 评估测试"
echo "========================================================================"
echo "目的: 验证修复后的test_pipeline是否正常工作"
echo "Checkpoint: runs/run-4c8ec7e5-fabdc997/epoch_1.pth (最新)"
echo "配置: multitask_BEV2X_phase4b_rmtppad_segmentation.yaml"
echo "========================================================================"
echo ""

# --- Environment verification ------------------------------------------------
echo ""
echo "=== 环境验证 ==="
"$PYTHON_BIN" -c "import torch; print('✅ PyTorch:', torch.__version__)" \
  || die "❌ PyTorch导入失败"

"$PYTHON_BIN" -c "import mmcv; print('✅ mmcv:', mmcv.__version__)" \
  || die "❌ mmcv导入失败"

torchpack_path=$(command -v torchpack) || die "❌ torchpack未找到"
echo "✅ torchpack: ${torchpack_path}"

# --- Output directory and inputs ---------------------------------------------
EVAL_DIR="eval_test/epoch1_test_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$EVAL_DIR"

CONFIG="runs/run-4c8ec7e5-fabdc997/configs.yaml"
CHECKPOINT="runs/run-4c8ec7e5-fabdc997/epoch_1.pth"
readonly EVAL_DIR CONFIG CHECKPOINT

echo "配置文件: $CONFIG"
echo "Checkpoint: $CHECKPOINT"
echo "输出目录: $EVAL_DIR"
echo ""

# Both inputs must exist before anything expensive is launched.
if [[ ! -f "$CONFIG" ]]; then
  die "❌ 配置文件不存在: $CONFIG"
fi

if [[ ! -f "$CHECKPOINT" ]]; then
  die "❌ Checkpoint不存在: $CHECKPOINT"
fi

echo "✓ 文件检查通过"
echo ""

# --- GPU check (informational only; the process count below is fixed at 1) ---
# stderr is silenced on purpose: a failed probe is reported as "0 GPUs".
GPU_COUNT=$("$PYTHON_BIN" -c "import torch; print(torch.cuda.device_count())" 2>/dev/null) \
  || GPU_COUNT=0
# Anything that is not a plain non-negative integer is treated as "no GPU".
[[ "$GPU_COUNT" =~ ^[0-9]+$ ]] || GPU_COUNT=0
echo "可用GPU数量: $GPU_COUNT"

if (( GPU_COUNT == 0 )); then
  echo "⚠️ 没有GPU,使用CPU模式"
fi

echo ""
echo "开始评估测试..."
# NOTE(review): the message below promises a 10-sample run, but nothing in this
# script limits the number of samples; confirm the config does so.
echo "测试模式: 仅处理前10个样本 (data.samples_per_gpu=1, 总共10个batch)"
echo "预计时间: 5-10分钟"
echo "日志文件: $EVAL_DIR/eval_test.log"
echo ""

# --- Run the evaluation through torchpack (same launcher as training) --------
# The complete output goes to the log file; only the first 50 lines are echoed
# to the terminal. awk keeps reading after line 50, so, unlike `head -50`, it
# never closes the pipe and never SIGPIPEs tee or the evaluation process.
# The status is captured explicitly so that `set -e` does not abort before the
# diagnostics below are printed.
eval_status=0
torchpack dist-run \
  -np 1 \
  "$PYTHON_BIN" tools/test.py \
  "$CONFIG" \
  "$CHECKPOINT" \
  --eval bbox map \
  --out "$EVAL_DIR/test_results.pkl" \
  --cfg-options data.test.samples_per_gpu=1 data.workers_per_gpu=0 \
  2>&1 \
  | tee "$EVAL_DIR/eval_test.log" \
  | awk 'NR <= 50' \
  || eval_status=$?

echo ""
echo "========================================================================"
echo "评估测试完成!"
echo "========================================================================"
echo "结果文件: $EVAL_DIR/test_results.pkl"
echo "日志文件: $EVAL_DIR/eval_test.log"
echo ""

# --- Verdict -----------------------------------------------------------------
# Success requires a clean exit from the pipeline and the summary marker in
# the log. The display pipelines end in `|| true` because, with pipefail, a
# grep that matches nothing would otherwise abort the script.
if (( eval_status == 0 )) && grep -q "Evaluation results" "$EVAL_DIR/eval_test.log"; then
  echo "✅ 评估成功!test_pipeline修复有效"
  echo ""
  echo "========================================================================"
  echo "关键指标 (前10个样本):"
  echo "========================================================================"
  grep -E "(NDS|mAP|mIoU|Car|Pedestrian|Divider|Divider Dice|Heatmap Loss)" "$EVAL_DIR/eval_test.log" \
    | grep -v "UserWarning" \
    | tail -20 \
    || true
else
  echo "❌ 评估失败,检查日志文件"
  echo "torchpack退出码: ${eval_status}"
  echo "关键错误信息:"
  grep -E "(ERROR|Error|Exception|KeyError|ImportError)" "$EVAL_DIR/eval_test.log" \
    | tail -10 \
    || true
  exit 1
fi

echo ""
echo "如果测试通过,可以安全启动完整训练!"
echo "========================================================================"